Diffstat (limited to 'src/cpu')
434 files changed, 76815 insertions, 0 deletions
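The patch below introduces the CPU backend entry points (CpuContext, CpuQueue, CpuTensor) alongside the CPU kernels. As an orientation aid, here is a minimal usage sketch of how these pieces fit together, based only on the interfaces visible in this diff; it is not part of the patch. The helper name run_on_cpu_backend is hypothetical, the AclTensorDescriptor is assumed to be prepared by the caller, and object teardown (handled elsewhere through the library's C API) is omitted.

    #include "src/cpu/CpuContext.h"
    #include "src/cpu/CpuQueue.h"
    #include "src/cpu/CpuTensor.h"

    using namespace arm_compute;

    // Hypothetical helper: exercises the CPU backend objects added in this patch set.
    void run_on_cpu_backend(const AclTensorDescriptor &desc)
    {
        // A null options pointer keeps the defaults: the built-in allocator,
        // auto-detected ISA capabilities and a thread count taken from
        // std::thread::hardware_concurrency().
        cpu::CpuContext ctx(nullptr);

        // create_tensor(desc, true) also backs the tensor with memory via CpuTensor::allocate().
        ITensorV2 *tensor = ctx.create_tensor(desc, true);

        // The CPU queue forwards work to the legacy arm_compute::Scheduler.
        IQueue *queue = ctx.create_queue(nullptr);

        // map() exposes the backing buffer; unmap() is a no-op on the CPU backend.
        void *buffer = tensor->map();
        // ... fill or read buffer here ...
        tensor->unmap();

        queue->finish(); // Returns StatusCode::Success immediately on CPU.
        (void)buffer;
    }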
diff --git a/src/cpu/CpuContext.cpp b/src/cpu/CpuContext.cpp new file mode 100644 index 0000000000..b745af8229 --- /dev/null +++ b/src/cpu/CpuContext.cpp @@ -0,0 +1,184 @@ +/* + * Copyright (c) 2021-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/CpuContext.h" + +#include "arm_compute/core/CPP/CPPTypes.h" + +#include "src/cpu/CpuQueue.h" +#include "src/cpu/CpuTensor.h" + +#include <cstdlib> +#if !defined(__APPLE__) && !defined(__OpenBSD__) +#include <malloc.h> + +#if defined(_WIN64) +#define posix_memalign _aligned_realloc +#define posix_memalign_free _aligned_free +#endif // defined(_WIN64) +#endif // !defined(__APPLE__) && !defined(__OpenBSD__) + +#ifndef BARE_METAL +#include <thread> +#endif /* BARE_METAL */ + +namespace arm_compute +{ +namespace cpu +{ +namespace +{ +void *default_allocate(void *user_data, size_t size) +{ + ARM_COMPUTE_UNUSED(user_data); + return ::operator new(size); +} +void default_free(void *user_data, void *ptr) +{ + ARM_COMPUTE_UNUSED(user_data); + ::operator delete(ptr); +} +void *default_aligned_allocate(void *user_data, size_t size, size_t alignment) +{ + ARM_COMPUTE_UNUSED(user_data); + void *ptr = nullptr; +#if defined(BARE_METAL) + size_t rem = size % alignment; + size_t real_size = (rem) ? (size + alignment - rem) : size; + ptr = memalign(alignment, real_size); +#else /* defined(BARE_METAL) */ + if (posix_memalign(&ptr, alignment, size) != 0) + { + // posix_memalign returns non-zero on failures, the return values will be + // - EINVAL: wrong alignment + // - ENOMEM: insufficient memory + ARM_COMPUTE_LOG_ERROR_ACL("posix_memalign failed, the returned pointer will be invalid"); + } +#endif /* defined(BARE_METAL) */ + return ptr; +} +void default_aligned_free(void *user_data, void *ptr) +{ + ARM_COMPUTE_UNUSED(user_data); + free(ptr); +} +static AclAllocator default_allocator = {&default_allocate, &default_free, &default_aligned_allocate, + &default_aligned_free, nullptr}; + +AllocatorWrapper populate_allocator(AclAllocator *external_allocator) +{ + bool is_valid = (external_allocator != nullptr); + if (is_valid) + { + is_valid = is_valid && (external_allocator->alloc != nullptr); + is_valid = is_valid && (external_allocator->free != nullptr); + is_valid = is_valid && (external_allocator->aligned_alloc != nullptr); + is_valid = is_valid && (external_allocator->aligned_free != nullptr); + } + return is_valid ? 
AllocatorWrapper(*external_allocator) : AllocatorWrapper(default_allocator); +} + +cpuinfo::CpuIsaInfo populate_capabilities_flags(AclTargetCapabilities external_caps) +{ + cpuinfo::CpuIsaInfo isa_caps; + + // Extract SIMD extension + isa_caps.neon = external_caps & AclCpuCapabilitiesNeon; + isa_caps.sve = external_caps & AclCpuCapabilitiesSve; + isa_caps.sve2 = external_caps & AclCpuCapabilitiesSve2; + + // Extract data-type support + isa_caps.fp16 = external_caps & AclCpuCapabilitiesFp16; + isa_caps.bf16 = external_caps & AclCpuCapabilitiesBf16; + isa_caps.svebf16 = isa_caps.bf16; + + // Extract ISA extensions + isa_caps.dot = external_caps & AclCpuCapabilitiesDot; + isa_caps.i8mm = external_caps & AclCpuCapabilitiesMmlaInt8; + isa_caps.svef32mm = external_caps & AclCpuCapabilitiesMmlaFp; + + return isa_caps; +} + +CpuCapabilities populate_capabilities(AclTargetCapabilities external_caps, int32_t max_threads) +{ + CpuCapabilities caps; + + // Populate capabilities with system information + caps.cpu_info = cpuinfo::CpuInfo::build(); + if (external_caps != AclCpuCapabilitiesAuto) + { + cpuinfo::CpuIsaInfo isa = populate_capabilities_flags(external_caps); + auto cpus = caps.cpu_info.cpus(); + + caps.cpu_info = cpuinfo::CpuInfo(isa, cpus); + } + + // Set max number of threads +#if defined(BARE_METAL) + ARM_COMPUTE_UNUSED(max_threads); + caps.max_threads = 1; +#else /* defined(BARE_METAL) */ + caps.max_threads = (max_threads > 0) ? max_threads : std::thread::hardware_concurrency(); +#endif /* defined(BARE_METAL) */ + + return caps; +} +} // namespace + +CpuContext::CpuContext(const AclContextOptions *options) + : IContext(Target::Cpu), _allocator(default_allocator), _caps(populate_capabilities(AclCpuCapabilitiesAuto, -1)) +{ + if (options != nullptr) + { + _allocator = populate_allocator(options->allocator); + _caps = populate_capabilities(options->capabilities, options->max_compute_units); + } +} + +const CpuCapabilities &CpuContext::capabilities() const +{ + return _caps; +} + +AllocatorWrapper &CpuContext::allocator() +{ + return _allocator; +} + +ITensorV2 *CpuContext::create_tensor(const AclTensorDescriptor &desc, bool allocate) +{ + CpuTensor *tensor = new CpuTensor(this, desc); + if (tensor != nullptr && allocate) + { + tensor->allocate(); + } + return tensor; +} + +IQueue *CpuContext::create_queue(const AclQueueOptions *options) +{ + return new CpuQueue(this, options); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/CpuContext.h b/src/cpu/CpuContext.h new file mode 100644 index 0000000000..0c8ae49f49 --- /dev/null +++ b/src/cpu/CpuContext.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CPU_CPUCONTEXT_H +#define SRC_CPU_CPUCONTEXT_H + +#include "src/common/AllocatorWrapper.h" +#include "src/common/cpuinfo/CpuInfo.h" +#include "src/common/IContext.h" + +namespace arm_compute +{ +namespace cpu +{ +/** Structure that encodes the CPU capabilities to be used */ +struct CpuCapabilities +{ + cpuinfo::CpuInfo cpu_info{}; + int32_t max_threads{-1}; +}; + +/** CPU context implementation class */ +class CpuContext final : public IContext +{ +public: + /** Default Constructor + * + * @param[in] options Creational options + */ + explicit CpuContext(const AclContextOptions *options); + /** Cpu Capabilities accessor + * + * @return The ISA capabilities to be used by the CPU + */ + const CpuCapabilities &capabilities() const; + /** Backing memory allocator accessor + * + * @return Allocator that allocates CPU memory + */ + AllocatorWrapper &allocator(); + + // Inherrited methods overridden + ITensorV2 *create_tensor(const AclTensorDescriptor &desc, bool allocate) override; + IQueue *create_queue(const AclQueueOptions *options) override; + std::tuple<IOperator *, StatusCode> create_activation(const AclTensorDescriptor &src, + const AclTensorDescriptor &dst, + const AclActivationDescriptor &act, + bool is_validate) override; + +private: + AllocatorWrapper _allocator; + CpuCapabilities _caps; +}; +} // namespace cpu +} // namespace arm_compute + +#endif /* SRC_CPU_CPUCONTEXT_H */ diff --git a/src/cpu/CpuQueue.cpp b/src/cpu/CpuQueue.cpp new file mode 100644 index 0000000000..be781d6794 --- /dev/null +++ b/src/cpu/CpuQueue.cpp @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/cpu/CpuQueue.h" + +#include "arm_compute/runtime/Scheduler.h" + +namespace arm_compute +{ +namespace cpu +{ +CpuQueue::CpuQueue(IContext *ctx, const AclQueueOptions *options) : IQueue(ctx) +{ + ARM_COMPUTE_UNUSED(options); +} + +arm_compute::IScheduler &CpuQueue::scheduler() +{ + return arm_compute::Scheduler::get(); +} + +StatusCode CpuQueue::finish() +{ + return StatusCode::Success; +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/CpuQueue.h b/src/cpu/CpuQueue.h new file mode 100644 index 0000000000..b6a2be0e23 --- /dev/null +++ b/src/cpu/CpuQueue.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CPU_CPUQUEUE_H +#define SRC_CPU_CPUQUEUE_H + +#include "arm_compute/runtime/IScheduler.h" + +#include "src/common/IQueue.h" + +namespace arm_compute +{ +namespace cpu +{ +/** CPU queue implementation class */ +class CpuQueue final : public IQueue +{ +public: + /** Construct a new CpuQueue object + * + * @param[in] ctx Context to be used + * @param[in] options Command queue options + */ + CpuQueue(IContext *ctx, const AclQueueOptions *options); + /** Return legacy scheduler + * + * @return arm_compute::IScheduler& + */ + arm_compute::IScheduler &scheduler(); + + // Inherited functions overridden + StatusCode finish() override; +}; +} // namespace cpu +} // namespace arm_compute +#endif /* SRC_CPU_CPUQUEUE_H */ diff --git a/src/cpu/CpuTensor.cpp b/src/cpu/CpuTensor.cpp new file mode 100644 index 0000000000..59082b5350 --- /dev/null +++ b/src/cpu/CpuTensor.cpp @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/CpuTensor.h" + +#include "src/common/utils/LegacySupport.h" + +namespace arm_compute +{ +namespace cpu +{ +CpuTensor::CpuTensor(IContext *ctx, const AclTensorDescriptor &desc) : ITensorV2(ctx), _legacy_tensor() +{ + ARM_COMPUTE_ASSERT((ctx != nullptr) && (ctx->type() == Target::Cpu)); + _legacy_tensor = std::make_unique<Tensor>(); + _legacy_tensor->allocator()->init(arm_compute::detail::convert_to_legacy_tensor_info(desc)); +} + +void *CpuTensor::map() +{ + ARM_COMPUTE_ASSERT(_legacy_tensor.get() != nullptr); + + if (_legacy_tensor == nullptr) + { + ARM_COMPUTE_LOG_ERROR_ACL("[CpuTensor:map]: Backing tensor does not exist!"); + return nullptr; + } + return _legacy_tensor->buffer(); +} + +StatusCode CpuTensor::allocate() +{ + ARM_COMPUTE_ASSERT(_legacy_tensor.get() != nullptr); + + _legacy_tensor->allocator()->allocate(); + return StatusCode::Success; +} + +StatusCode CpuTensor::unmap() +{ + // No-op + return StatusCode::Success; +} + +StatusCode CpuTensor::import(void *handle, ImportMemoryType type) +{ + ARM_COMPUTE_ASSERT(_legacy_tensor.get() != nullptr); + ARM_COMPUTE_UNUSED(type); + + const auto st = _legacy_tensor->allocator()->import_memory(handle); + return bool(st) ? StatusCode::Success : StatusCode::RuntimeError; +} + +arm_compute::ITensor *CpuTensor::tensor() const +{ + return _legacy_tensor.get(); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/CpuTensor.h b/src/cpu/CpuTensor.h new file mode 100644 index 0000000000..89931e1f94 --- /dev/null +++ b/src/cpu/CpuTensor.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef SRC_CPU_CPUTENSOR_H +#define SRC_CPU_CPUTENSOR_H + +#include "arm_compute/runtime/Tensor.h" + +#include "src/common/ITensorV2.h" + +namespace arm_compute +{ +namespace cpu +{ +/** CPU tensor implementation class */ +class CpuTensor final : public ITensorV2 +{ +public: + /** Construct a new Cpu Tensor object + * + * @param[in] ctx Context to be used + * @param[in] desc Tensor descriptor + */ + CpuTensor(IContext *ctx, const AclTensorDescriptor &desc); + /** Allocates tensor + * + * @return StatusCode A status code + */ + StatusCode allocate(); + + // Inherrited functions overriden + void *map() override; + StatusCode unmap() override; + arm_compute::ITensor *tensor() const override; + StatusCode import(void *handle, ImportMemoryType type) override; + +private: + std::unique_ptr<Tensor> _legacy_tensor; +}; +} // namespace cpu +} // namespace arm_compute + +#endif /* SRC_CPU_CPUTENSOR_H */ diff --git a/src/cpu/CpuTypes.h b/src/cpu/CpuTypes.h new file mode 100644 index 0000000000..8726bc470a --- /dev/null +++ b/src/cpu/CpuTypes.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPUTYPES +#define ARM_COMPUTE_CPUTYPES + +namespace arm_compute +{ +/* Type definitions compatible with arm_neon.h and arm_sve.h */ +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +typedef __fp16 float16_t; +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +typedef float float32_t; +} // namespace arm_compute + +#endif /* ARM_COMPUTE_CPUTYPES */ diff --git a/src/cpu/ICpuKernel.h b/src/cpu/ICpuKernel.h new file mode 100644 index 0000000000..bcd0cb2c70 --- /dev/null +++ b/src/cpu/ICpuKernel.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2021-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_ICPUKERNEL_H +#define ARM_COMPUTE_ICPUKERNEL_H + +#include "arm_compute/core/CPP/ICPPKernel.h" + +#include "src/cpu/kernels/CpuKernelSelectionTypes.h" + +namespace arm_compute +{ +namespace cpu +{ +enum class KernelSelectionType +{ + Preferred, /**< Retrieve the best implementation available for the given Cpu ISA, ignoring the build flags */ + Supported /**< Retrieve the best implementation available for the given Cpu ISA that is supported by the current build */ +}; + +template <class Derived> +class ICpuKernel : public ICPPKernel +{ +public: + /** Micro-kernel selector + * + * @param[in] selector Selection struct passed including information to help pick the appropriate micro-kernel + * @param[in] selection_type (Optional) Decides whether to get the best implementation for the given hardware or for the given build + * + * @return A matching micro-kernel else nullptr + */ + + template <typename SelectorType> + static const auto *get_implementation(const SelectorType &selector, + KernelSelectionType selection_type = KernelSelectionType::Supported) + { + using kernel_type = + typename std::remove_reference<decltype(Derived::get_available_kernels())>::type::value_type; + + for (const auto &uk : Derived::get_available_kernels()) + { + if (uk.is_selected(selector) && (selection_type == KernelSelectionType::Preferred || uk.ukernel != nullptr)) + { + return &uk; + } + } + + return static_cast<kernel_type *>(nullptr); + } +}; +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_ICPUKERNEL_H */ diff --git a/src/cpu/ICpuOperator.h b/src/cpu/ICpuOperator.h new file mode 100644 index 0000000000..70ab4364c7 --- /dev/null +++ b/src/cpu/ICpuOperator.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ARM_COMPUTE_ICPUOPERATOR_H +#define ARM_COMPUTE_ICPUOPERATOR_H + +#include "arm_compute/runtime/NEON/INEOperator.h" + +namespace arm_compute +{ +namespace cpu +{ +using ICpuOperator = experimental::INEOperator; +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_ICPUOPERATOR_H */ diff --git a/src/cpu/kernels/CpuActivationKernel.cpp b/src/cpu/kernels/CpuActivationKernel.cpp new file mode 100644 index 0000000000..7cfa39b286 --- /dev/null +++ b/src/cpu/kernels/CpuActivationKernel.cpp @@ -0,0 +1,375 @@ +/* + * Copyright (c) 2017-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuActivationKernel.h" + +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" + +#include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/cpu/kernels/activation/list.h" + +#include <array> + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +static const std::vector<CpuActivationKernel::ActivationKernel> available_kernels = { +#ifdef ARM_COMPUTE_ENABLE_SVE + {"sve2_q8_activation_lut", + [](const ActivationDataTypeISASelectorData &data) + { + return (data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED) && + data.cpumodel == CPUModel::A510 && data.isa.sve2 && + data.f != ActivationLayerInfo::ActivationFunction::RELU; + }, + REGISTER_QASYMM8_SVE2(arm_compute::cpu::sve2_q8_activation_lut)}, +#endif // ARM_COMPUTE_ENABLE_SVE +#ifdef __aarch64__ + {// Neon LUT implementantion takes precedence + "neon_q8_activation_lut", + [](const ActivationDataTypeISASelectorData &data) + { + return (data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED) && + data.f != ActivationLayerInfo::ActivationFunction::RELU; + }, + REGISTER_Q8_NEON(arm_compute::cpu::neon_q8_activation_lut)}, +#endif // __aarch64__ + {"sve2_qu8_activation", + [](const ActivationDataTypeISASelectorData &data) { + return data.dt == DataType::QASYMM8 && data.isa.sve2 && + data.f != ActivationLayerInfo::ActivationFunction::GELU; + }, + REGISTER_QASYMM8_SVE2(arm_compute::cpu::sve2_qasymm8_activation)}, + {"sve2_qs8_activation", + [](const ActivationDataTypeISASelectorData &data) + { + return data.dt == DataType::QASYMM8_SIGNED && data.isa.sve2 && + data.f != 
ActivationLayerInfo::ActivationFunction::GELU; + }, + REGISTER_QASYMM8_SIGNED_SVE2(arm_compute::cpu::sve2_qasymm8_signed_activation)}, + {"sve2_qs16_activation", + [](const ActivationDataTypeISASelectorData &data) { + return data.dt == DataType::QSYMM16 && data.isa.sve2 && + data.f != ActivationLayerInfo::ActivationFunction::GELU; + }, + REGISTER_QSYMM16_SVE2(arm_compute::cpu::sve2_qsymm16_activation)}, + {"sve_fp16_activation_lut", + [](const ActivationDataTypeISASelectorData &data) + { + return data.dt == DataType::F16 && data.isa.fp16 && data.isa.sve && + data.f == ActivationLayerInfo::ActivationFunction::LOGISTIC; + }, + REGISTER_FP16_SVE(arm_compute::cpu::sve_fp16_activation_lut)}, + {"sve_fp16_activation", + [](const ActivationDataTypeISASelectorData &data) + { + return data.dt == DataType::F16 && data.isa.sve && data.isa.fp16 && + data.f != ActivationLayerInfo::ActivationFunction::GELU; + }, + REGISTER_FP16_SVE(arm_compute::cpu::sve_fp16_activation)}, + {"sve_fp32_activation", + [](const ActivationDataTypeISASelectorData &data) + { return data.dt == DataType::F32 && data.isa.sve && data.f != ActivationLayerInfo::ActivationFunction::GELU; }, + REGISTER_FP32_SVE(arm_compute::cpu::sve_fp32_activation)}, + {"neon_fp16_activation", + [](const ActivationDataTypeISASelectorData &data) { return data.dt == DataType::F16 && data.isa.fp16; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_activation)}, + {"neon_fp32_activation", [](const ActivationDataTypeISASelectorData &data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_activation)}, + {"neon_qu8_activation", [](const ActivationDataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8; }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_activation)}, + {"neon_qs8_activation", + [](const ActivationDataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8_SIGNED; }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_activation)}, + {"neon_qs16_activation", [](const ActivationDataTypeISASelectorData &data) { return data.dt == DataType::QSYMM16; }, + REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qsymm16_activation)}, +}; + +/* Supported activation in the 8-bit integer domain */ +static const std::array<ActivationLayerInfo::ActivationFunction, 8> qasymm8_activations = { + ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, ActivationLayerInfo::ActivationFunction::LOGISTIC, + ActivationLayerInfo::ActivationFunction::TANH, ActivationLayerInfo::ActivationFunction::HARD_SWISH, + ActivationLayerInfo::ActivationFunction::LEAKY_RELU, ActivationLayerInfo::ActivationFunction::GELU, +}; +/* Supported activation in the 16-bit integer domain */ +static const std::array<ActivationLayerInfo::ActivationFunction, 4> qsymm16_activations = { + ActivationLayerInfo::ActivationFunction::LOGISTIC, ActivationLayerInfo::ActivationFunction::TANH, + ActivationLayerInfo::ActivationFunction::HARD_SWISH, ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU}; + +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &activation_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, + DataType::QSYMM16, DataType::F16, DataType::F32); + + const auto *uk = 
CpuActivationKernel::get_implementation(ActivationDataTypeISASelectorData{ + src->data_type(), CPUInfo::get().get_cpu_model(), CPUInfo::get().get_isa(), activation_info.activation()}); + ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); + + const DataType data_type = src->data_type(); + const QuantizationInfo &oq_info = (dst != nullptr) ? dst->quantization_info() : src->quantization_info(); + const ActivationLayerInfo::ActivationFunction f_act = activation_info.activation(); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + is_data_type_quantized_asymmetric(data_type) && + (std::find(std::begin(qasymm8_activations), std::end(qasymm8_activations), f_act) == + std::end(qasymm8_activations)), + "For QASYMM8 only hard swish, leaky relu, tanh, logistic, relu and lower/upper bounded relu are supported"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized_symmetric(data_type) && + (std::find(std::begin(qsymm16_activations), std::end(qsymm16_activations), + f_act) == std::end(qsymm16_activations)), + "For QSYMM16 only tanh and logistic are supported"); + ARM_COMPUTE_RETURN_ERROR_ON((data_type == DataType::QASYMM8 || data_type == DataType::QASYMM16) && + (f_act == ActivationLayerInfo::ActivationFunction::TANH) && + (oq_info != QuantizationInfo(1.f / 128.f, 128))); + ARM_COMPUTE_RETURN_ERROR_ON((data_type == DataType::QASYMM8 || data_type == DataType::QASYMM16) && + (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && + (oq_info != QuantizationInfo(1.f / 256.f, 0))); + + ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && + (f_act == ActivationLayerInfo::ActivationFunction::TANH) && + (oq_info != QuantizationInfo(1.f / 128.f, 0))); + ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && + (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && + (oq_info != QuantizationInfo(1.f / 256.f, -128))); + + ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && + (f_act == ActivationLayerInfo::ActivationFunction::TANH) && + (oq_info != QuantizationInfo(1.f / 32768.f, 0))); + ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && + (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && + (oq_info != QuantizationInfo(1.f / 32768.f, 0))); + + // Checks performed when dst is configured + if ((dst != nullptr) && (dst->total_size() != 0)) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + } + + return Status{}; +} + +std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src, ITensorInfo *dst) +{ + // Configure kernel window + Window win = calculate_max_window(*src, Steps()); + + if (dst != nullptr) + { + // dst auto inizialitation if not yet initialized + auto_init_if_empty(*dst, *src->clone()); + } + + return std::make_pair(Status{}, win); +} +#ifdef __aarch64__ +void init_lut(ActivationLayerInfo::ActivationFunction act_func, + DataType data_type, + const UniformQuantizationInfo &qi_in, + const UniformQuantizationInfo &qi_out, + ActivationLayerInfo::LookupTable256 &lut, + float a, + float b) +{ + for (size_t i = 0; i < lut.size(); ++i) + { + float tmp_f = + (data_type == DataType::QASYMM8) ? 
dequantize_qasymm8(i, qi_in) : dequantize_qasymm8_signed(i, qi_in); + switch (act_func) + { + case ActivationLayerInfo::ActivationFunction::HARD_SWISH: + tmp_f = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f); + break; + case ActivationLayerInfo::ActivationFunction::LEAKY_RELU: + tmp_f = tmp_f > 0 ? tmp_f : tmp_f * a; + break; + case ActivationLayerInfo::ActivationFunction::LOGISTIC: + tmp_f = 1.f / (1.f + std::exp(-tmp_f)); + break; + case ActivationLayerInfo::ActivationFunction::ABS: + tmp_f = std::abs(tmp_f); + break; + case ActivationLayerInfo::ActivationFunction::LINEAR: + tmp_f = a * tmp_f + b; + break; + case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: + tmp_f = std::min<>(a, std::max(0.f, tmp_f)); + break; + case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: + tmp_f = std::min<>(a, std::max<>(b, tmp_f)); + break; + case ActivationLayerInfo::ActivationFunction::SOFT_RELU: + tmp_f = (tmp_f > 12.f) ? tmp_f : std::log(1.f + std::exp(tmp_f)); + break; + case ActivationLayerInfo::ActivationFunction::ELU: + tmp_f = (tmp_f >= 0) ? tmp_f : a * (std::exp(tmp_f) - 1); + break; + case ActivationLayerInfo::ActivationFunction::SQRT: + tmp_f = std::sqrt(tmp_f); + break; + case ActivationLayerInfo::ActivationFunction::SQUARE: + tmp_f = tmp_f * tmp_f; + break; + case ActivationLayerInfo::ActivationFunction::TANH: + tmp_f = a * std::tanh(b * tmp_f); + break; + case ActivationLayerInfo::ActivationFunction::IDENTITY: + break; + case ActivationLayerInfo::ActivationFunction::SWISH: + tmp_f = tmp_f / (1.f + std::exp(-a * tmp_f)); + break; + case ActivationLayerInfo::ActivationFunction::GELU: + tmp_f = tmp_f * (0.5f * (1.0f + erff(tmp_f / 1.41421356237f))); + break; + default: + ARM_COMPUTE_ERROR("Not supported"); + tmp_f = 0; + break; + } + lut[i] = + (data_type == DataType::QASYMM8) ? quantize_qasymm8(tmp_f, qi_out) : quantize_qasymm8_signed(tmp_f, qi_out); + } +} +#endif // __aarch64__ +} // namespace + +void CpuActivationKernel::configure(const ITensorInfo *src, ITensorInfo *dst, ActivationLayerInfo activation_info) +{ + ARM_COMPUTE_UNUSED(dst); + ARM_COMPUTE_ERROR_ON_NULLPTR(src); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, activation_info)); + + const auto uk = CpuActivationKernel::get_implementation(ActivationDataTypeISASelectorData{ + src->data_type(), CPUInfo::get().get_cpu_model(), CPUInfo::get().get_isa(), activation_info.activation()}); + if (dst != nullptr) + { + // dst auto inizialitation if not yet initialized + auto_init_if_empty(*dst, *src->clone()); + } + + ARM_COMPUTE_ERROR_ON_NULLPTR(uk); + + _run_method = uk->ukernel; + _name = std::string("CpuActivationKernel").append("/").append(uk->name); + +#ifdef __aarch64__ + // Initialise lut_manager + LUTManager &lut_manager = LUTManager::get_instance(); + + if ((src->data_type() == DataType::QASYMM8 || src->data_type() == DataType::QASYMM8_SIGNED) && + activation_info.activation() != ActivationFunction::RELU) + { + ActivationLayerInfo::LookupTable256 tmp_lut; + init_lut(activation_info.activation(), src->data_type(), src->quantization_info().uniform(), + (dst) ? 
dst->quantization_info().uniform() : src->quantization_info().uniform(), tmp_lut, + activation_info.a(), activation_info.b()); + activation_info.setLookupTable256(tmp_lut); + } + + if (src->data_type() == DataType::F16 && + activation_info.activation() == ActivationLayerInfo::ActivationFunction::LOGISTIC) + { + const LUTInfo info = {activation_info.activation(), src->data_type(), src->quantization_info()}; + activation_info.setLookupTable65536((lut_manager.get_lut_table(info))); + } +#endif // __aarch64__ + _act_info = activation_info; + + Window win; + + // Use squashed window + std::tie(win, _split_dimension) = calculate_squashed_or_max_window(*src); + ICPPKernel::configure(win); +} + +Status +CpuActivationKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_UNUSED(act_info); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, act_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(src->clone().get(), (dst != nullptr) ? dst->clone().get() : nullptr).first); + + return Status{}; +} + +size_t CpuActivationKernel::get_mws(const CPUInfo &platform, size_t thread_count) const +{ + ARM_COMPUTE_UNUSED(thread_count); + ARM_COMPUTE_UNUSED(platform); + + if (_split_dimension == Window::DimX) + { + // Don't split the work load too small if the tensor has been reinterpreted as 1D. + // This number is loosely chosen as threading overhead in each platform varies wildly. + return 1536; + } + return default_mws; +} + +void CpuActivationKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + // Early exit on disabled activation + if (!_act_info.enabled()) + { + return; + } + + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + ARM_COMPUTE_ERROR_ON(tensors.empty()); + ARM_COMPUTE_ERROR_ON(_run_method == nullptr); + + const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC); + ITensor *dst = tensors.get_tensor(TensorType::ACL_DST); + + _run_method(src, dst, _act_info, window); +} + +const char *CpuActivationKernel::name() const +{ + return _name.c_str(); +} + +const std::vector<CpuActivationKernel::ActivationKernel> &CpuActivationKernel::get_available_kernels() +{ + return available_kernels; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuActivationKernel.h b/src/cpu/kernels/CpuActivationKernel.h new file mode 100644 index 0000000000..c1487499d6 --- /dev/null +++ b/src/cpu/kernels/CpuActivationKernel.h @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2017-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_CPU_KERNELS_CPUACTIVATIONKERNEL_H +#define ACL_SRC_CPU_KERNELS_CPUACTIVATIONKERNEL_H + +#include "arm_compute/function_info/ActivationLayerInfo.h" + +#include "src/core/common/Macros.h" +#include "src/core/helpers/LUTManager.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Interface for the activation kernel */ +class CpuActivationKernel : public ICpuKernel<CpuActivationKernel> +{ +private: + using ActivationKernelPtr = + std::add_pointer<void(const ITensor *, ITensor *, const ActivationLayerInfo &, const Window &)>::type; + +public: + CpuActivationKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuActivationKernel); + /** Configure kernel for a given list of arguments + * + * @note If the output tensor is a nullptr, the activation function will be performed in-place + * + * @param[in, out] src Source tensor info. In case of @p dst tensor = nullptr, this tensor will store the result + * of the activation function. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32. + * @param[out] dst Destination tensor info. Data type supported: same as @p src + * @param[in] activation_info Activation layer information. + */ + void configure(const ITensorInfo *src, ITensorInfo *dst, ActivationLayerInfo activation_info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuActivationKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info); + + /** Return minimum workload size of the relevant kernel + * + * @param[in] platform The CPU platform used to create the context. + * @param[in] thread_count Number of threads in the execution. + * + * @return[out] small_network_mws Minimum workload size for requsted configuration. + */ + size_t get_mws(const CPUInfo &platform, size_t thread_count) const override; + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + + /** Get the preferred dimension in which the scheduler splits the work into multiple jobs. + * + * @return The split dimension hint. + */ + size_t get_split_dimension_hint() const + { + return _split_dimension; + } + + struct ActivationKernel + { + const char *name; + const ActivationDataTypeISASelectorDataPtr is_selected; + ActivationKernelPtr ukernel; + }; + + static const std::vector<ActivationKernel> &get_available_kernels(); + +private: + ActivationLayerInfo _act_info{}; + ActivationKernelPtr _run_method{nullptr}; + size_t _split_dimension{Window::DimY}; + std::string _name{}; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif // ACL_SRC_CPU_KERNELS_CPUACTIVATIONKERNEL_H diff --git a/src/cpu/kernels/CpuAddKernel.cpp b/src/cpu/kernels/CpuAddKernel.cpp new file mode 100644 index 0000000000..a990aa4715 --- /dev/null +++ b/src/cpu/kernels/CpuAddKernel.cpp @@ -0,0 +1,251 @@ +/* + * Copyright (c) 2021-2022 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuAddKernel.h" + +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" + +#include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/cpu/kernels/add/list.h" + +#include <array> + +#if defined(ENABLE_FP32_KERNELS) +namespace +{ +static constexpr size_t default_mws_N1_fp32_neon = 24536; +static constexpr size_t default_mws_V1_fp32_neon = 40510; +} // namespace +#endif /* ENABLE_FP32_KERNELS */ + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +static const std::vector<CpuAddKernel::AddKernel> available_kernels = { + {"neon_qu8_add_fixedpoint", + [](const CpuAddKernelDataTypeISASelectorData &data) + { return (data.dt == DataType::QASYMM8) && data.can_use_fixedpoint; }, + REGISTER_FP32_NEON(arm_compute::cpu::add_q8_neon_fixedpoint<uint8_t>)}, + {"neon_qs8_add_fixedpoint", + [](const CpuAddKernelDataTypeISASelectorData &data) + { return (data.dt == DataType::QASYMM8_SIGNED) && data.can_use_fixedpoint; }, + REGISTER_FP32_NEON(arm_compute::cpu::add_q8_neon_fixedpoint<int8_t>)}, + {"sve2_qu8_add", + [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8) && data.isa.sve2; }, + REGISTER_QASYMM8_SVE2(arm_compute::cpu::add_qasymm8_sve2)}, + {"sve2_qs8_add", + [](const CpuAddKernelDataTypeISASelectorData &data) + { return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve2; }, + REGISTER_QASYMM8_SIGNED_SVE2(arm_compute::cpu::add_qasymm8_signed_sve2)}, + {"sve2_qs16_add", + [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QSYMM16) && data.isa.sve2; }, + REGISTER_QSYMM16_SVE2(arm_compute::cpu::add_qsymm16_sve2)}, + {"sve_fp32_add", + [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::F32) && data.isa.sve; }, + REGISTER_FP32_SVE(arm_compute::cpu::add_fp32_sve)}, + {"sve_fp16_add", + [](const CpuAddKernelDataTypeISASelectorData &data) + { return (data.dt == DataType::F16) && data.isa.sve && data.isa.fp16; }, + REGISTER_FP16_SVE(arm_compute::cpu::add_fp16_sve)}, + {"sve_u8_add", + [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::U8) && data.isa.sve; }, + REGISTER_INTEGER_SVE(arm_compute::cpu::add_u8_sve)}, + 
{"sve_s16_add", + [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::S16) && data.isa.sve; }, + REGISTER_INTEGER_SVE(arm_compute::cpu::add_s16_sve)}, + {"sve_s32_add", + [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::S32) && data.isa.sve; }, + REGISTER_INTEGER_SVE(arm_compute::cpu::add_s32_sve)}, + {"neon_fp32_add", [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::F32); }, + REGISTER_FP32_NEON(arm_compute::cpu::add_fp32_neon)}, + {"neon_fp16_add", + [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.fp16; }, + REGISTER_FP16_NEON(arm_compute::cpu::add_fp16_neon)}, + {"neon_u8_add", [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::U8); }, + REGISTER_INTEGER_NEON(arm_compute::cpu::add_u8_neon)}, + {"neon_s16_add", [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::S16); }, + REGISTER_INTEGER_NEON(arm_compute::cpu::add_s16_neon)}, + {"neon_s32_add", [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::S32); }, + REGISTER_INTEGER_NEON(arm_compute::cpu::add_s32_neon)}, + {"neon_qu8_add", [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8); }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::add_qasymm8_neon)}, + {"neon_qs8_add", + [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED); }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::add_qasymm8_signed_neon)}, + {"neon_qs16_add", [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QSYMM16); }, + REGISTER_QSYMM16_NEON(arm_compute::cpu::add_qsymm16_neon)}}; + +Status +validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst, ConvertPolicy policy) +{ + ARM_COMPUTE_UNUSED(policy); + + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src0); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::U8, DataType::QASYMM8, + DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16, + DataType::F16, DataType::S32, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &src1); + + const TensorShape out_shape = TensorShape::broadcast_shape(src0.tensor_shape(), src1.tensor_shape()); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + (src0.tensor_shape().x() != src1.tensor_shape().x()) && + ((src0.data_type() != src1.data_type()) || (src0.data_type() != dst.data_type()) || + (src1.data_type() != dst.data_type())), + "Broadcasting across width is supported on configurations where all tensors have the same data type"); + + // Validate in case of configured dst + if (dst.total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &dst); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0), + "Wrong shape for dst"); + } + + const auto can_use_fixedpoint = add_q8_neon_fixedpoint_possible(&src0, &src1, &dst); + const auto uk = CpuAddKernel::get_implementation<CpuAddKernelDataTypeISASelectorData>( + CpuAddKernelDataTypeISASelectorData{src0.data_type(), CPUInfo::get().get_isa(), can_use_fixedpoint}); + ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); + + return Status{}; +} +} // namespace + +void CpuAddKernel::configure(const ITensorInfo *src0, const 
ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst, policy)); + + const auto can_use_fixedpoint = add_q8_neon_fixedpoint_possible(src0, src1, dst); + const auto uk = CpuAddKernel::get_implementation<CpuAddKernelDataTypeISASelectorData>( + CpuAddKernelDataTypeISASelectorData{src0->data_type(), CPUInfo::get().get_isa(), can_use_fixedpoint}); + + ARM_COMPUTE_ERROR_ON_NULLPTR(uk); + + _policy = policy; + _run_method = uk->ukernel; + _name = std::string("CpuAddKernel").append("/").append(uk->name); + + // Auto initialize dst if not initialized + const TensorShape &out_shape = TensorShape::broadcast_shape(src0->tensor_shape(), src1->tensor_shape()); + set_shape_if_empty(*dst, out_shape); + set_data_type_if_unknown(*dst, src0->data_type()); + + // Configure kernel window + Window win; + std::tie(win, _split_dimension) = calculate_squashed_or_max_window(*src0, *src1); + + ICpuKernel::configure(win); +} + +Status +CpuAddKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); + + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*src0, *src1, *dst, policy)); + + return Status{}; +} + +void CpuAddKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + + ARM_COMPUTE_ERROR_ON(tensors.empty()); + ARM_COMPUTE_ERROR_ON(_run_method == nullptr); + + const ITensor *src0 = tensors.get_const_tensor(TensorType::ACL_SRC_0); + const ITensor *src1 = tensors.get_const_tensor(TensorType::ACL_SRC_1); + ITensor *dst = tensors.get_tensor(TensorType::ACL_DST); + + _run_method(src0, src1, dst, _policy, window); +} + +const char *CpuAddKernel::name() const +{ + return _name.c_str(); +} + +const std::vector<CpuAddKernel::AddKernel> &CpuAddKernel::get_available_kernels() +{ + return available_kernels; +} + +size_t CpuAddKernel::get_mws(const CPUInfo &platform, size_t thread_count) const +{ + ARM_COMPUTE_UNUSED(thread_count); + +#if defined(ENABLE_FP32_KERNELS) + if (this->_run_method == &add_fp32_neon) + { + size_t mws = ICPPKernel::default_mws; + if (platform.get_cpu_model() == CPUModel::N1) + { + mws = default_mws_N1_fp32_neon; + } + else if (platform.get_cpu_model() == CPUModel::V1) + { + mws = default_mws_V1_fp32_neon; + } + else + { + return ICPPKernel::default_mws; + } + + // tensor is 1D or was re-interpreted as 1D + if (this->window().shape().num_dimensions() == 1) + { + return mws; + } + else + { + // scale mws down by the number of elements along all the dimensions (x, z, w, etc) except the one + // that we parallelize along (the y dimension). This allows for parallelization when the Y_SIZE is small + // but the other sizes are large, which boosts performance. 
+ mws = static_cast<size_t>(mws / (this->window().num_iterations_total() / this->window().num_iterations(1))); + return std::max(static_cast<size_t>(1), mws); + } + } +#else /* ENABLE_FP32_KERNELS */ + ARM_COMPUTE_UNUSED(platform); +#endif /* ENABLE_FP32_KERNELS */ + return ICPPKernel::default_mws; +} + +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuAddKernel.h b/src/cpu/kernels/CpuAddKernel.h new file mode 100644 index 0000000000..4adba8bb16 --- /dev/null +++ b/src/cpu/kernels/CpuAddKernel.h @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2016-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_ADD_KERNEL_H +#define ARM_COMPUTE_CPU_ADD_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Interface for the kernel to perform addition between two tensors */ +class CpuAddKernel : public ICpuKernel<CpuAddKernel> +{ +private: + using AddKernelPtr = std::add_pointer<void( + const ITensor *, const ITensor *, ITensor *, const ConvertPolicy &, const Window &)>::type; + +public: + struct AddKernel + { + const char *name; + const CpuAddKernelDataTypeISASelectorDataPtr is_selected; + AddKernelPtr ukernel; + }; + + CpuAddKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuAddKernel); + /** Initialise the kernel's input, dst and border mode. + * + * Valid configurations (src0,src1) -> dst : + * + * - (U8,U8) -> U8 + * - (S16,S16) -> S16 + * - (S32,S32) -> S32 + * - (F16,F16) -> F16 + * - (F32,F32) -> F32 + * - (QASYMM8,QASYMM8) -> QASYMM8 + * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED + * - (QSYMM16,QSYMM16) -> QSYMM16 + * + * @param[in] src0 First input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32 + * @param[in] src1 Second input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32 + * @param[out] dst The dst tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32. + * @param[in] policy Overflow policy. 
+ */ + void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuAddKernel::configure() + * + * @return a status + */ + static Status + validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + + /** Return minimum workload size of the relevant kernel + * + * @param[in] platform The CPU platform used to create the context. + * @param[in] thread_count Number of threads in the execution. + * + * @return[out] mws Minimum workload size for requested configuration. + */ + size_t get_mws(const CPUInfo &platform, size_t thread_count) const override; + + static const std::vector<AddKernel> &get_available_kernels(); + + size_t get_split_dimension() const + { + return _split_dimension; + } + +private: + ConvertPolicy _policy{}; + AddKernelPtr _run_method{nullptr}; + std::string _name{}; + size_t _split_dimension{Window::DimY}; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_ADD_KERNEL_H */ diff --git a/src/cpu/kernels/CpuAddMulAddKernel.cpp b/src/cpu/kernels/CpuAddMulAddKernel.cpp new file mode 100644 index 0000000000..6a632e8702 --- /dev/null +++ b/src/cpu/kernels/CpuAddMulAddKernel.cpp @@ -0,0 +1,209 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/cpu/kernels/CpuAddMulAddKernel.h" + +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" + +#include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/cpu/kernels/addmuladd/list.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +static const std::vector<CpuAddMulAddKernel::AddMulAddKernel> available_kernels = { +#ifdef __aarch64__ + {"neon_fp32_add_mul_add", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32); }, + REGISTER_FP32_NEON(arm_compute::cpu::add_mul_add_fp32_neon)}, + {"neon_fp16_add_mul_add", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16); }, + REGISTER_FP16_NEON(arm_compute::cpu::add_mul_add_fp16_neon)}, + {"neon_qasymm8_add_mul_add", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8); }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::add_mul_add_u8_neon)}, + {"neon_qasymm8_signed_add_mul_add", + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED); }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::add_mul_add_s8_neon)} +#endif // __aarch64__ +}; + +Status validate_arguments(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *bn_mul, + const ITensorInfo *bn_add, + const ITensorInfo *add_output, + const ITensorInfo *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, bn_mul, bn_add, final_output); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(policy != ConvertPolicy::SATURATE, "Only Saturate Policy is supported"); + + using ActFunction = ActivationLayerInfo::ActivationFunction; + const ActFunction act_func = act_info.activation(); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((act_func != ActFunction::BOUNDED_RELU && act_func != ActFunction::RELU && + act_func != ActFunction::LU_BOUNDED_RELU && act_func != ActFunction::IDENTITY), + "Only RELU Family activations, or no activation, is supported"); + + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input1); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2); + + if (is_data_type_quantized(input1->data_type())) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bn_mul, 1, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bn_add, 1, DataType::F32); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, bn_mul); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, bn_add); + } + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, input2); // No broadcasting + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mul, bn_add); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(bn_mul->num_dimensions() != 1, "BatchNorm coefficients should be 1D array"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(bn_mul->tensor_shape()[0] != input1->tensor_shape()[0], + "First dimensions of inputs and batchNorm coefs should match"); + + // Validate in case we have add layer's output (intermediate) initialized + if (add_output != nullptr && add_output->total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, add_output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, add_output); + } + + 
// Validate in case final output has been initialized + if (final_output->total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, final_output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, final_output); + } + + const auto uk = CpuAddMulAddKernel::get_implementation<DataTypeISASelectorData>( + DataTypeISASelectorData{input1->data_type(), CPUInfo::get().get_isa()}); + ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); + + return Status{}; +} +} // namespace + +void CpuAddMulAddKernel::configure(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *bn_mul, + const ITensorInfo *bn_add, + ITensorInfo *add_output, + ITensorInfo *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_UNUSED(bn_mul, bn_add, input2); + ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, bn_add, bn_mul, final_output); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info)); + + const auto uk = CpuAddMulAddKernel::get_implementation<DataTypeISASelectorData>( + DataTypeISASelectorData{input1->data_type(), CPUInfo::get().get_isa()}); + ARM_COMPUTE_ERROR_ON_NULLPTR(uk); + ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr); + + _policy = policy; + _act_info = act_info; + _run_method = uk->ukernel; + _name = std::string("CpuAddMulAddKernel/").append(uk->name); + + // Auto initialize outputs if not initialized + set_shape_if_empty(*final_output, input1->tensor_shape()); + set_data_type_if_unknown(*final_output, input1->data_type()); + + if (add_output != nullptr) + { + set_shape_if_empty(*add_output, input1->tensor_shape()); + set_data_type_if_unknown(*add_output, input1->data_type()); + } + + // Configure kernel window + Window win; + win = calculate_max_window(*final_output, Steps()); + ICpuKernel::configure(win); +} + +Status CpuAddMulAddKernel::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *bn_mul, + const ITensorInfo *bn_add, + const ITensorInfo *add_output, + const ITensorInfo *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, bn_mul, bn_add, final_output); + + ARM_COMPUTE_RETURN_ON_ERROR( + validate_arguments(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info)); + + return Status{}; +} + +void CpuAddMulAddKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + + ARM_COMPUTE_ERROR_ON(tensors.empty()); + ARM_COMPUTE_ERROR_ON(_run_method == nullptr); + + const ITensor *input1 = tensors.get_const_tensor(TensorType::ACL_SRC_0); + const ITensor *input2 = tensors.get_const_tensor(TensorType::ACL_SRC_1); + const ITensor *bn_mul = tensors.get_const_tensor(TensorType::ACL_SRC_2); + const ITensor *bn_add = tensors.get_const_tensor(TensorType::ACL_SRC_3); + ITensor *add_output = tensors.get_tensor(TensorType::ACL_DST_0); + ITensor *final_output = tensors.get_tensor(TensorType::ACL_DST_1); + + _run_method(input1, input2, bn_mul, bn_add, add_output, final_output, _policy, _act_info, window); +} + +const char *CpuAddMulAddKernel::name() const +{ + return _name.c_str(); +} + +const std::vector<CpuAddMulAddKernel::AddMulAddKernel> &CpuAddMulAddKernel::get_available_kernels() +{ + return available_kernels; +} +} // namespace 
kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuAddMulAddKernel.h b/src/cpu/kernels/CpuAddMulAddKernel.h new file mode 100644 index 0000000000..c5e31ec291 --- /dev/null +++ b/src/cpu/kernels/CpuAddMulAddKernel.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef SRC_CPU_KERNELS_CPUADDMULADDKERNEL +#define SRC_CPU_KERNELS_CPUADDMULADDKERNEL + +#include "arm_compute/function_info/ActivationLayerInfo.h" + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Interface for the kernel to perform addition between two tensors */ +class CpuAddMulAddKernel : public ICpuKernel<CpuAddMulAddKernel> +{ +private: + using AddMulAddKernelPtr = std::add_pointer<void(const ITensor *, + const ITensor *, + const ITensor *, + const ITensor *, + ITensor *, + ITensor *, + ConvertPolicy, + const ActivationLayerInfo &, + const Window &)>::type; + +public: + struct AddMulAddKernel + { + const char *name; + const DataTypeISASelectorPtr is_selected; + AddMulAddKernelPtr ukernel; + }; + + CpuAddMulAddKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuAddMulAddKernel); + /** Initialize the kernel's inputs and outputs. 
+ * + * Similar to @ref NEAddMulAdd::configure() + * + */ + void configure(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *bn_mul, + const ITensorInfo *bn_add, + ITensorInfo *add_output, + ITensorInfo *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuAddMulAddKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *bn_mul, + const ITensorInfo *bn_add, + const ITensorInfo *add_output, + const ITensorInfo *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + + static const std::vector<AddMulAddKernel> &get_available_kernels(); + +private: + ConvertPolicy _policy{}; + ActivationLayerInfo _act_info{}; + AddMulAddKernelPtr _run_method{nullptr}; + std::string _name{}; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* SRC_CPU_KERNELS_CPUADDMULADDKERNEL */ diff --git a/src/cpu/kernels/CpuCastKernel.cpp b/src/cpu/kernels/CpuCastKernel.cpp new file mode 100644 index 0000000000..05c7742b03 --- /dev/null +++ b/src/cpu/kernels/CpuCastKernel.cpp @@ -0,0 +1,1179 @@ +/* + * Copyright (c) 2016-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/cpu/kernels/CpuCastKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" + +#include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NEFixedPoint.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/kernels/cast/list.h" +#include "support/SaturateCast.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +static const std::vector<CpuCastKernel::CastKernel> available_kernels = { + {"neon_qs8_cast", + [](const CastDataTypeISASelectorData &data) + { return data.src_dt == DataType::QASYMM8_SIGNED && data.dst_dt == DataType::F16 && data.isa.fp16; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_qasymm8_signed_to_fp16_cast)}, + {"neon_qu8_cast", + [](const CastDataTypeISASelectorData &data) + { return data.src_dt == DataType::QASYMM8 && data.dst_dt == DataType::F16 && data.isa.fp16; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_u8_to_fp16_cast)}, + {"neon_u8_cast", + [](const CastDataTypeISASelectorData &data) + { return data.src_dt == DataType::U8 && data.dst_dt == DataType::F16 && data.isa.fp16; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_u8_to_fp16_cast)}, + {"neon_fp16_cast", + [](const CastDataTypeISASelectorData &data) { return data.src_dt == DataType::F16 && data.isa.fp16; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_to_other_dt_cast)}, + {"neon_fp32_to_fp16_cast", + [](const CastDataTypeISASelectorData &data) + { return data.src_dt == DataType::F32 && data.dst_dt == DataType::F16 && data.isa.fp16; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_fp32_to_fp16_cast)}, + {"neon_s32_cast", + [](const CastDataTypeISASelectorData &data) + { return data.src_dt == DataType::S32 && data.dst_dt == DataType::F16 && data.isa.fp16; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_s32_to_fp16_cast)}, +}; + +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy) +{ + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(dst); + ARM_COMPUTE_UNUSED(policy); + ARM_COMPUTE_RETURN_ERROR_ON(src == dst); +#ifdef __aarch64__ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, + DataType::U8, DataType::S16, DataType::U16, DataType::F16, + DataType::F32, DataType::S32, DataType::S64, DataType::U64); + + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, + DataType::U8, DataType::S16, DataType::U16, DataType::F16, + DataType::U32, DataType::S32, DataType::F32, DataType::S64); + +#else // __aarch64__ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, + DataType::U8, DataType::S16, DataType::U16, DataType::F16, + DataType::F32, DataType::S32); + + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, + DataType::U8, DataType::S16, DataType::U16, DataType::F16, + DataType::U32, DataType::S32, DataType::F32); +#endif // __aarch64__ + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::QASYMM8_SIGNED && + (dst->data_type() != DataType::S16 && dst->data_type() != DataType::S32 && + dst->data_type() != 
DataType::F16 && dst->data_type() != DataType::F32), + "Only data_types supported [in] QASYMM8 -> [out] U16, S16, S32, F16, F32"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::QASYMM8 && + (dst->data_type() != DataType::S16 && dst->data_type() != DataType::U16 && + dst->data_type() != DataType::S32 && dst->data_type() != DataType::F16 && + dst->data_type() != DataType::F32), + "Only data_types supported [in] QASYMM8 -> [out] U16, S16, S32, F16, F32"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::U8 && + (dst->data_type() != DataType::S16 && dst->data_type() != DataType::U16 && + dst->data_type() != DataType::S32 && dst->data_type() != DataType::F16 && + dst->data_type() != DataType::F32), + "Only data_types supported [in] U8 -> [out] U16, S16, S32, F16, F32"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::U16 && + (dst->data_type() != DataType::U8 && dst->data_type() != DataType::U32), + "Only data_types supported [in] U16 -> [out] U8, U32"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S16 && + (dst->data_type() != DataType::QASYMM8_SIGNED && + dst->data_type() != DataType::U8 && dst->data_type() != DataType::S32), + "Only data_types supported [in] S16 -> [out] U8, S32"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::F16 && + (dst->data_type() != DataType::QASYMM8_SIGNED && + dst->data_type() != DataType::QASYMM8 && dst->data_type() != DataType::U8 && + dst->data_type() != DataType::F32 && dst->data_type() != DataType::S32), + "Only data_types supported [in] F16 -> [out] QASYMM8, F32, S32, U8"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::F32 && + (dst->data_type() != DataType::QASYMM8_SIGNED && + dst->data_type() != DataType::QASYMM8 && dst->data_type() != DataType::F16 && + dst->data_type() != DataType::S32 && dst->data_type() != DataType::U8), + "Only data_types supported [in] F32 -> [out] QASYMM8, F16, S32, U8"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S32 && + (dst->data_type() != DataType::QASYMM8_SIGNED && + dst->data_type() != DataType::QASYMM8 && dst->data_type() != DataType::F16 && + dst->data_type() != DataType::F32 && dst->data_type() != DataType::U8 && + dst->data_type() != DataType::S64), + "Only data_types supported [in] S32 -> [out] QASYMM8, F16, F32, U8, S64"); +#ifdef __aarch64__ + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S64 && dst->data_type() != DataType::F32, + "Only data_types supported [in] S64 -> [out] F32"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::U64 && dst->data_type() != DataType::F32, + "Only data_types supported [in] U64 -> [out] F32"); +#endif // __aarch64__ + + // Validate in case of configured dst + if (dst->total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); + } + + return Status{}; +} +} // namespace + +void CpuCastKernel::configure(const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + + // Auto initialize dst shape if not initialized (We can only auto-configure the shape, datatype must be given) + set_shape_if_empty(*dst, src->tensor_shape()); + + _policy = policy; + + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, policy)); + + // Configure kernel window + Window win = calculate_max_window(*src, Steps()); + + ICPPKernel::configure(win); +} + +Status CpuCastKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy) +{ + 
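    // validate_arguments() above encodes the supported conversion table; for example an F32
    // source may be cast to QASYMM8_SIGNED, QASYMM8, F16, S32 or U8, while F32 -> U16 is rejected.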
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, policy)); + return Status{}; +} +#ifdef __aarch64__ +namespace +{ +template <typename T1, typename T2> +inline void internal_neon_convert(const T1 *src_ptr, T2 *dst_ptr) +{ + ARM_COMPUTE_UNUSED(src_ptr); + ARM_COMPUTE_UNUSED(dst_ptr); +} + +template <> +inline void internal_neon_convert<int32_t, int64_t>(const int32_t *src_ptr, int64_t *dst_ptr) +{ + const int32x4x4_t texels = { + {vld1q_s32(src_ptr), vld1q_s32(src_ptr + 4), vld1q_s32(src_ptr + 8), vld1q_s32(src_ptr + 12)}}; + vst1q_s64(dst_ptr, vmovl_s32(vget_low_s32(texels.val[0]))); + vst1q_s64(dst_ptr + 2, vmovl_s32(vget_high_s32(texels.val[0]))); + vst1q_s64(dst_ptr + 4, vmovl_s32(vget_low_s32(texels.val[1]))); + vst1q_s64(dst_ptr + 6, vmovl_s32(vget_high_s32(texels.val[1]))); + vst1q_s64(dst_ptr + 8, vmovl_s32(vget_low_s32(texels.val[2]))); + vst1q_s64(dst_ptr + 10, vmovl_s32(vget_high_s32(texels.val[2]))); + vst1q_s64(dst_ptr + 12, vmovl_s32(vget_low_s32(texels.val[3]))); + vst1q_s64(dst_ptr + 14, vmovl_s32(vget_high_s32(texels.val[3]))); +} + +template <> +inline void internal_neon_convert<int64_t, float>(const int64_t *src_ptr, float *dst_ptr) +{ + const float64x2x4_t texels0 = {{vcvtq_f64_s64(vld1q_s64(src_ptr)), vcvtq_f64_s64(vld1q_s64(src_ptr + 2)), + vcvtq_f64_s64(vld1q_s64(src_ptr + 4)), vcvtq_f64_s64(vld1q_s64(src_ptr + 6))}}; + const float64x2x4_t texels1 = {{vcvtq_f64_s64(vld1q_s64(src_ptr + 8)), vcvtq_f64_s64(vld1q_s64(src_ptr + 10)), + vcvtq_f64_s64(vld1q_s64(src_ptr + 12)), vcvtq_f64_s64(vld1q_s64(src_ptr + 14))}}; + const float32x4x4_t texels = {{vcombine_f32(vcvt_f32_f64(texels0.val[0]), vcvt_f32_f64(texels0.val[1])), + vcombine_f32(vcvt_f32_f64(texels0.val[2]), vcvt_f32_f64(texels0.val[3])), + vcombine_f32(vcvt_f32_f64(texels1.val[0]), vcvt_f32_f64(texels1.val[1])), + vcombine_f32(vcvt_f32_f64(texels1.val[2]), vcvt_f32_f64(texels1.val[3]))}}; + vst1q_f32(dst_ptr, texels.val[0]); + vst1q_f32(dst_ptr + 4, texels.val[1]); + vst1q_f32(dst_ptr + 8, texels.val[2]); + vst1q_f32(dst_ptr + 12, texels.val[3]); +} + +template <> +inline void internal_neon_convert<uint64_t, float>(const uint64_t *src_ptr, float *dst_ptr) +{ + const float64x2x4_t texels0 = {{vcvtq_f64_u64(vld1q_u64(src_ptr)), vcvtq_f64_u64(vld1q_u64(src_ptr + 2)), + vcvtq_f64_u64(vld1q_u64(src_ptr + 4)), vcvtq_f64_u64(vld1q_u64(src_ptr + 6))}}; + const float64x2x4_t texels1 = {{vcvtq_f64_u64(vld1q_u64(src_ptr + 8)), vcvtq_f64_u64(vld1q_u64(src_ptr + 10)), + vcvtq_f64_u64(vld1q_u64(src_ptr + 12)), vcvtq_f64_u64(vld1q_u64(src_ptr + 14))}}; + + const float32x4x4_t texels = {{vcombine_f32(vcvt_f32_f64(texels0.val[0]), vcvt_f32_f64(texels0.val[1])), + vcombine_f32(vcvt_f32_f64(texels0.val[2]), vcvt_f32_f64(texels0.val[3])), + vcombine_f32(vcvt_f32_f64(texels1.val[0]), vcvt_f32_f64(texels1.val[1])), + vcombine_f32(vcvt_f32_f64(texels1.val[2]), vcvt_f32_f64(texels1.val[3]))}}; + + vst1q_f32(dst_ptr, texels.val[0]); + vst1q_f32(dst_ptr + 4, texels.val[1]); + vst1q_f32(dst_ptr + 8, texels.val[2]); + vst1q_f32(dst_ptr + 12, texels.val[3]); +} + +template <typename T1, typename T2> +inline void +convert64(Iterator &src, Iterator &dst, const Window &win, int window_start_x, int window_end_x, int window_step_x) +{ + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto src_ptr = reinterpret_cast<const T1 *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<T2 *>(dst.ptr()); + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + internal_neon_convert<T1, 
T2>(src_ptr + x, dst_ptr + x); + } + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<T2>(*(src_ptr + x)); + } + }, + src, dst); +} +} // namespace +#endif // __aarch64__ + +void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const int window_step_x = 16; + + const ITensor *_src = tensors.get_const_tensor(TensorType::ACL_SRC); + ITensor *_dst = tensors.get_tensor(TensorType::ACL_DST); + ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); + ARM_COMPUTE_ERROR_ON(_src == _dst); + + ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); + + Window win{window}; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator src(_src, win); + Iterator dst(_dst, win); + + /*ukernel runs only when using fp16, so we validate it isn't a nullptr only before using it */ + const auto *uk = CpuCastKernel::get_implementation( + CastDataTypeISASelectorData{_src->info()->data_type(), _dst->info()->data_type(), CPUInfo::get().get_isa()}); + + switch (_src->info()->data_type()) + { +#ifdef __aarch64__ + case DataType::U64: + { + switch (_dst->info()->data_type()) + { + case DataType::F32: + { + convert64<uint64_t, float>(src, dst, win, window_start_x, window_end_x, window_step_x); + break; + } + default: + ARM_COMPUTE_ERROR("dst data type not supported"); + } + break; + } + case DataType::S64: + { + switch (_dst->info()->data_type()) + { + case DataType::F32: + { + convert64<int64_t, float>(src, dst, win, window_start_x, window_end_x, window_step_x); + break; + } + default: + ARM_COMPUTE_ERROR("dst data type not supported"); + } + break; + } +#endif // __aarch64__ + + case DataType::QASYMM8_SIGNED: + { + switch (_dst->info()->data_type()) + { + case DataType::S16: + { + /* Up-conversion QASYMM8_SIGNED -> S16 */ + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<int16_t *>(dst.ptr()); + int x = window_start_x; + + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int8x16_t texels_s8 = vld1q_s8(src_ptr + x); + + const int16x8x2_t texels = { + {vmovl_s8(vget_low_s8(texels_s8)), vmovl_s8(vget_high_s8(texels_s8))}}; + + vst1q_s16(dst_ptr + x, texels.val[0]); + vst1q_s16(dst_ptr + x + 8, texels.val[1]); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<int16_t>(*(src_ptr + x)); + } + }, + src, dst); + break; + } + case DataType::S32: + { + /* Up-conversion QASYMM8_SIGNED -> S32 */ + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr()); + int x = window_start_x; + + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int8x16_t texels_s8 = vld1q_s8(src_ptr + x); + + const int16x8x2_t texels = { + {vmovl_s8(vget_low_s8(texels_s8)), vmovl_s8(vget_high_s8(texels_s8))}}; + + vst1q_s32(dst_ptr + x, vmovl_s16(vget_low_s16(texels.val[0]))); + vst1q_s32(dst_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0]))); + vst1q_s32(dst_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1]))); + vst1q_s32(dst_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1]))); + } 
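                            // Each vector iteration above widens 16 int8 values in two steps
                            // (s8 -> s16 via vmovl_s8, then s16 -> s32 via vmovl_s16) and stores
                            // them as four int32x4_t vectors.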
+ + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x)); + } + }, + src, dst); + break; + } + case DataType::F32: + { + /* Up-conversion QASYMM8_SIGNED -> F32 */ + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<float *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int8x16_t texels_s8 = vld1q_s8(src_ptr + x); + + const int16x8x2_t texels = { + {vmovl_s8(vget_low_s8(texels_s8)), vmovl_s8(vget_high_s8(texels_s8))}}; + vst1q_f32(dst_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0])))); + vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0])))); + vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1])))); + vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1])))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<float>(*(src_ptr + x)); + } + }, + src, dst); + break; + } + case DataType::F16: + { + /* Up-conversion QASYMM8_SIGNED -> F16 */ + ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr); + uk->ukernel(_src, _dst, info, _policy, window); + break; + } + default: + ARM_COMPUTE_ERROR("dst data type not supported"); + } + break; + } + + case DataType::QASYMM8: + case DataType::U8: + { + switch (_dst->info()->data_type()) + { + case DataType::S16: + { + /* Up-conversion U8 -> S16 */ + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<int16_t *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x); + + const int16x8x2_t texels = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))}}; + + vst1q_s16(dst_ptr + x, texels.val[0]); + vst1q_s16(dst_ptr + x + 8, texels.val[1]); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x)); + } + }, + src, dst); + break; + } + case DataType::S32: + { + /* Up-conversion U8 -> S32 */ + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x); + + const int16x8x2_t texels = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))}}; + + vst1q_s32(dst_ptr + x, vmovl_s16(vget_low_s16(texels.val[0]))); + vst1q_s32(dst_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0]))); + vst1q_s32(dst_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1]))); + vst1q_s32(dst_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1]))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<uint32_t>(*(src_ptr + x)); + } + }, + src, dst); + break; + } + case DataType::F32: + { + /* Up-conversion U8 -> F32 */ + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr()); + const auto 
dst_ptr = reinterpret_cast<float *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x); + + const int16x8x2_t texels = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))}}; + vst1q_f32(dst_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0])))); + vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0])))); + vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1])))); + vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1])))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<uint32_t>(*(src_ptr + x)); + } + }, + src, dst); + break; + } + case DataType::F16: + { + /* Up-conversion U8 -> FP16 */ + ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr); + uk->ukernel(_src, _dst, info, _policy, window); + break; + } + case DataType::U16: + { + /* Up-conversion U8 -> U16 */ + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<uint16_t *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x); + + const uint16x8x2_t texels = { + {vmovl_u8(vget_low_u8(texels_u8)), vmovl_u8(vget_high_u8(texels_u8))}}; + + vst1q_u16(dst_ptr + x, texels.val[0]); + vst1q_u16(dst_ptr + x + 8, texels.val[1]); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<uint16_t>(*(src_ptr + x)); + } + }, + src, dst); + break; + } + default: + ARM_COMPUTE_ERROR("dst data type not supported"); + } + break; + } + case DataType::S16: + { + switch (_dst->info()->data_type()) + { + case DataType::QASYMM8_SIGNED: + { + /* Down-conversion S16 -> QASYMM8_SIGNED */ + if (ConvertPolicy::SATURATE == _policy) + { + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int16x8x2_t texels = {{vld1q_s16(src_ptr + x), vld1q_s16(src_ptr + x + 8)}}; + + vst1q_s8(dst_ptr + x, + vcombine_s8(vqmovn_s16(texels.val[0]), vqmovn_s16(texels.val[1]))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x)); + } + }, + src, dst); + } + else + { + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int16x8x2_t texels = {{vld1q_s16(src_ptr + x), vld1q_s16(src_ptr + x + 8)}}; + + vst1q_s8(dst_ptr + x, + vcombine_s8(vmovn_s16(texels.val[0]), vmovn_s16(texels.val[1]))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<int8_t>(*(src_ptr + x)); + } + }, + src, dst); + } + break; + } + case DataType::U8: + { + /* Down-conversion S16 -> U8 */ + if (ConvertPolicy::SATURATE == _policy) + { + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto src_ptr = 
reinterpret_cast<const int16_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int16x8x2_t texels = {{vld1q_s16(src_ptr + x), vld1q_s16(src_ptr + x + 8)}}; + + vst1q_u8(dst_ptr + x, + vcombine_u8(vqmovun_s16(texels.val[0]), vqmovun_s16(texels.val[1]))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x)); + } + }, + src, dst); + } + else + { + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int16x8x2_t texels = {{vld1q_s16(src_ptr + x), vld1q_s16(src_ptr + x + 8)}}; + + vst1q_u8(dst_ptr + x, vcombine_u8(vmovn_u16(vreinterpretq_u16_s16(texels.val[0])), + vmovn_u16(vreinterpretq_u16_s16(texels.val[1])))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x)); + } + }, + src, dst); + } + break; + } + case DataType::S32: + { + /* Up-conversion S16 -> S32 */ + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int16x8x2_t texels = {{vld1q_s16(src_ptr + x), vld1q_s16(src_ptr + x + 8)}}; + + const int32x4x4_t texels_s32 = { + {vmovl_s16(vget_low_s16(texels.val[0])), vmovl_s16(vget_high_s16(texels.val[0])), + vmovl_s16(vget_low_s16(texels.val[1])), vmovl_s16(vget_high_s16(texels.val[1]))}}; + + vst1q_s32(dst_ptr + x, texels_s32.val[0]); + vst1q_s32(dst_ptr + x + 4, texels_s32.val[1]); + vst1q_s32(dst_ptr + x + 8, texels_s32.val[2]); + vst1q_s32(dst_ptr + x + 12, texels_s32.val[3]); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x)); + } + }, + src, dst); + break; + } + default: + ARM_COMPUTE_ERROR("dst data type not supported"); + } + break; + } + + case DataType::U16: + { + switch (_dst->info()->data_type()) + { + case DataType::U8: + { + /* Down-conversion U16 -> U8 */ + if (ConvertPolicy::SATURATE == _policy) + { + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint16x8x2_t texels = {{vld1q_u16(src_ptr + x), vld1q_u16(src_ptr + x + 8)}}; + + vst1q_u8(dst_ptr + x, + vcombine_u8(vqmovn_u16(texels.val[0]), vqmovn_u16(texels.val[1]))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x)); + } + }, + src, dst); + } + else + { + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint16x8x2_t texels = {{vld1q_u16(src_ptr + x), vld1q_u16(src_ptr + x + 8)}}; + + 
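                            // Non-saturating (WRAP) narrowing: vmovn_u16 below keeps only the low
                            // 8 bits of each 16-bit lane, so e.g. 0x0134 becomes 0x34 instead of
                            // clamping to 0xFF as the saturating path above does.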
vst1q_u8(dst_ptr + x, + vcombine_u8(vmovn_u16(texels.val[0]), vmovn_u16(texels.val[1]))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x)); + } + }, + src, dst); + } + break; + } + case DataType::U32: + { + /* Up-conversion U16 -> U32 */ + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<uint32_t *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint16x8x2_t texels = {{vld1q_u16(src_ptr + x), vld1q_u16(src_ptr + x + 8)}}; + + vst1q_u32(dst_ptr + x, vmovl_u16(vget_low_u16(texels.val[0]))); + vst1q_u32(dst_ptr + x + 4, vmovl_u16(vget_high_u16(texels.val[0]))); + vst1q_u32(dst_ptr + x + 8, vmovl_u16(vget_low_u16(texels.val[1]))); + vst1q_u32(dst_ptr + x + 12, vmovl_u16(vget_high_u16(texels.val[1]))); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<uint32_t>(*(src_ptr + x)); + } + }, + src, dst); + break; + } + default: + ARM_COMPUTE_ERROR("dst data type not supported"); + } + break; + } + case DataType::F16: + { + /* conversion F16 -> any data type */ + ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr); + uk->ukernel(_src, _dst, info, _policy, window); + break; + } + case DataType::F32: + switch (_dst->info()->data_type()) + { + case DataType::F16: + { + /* Down-conversion F32 -> F16 */ + ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr); + uk->ukernel(_src, _dst, info, _policy, window); + break; + } + case DataType::S32: + { + /* Conversion F32 -> S32 */ + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto src_ptr = reinterpret_cast<const float *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const float32x4x4_t texels = {{ + vld1q_f32(src_ptr + x), + vld1q_f32(src_ptr + x + 4), + vld1q_f32(src_ptr + x + 8), + vld1q_f32(src_ptr + x + 12), + }}; + + vst1q_s32(dst_ptr + x, vcvtq_s32_f32(texels.val[0])); + vst1q_s32(dst_ptr + x + 4, vcvtq_s32_f32(texels.val[1])); + vst1q_s32(dst_ptr + x + 8, vcvtq_s32_f32(texels.val[2])); + vst1q_s32(dst_ptr + x + 12, vcvtq_s32_f32(texels.val[3])); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x)); + } + }, + src, dst); + break; + } + case DataType::QASYMM8: + case DataType::U8: + { + /* Down-conversion F32 -> U8 */ + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto src_ptr = reinterpret_cast<const float *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const float32x4x4_t texels = {{ + vld1q_f32(src_ptr + x), + vld1q_f32(src_ptr + x + 4), + vld1q_f32(src_ptr + x + 8), + vld1q_f32(src_ptr + x + 12), + }}; + + vst1_u8(dst_ptr + x, + vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[0])), + vqmovun_s32(vcvtq_s32_f32(texels.val[1]))))); + vst1_u8(dst_ptr + x + 8, + vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[2])), + vqmovun_s32(vcvtq_s32_f32(texels.val[3]))))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x)); + } + }, + src, dst); + break; + } + case 
DataType::QASYMM8_SIGNED: + { + /* Down-conversion F32 -> QASYMM8_SIGNED */ + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto src_ptr = reinterpret_cast<const float *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const float32x4x4_t texels = {{ + vld1q_f32(src_ptr + x), + vld1q_f32(src_ptr + x + 4), + vld1q_f32(src_ptr + x + 8), + vld1q_f32(src_ptr + x + 12), + }}; + + vst1_s8(dst_ptr + x, + vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[0])), + vqmovn_s32(vcvtq_s32_f32(texels.val[1]))))); + vst1_s8(dst_ptr + x + 8, + vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[2])), + vqmovn_s32(vcvtq_s32_f32(texels.val[3]))))); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x)); + } + }, + src, dst); + break; + } + + default: + ARM_COMPUTE_ERROR("dst data type not supported"); + } + break; + case DataType::S32: + switch (_dst->info()->data_type()) + { +#if __aarch64__ + case DataType::S64: + { + convert64<int32_t, int64_t>(src, dst, win, window_start_x, window_end_x, window_step_x); + break; + } +#endif // __aarch64__ + case DataType::F16: + { + /* Down-conversion S32 -> F16 */ + ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr); + uk->ukernel(_src, _dst, info, _policy, window); + break; + } + case DataType::F32: + { + /* Conversion S32 -> F32 */ + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<float *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int32x4x4_t texels = {{ + vld1q_s32(src_ptr + x), + vld1q_s32(src_ptr + x + 4), + vld1q_s32(src_ptr + x + 8), + vld1q_s32(src_ptr + x + 12), + }}; + + vst1q_f32(dst_ptr + x, vcvtq_f32_s32(texels.val[0])); + vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(texels.val[1])); + vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(texels.val[2])); + vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(texels.val[3])); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<float>(*(src_ptr + x)); + } + }, + src, dst); + break; + } + case DataType::QASYMM8_SIGNED: + { + /* Down-conversion S32 -> QASYMM8_SIGNED */ + if (ConvertPolicy::SATURATE == _policy) + { + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int32x4x4_t texels = {{ + vld1q_s32(src_ptr + x), + vld1q_s32(src_ptr + x + 4), + vld1q_s32(src_ptr + x + 8), + vld1q_s32(src_ptr + x + 12), + }}; + vst1_s8(dst_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[0]), + vqmovn_s32(texels.val[1])))); + vst1_s8(dst_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[2]), + vqmovn_s32(texels.val[3])))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x)); + } + }, + src, dst); + } + else + { + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr()); + + int x = 
window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int32x4x4_t texels = {{vld1q_s32(src_ptr + x), vld1q_s32(src_ptr + x + 4), + vld1q_s32(src_ptr + x + 8), + vld1q_s32(src_ptr + x + 12)}}; + + vst1_s8(dst_ptr + x, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[0]), + vmovn_s32(texels.val[1])))); + vst1_s8(dst_ptr + x + 8, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[2]), + vmovn_s32(texels.val[3])))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<int8_t>(*(src_ptr + x)); + } + }, + src, dst); + } + break; + } + case DataType::QASYMM8: + case DataType::U8: + { + /* Down-conversion S32 -> U8 */ + if (ConvertPolicy::SATURATE == _policy) + { + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int32x4x4_t texels = {{vld1q_s32(src_ptr + x), vld1q_s32(src_ptr + x + 4), + vld1q_s32(src_ptr + x + 8), + vld1q_s32(src_ptr + x + 12)}}; + vst1_u8(dst_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[0]), + vqmovun_s32(texels.val[1])))); + vst1_u8(dst_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[2]), + vqmovun_s32(texels.val[3])))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x)); + } + }, + src, dst); + } + else + { + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int32x4x4_t texels = {{vld1q_s32(src_ptr + x), vld1q_s32(src_ptr + x + 4), + vld1q_s32(src_ptr + x + 8), + vld1q_s32(src_ptr + x + 12)}}; + + vst1_u8(dst_ptr + x, + vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[0])), + vmovn_u32(vreinterpretq_u32_s32(texels.val[1]))))); + vst1_u8(dst_ptr + x + 8, + vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[2])), + vmovn_u32(vreinterpretq_u32_s32(texels.val[3]))))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x)); + } + }, + src, dst); + } + break; + } + default: + ARM_COMPUTE_ERROR("dst data type not supported"); + } + break; + default: + ARM_COMPUTE_ERROR("Not supported"); + } +} + +const char *CpuCastKernel::name() const +{ + return "CpuCastKernel.cpp"; +} + +const std::vector<CpuCastKernel::CastKernel> &CpuCastKernel::get_available_kernels() +{ + return available_kernels; +} + +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuCastKernel.h b/src/cpu/kernels/CpuCastKernel.h new file mode 100644 index 0000000000..ddbfe1f034 --- /dev/null +++ b/src/cpu/kernels/CpuCastKernel.h @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2016-2023 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_CPU_KERNELS_CPUCASTKERNEL_H +#define ACL_SRC_CPU_KERNELS_CPUCASTKERNEL_H + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Casts a given tensor to a new type + * + * @note When casting between quantized types the scale and zeroPoint are ignored + */ +class CpuCastKernel : public ICpuKernel<CpuCastKernel> +{ +private: + using CastKernelPtr = + std::add_pointer<void(const ITensor *, ITensor *, const ThreadInfo &, ConvertPolicy, const Window &)>::type; + +public: + CpuCastKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuCastKernel); + /** Set the src and dst of the kernel + * + * Valid conversions src -> dst : + * + * - QASYMM8_SIGNED -> S16, S32, F32, F16 + * - QASYMM8 -> U16, S16, S32, F32, F16 + * - U8 -> U16, S16, S32, F32, F16 + * - U16 -> U8, U32 + * - S16 -> QASYMM8_SIGNED, U8, S32 + * - F16 -> QASYMM8_SIGNED, QASYMM8, F32, S32, U8 + * - S32 -> QASYMM8_SIGNED, QASYMM8, F16, F32, U8 + * - S64 -> F32 + * - F32 -> QASYMM8_SIGNED, QASYMM8, F16, S32, U8 + * + * @param[in] src The src tensor to convert. Data types supported: QASYMM8_SIGNED/QASYMM8/U8/U16/S16/S32/S64/F16/F32. + * @param[out] dst The dst tensor. Data types supported: QASYMM8_SIGNED/QASYMM8/U8/U16/S16/U32/S32/S64/F16/F32. + * @param[in] policy Conversion policy. 
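     *
     * A minimal usage sketch (shape and data types are illustrative; includes and namespace
     * qualification are omitted, and the Status is assumed to be checked via its usual
     * error_code() helper):
     * @code
     * TensorInfo src(TensorShape(256U), 1, DataType::U8);
     * TensorInfo dst(TensorShape(256U), 1, DataType::F32);
     *
     * CpuCastKernel cast;
     * if (CpuCastKernel::validate(&src, &dst, ConvertPolicy::SATURATE).error_code() == ErrorCode::OK)
     * {
     *     cast.configure(&src, &dst, ConvertPolicy::SATURATE);
     * }
     * @endcode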
+ * + * @note S64 is only supported in aarch64 + * + */ + void configure(const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuCastKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + + struct CastKernel + { + const char *name; + const CastDataTypeISASelectorDataPtr is_selected; + CastKernelPtr ukernel; + }; + + static const std::vector<CastKernel> &get_available_kernels(); + +private: + ConvertPolicy _policy{ConvertPolicy::SATURATE}; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif // ACL_SRC_CPU_KERNELS_CPUCASTKERNEL_H diff --git a/src/cpu/kernels/CpuCol2ImKernel.cpp b/src/cpu/kernels/CpuCol2ImKernel.cpp new file mode 100644 index 0000000000..a52a1f58ea --- /dev/null +++ b/src/cpu/kernels/CpuCol2ImKernel.cpp @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuCol2ImKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Size2D.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" + +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +namespace arm_compute +{ +using namespace misc::shape_calculator; +namespace cpu +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &convolved_dims) +{ + //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use CPU FP16 instructions. 
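    // Example of the col2im mapping performed by this kernel (dimensions are illustrative):
    // with convolved_dims = (3, 2), the input element at (x, y) = (c, 4) is written to the
    // output location (width, height, channel) = (1, 1, c), since 4 % 3 = 1 and 4 / 3 = 1.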
+ ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); + + // Validate configured output + if (dst->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), + compute_col2im_shape(*src, convolved_dims, false)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); + } + + return Status{}; +} +} // namespace + +void CpuCol2ImKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const Size2D &convolved_dims) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, convolved_dims)); + + _convolved_dims = convolved_dims; + + // Configure kernel window + // Output auto inizialitation if not yet initialized + auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_col2im_shape(*src, convolved_dims, false))); + + // Configure kernel window + Window win = calculate_max_window(*src, Steps()); + + ICpuKernel::configure(win); +} + +Status CpuCol2ImKernel::validate(const ITensorInfo *src, const ITensorInfo *output, const Size2D &convolved_dims) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, output, convolved_dims)); + return Status{}; +} + +void CpuCol2ImKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + + auto src = tensors.get_const_tensor(TensorType::ACL_SRC); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + const uint8_t el_size = src->info()->element_size(); + const int output_stride_x = dst->info()->strides_in_bytes().x(); + const int output_stride_y = dst->info()->strides_in_bytes().y(); + const int output_stride_z = dst->info()->strides_in_bytes().z(); + + Window window_out(window); + window_out.set(Window::DimX, Window::Dimension(0, 0, 0)); + window_out.set(Window::DimY, Window::Dimension(0, 0, 0)); + window_out.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + // Create iterators + Iterator in(src, window); + Iterator out(dst, window_out); + + execute_window_loop( + window, + [&](const Coordinates &id) + { + const int hidx = id.y(); + const int idx = id.x() * output_stride_z + (hidx / _convolved_dims.width) * output_stride_y + + (hidx % _convolved_dims.width) * output_stride_x; + std::memcpy(out.ptr() + idx, in.ptr(), el_size); + }, + in, out); +} + +const char *CpuCol2ImKernel::name() const +{ + return "CpuCol2ImKernel"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuCol2ImKernel.h b/src/cpu/kernels/CpuCol2ImKernel.h new file mode 100644 index 0000000000..3e394ac914 --- /dev/null +++ b/src/cpu/kernels/CpuCol2ImKernel.h @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2017-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_COL2IM_KERNEL_H +#define ARM_COMPUTE_CPU_COL2IM_KERNEL_H + +#include "arm_compute/core/Size2D.h" + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Kernel to perform col2im reshaping. + * + * Rearranges each matrix column into image blocks. It's the inverse operation of @ref CpuIm2ColKernel. + * + * For example, a vector of 9 elements can be reshaped to a block(image) of 3x3: + * + * @f[ + * \left( \begin{array}{ccccccccc} + * a0 & a1 & a2 & a3 & a4 & a5 & a6 & a7 & a8 \\ + * \end{array} \right) + * \rightarrow + * \left( \begin{array}{ccc} + * a0 & a1 & a2 \\ + * a3 & a4 & a5 \\ + * a6 & a7 & a8 \\ + * \end{array} \right) + * @f] + */ +class CpuCol2ImKernel : public ICpuKernel<CpuCol2ImKernel> +{ +public: + /** Default constructor */ + CpuCol2ImKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuCol2ImKernel); + /** Set the input and output of the kernel. + * + * @param[in] src The input tensor info to convert. Data types supported: All + * @param[out] dst The output tensor info. 3 lower dimensions represent a single output [width, height, OFM], + * while the rest represent batch of outputs. Data types supported: Same as @p input + * @param[in] convolved_dims Output convolved dimensions. + */ + void configure(const ITensorInfo *src, ITensorInfo *dst, const Size2D &convolved_dims); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuCol2ImKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &convolved_dims); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + +private: + Size2D _convolved_dims{}; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /*ARM_COMPUTE_CPU_COL2IM_KERNEL_H */ diff --git a/src/cpu/kernels/CpuConcatenateBatchKernel.cpp b/src/cpu/kernels/CpuConcatenateBatchKernel.cpp new file mode 100644 index 0000000000..8c290173e8 --- /dev/null +++ b/src/cpu/kernels/CpuConcatenateBatchKernel.cpp @@ -0,0 +1,219 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuConcatenateBatchKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/wrapper/wrapper.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +template <typename T> +void batch_concat(const ITensor *src, ITensor *dst, unsigned int batch_offset, const Window &window) +{ + // Offset src + uint8_t *src_ptr = src->buffer() + src->info()->offset_first_element_in_bytes(); + + // Offset dst + uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() + + batch_offset * dst->info()->strides_in_bytes()[3]; + + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const int window_step_x = 16 / dst->info()->element_size(); + + Window win{window}; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + win.set(3, Window::Dimension(0, src->info()->tensor_shape()[3], 1)); + + Iterator src_it(src, win); + Iterator dst_it(dst, win); + + const DataType dt = src->info()->data_type(); + const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform(); + const UniformQuantizationInfo dst_qinfo = dst->info()->quantization_info().uniform(); + if (dt == DataType::QASYMM8 && src_qinfo != dst_qinfo) + { + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto in_ptr = reinterpret_cast<const uint8_t *>(src_ptr + src_it.offset()); + const auto out_ptr = reinterpret_cast<uint8_t *>(dst_ptr + dst_it.offset()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + wrapper::vstore(out_ptr, vquantize(vdequantize(wrapper::vloadq(in_ptr), src_qinfo), dst_qinfo)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(out_ptr + x) = quantize_qasymm8(dequantize_qasymm8(*(in_ptr + x), src_qinfo), dst_qinfo); + } + }, + src_it, dst_it); + } + else if (dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo) + { + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto in_ptr = reinterpret_cast<const int8_t *>(src_ptr + src_it.offset()); + const auto out_ptr = reinterpret_cast<int8_t *>(dst_ptr + dst_it.offset()); + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + wrapper::vstore(out_ptr, + vquantize_signed(vdequantize(wrapper::vloadq(in_ptr), src_qinfo), dst_qinfo)); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(out_ptr + x) = + quantize_qasymm8_signed(dequantize_qasymm8_signed(*(in_ptr + x), src_qinfo), dst_qinfo); + } + }, + src_it, dst_it); + } + else + { + execute_window_loop( + win, + [&](const 
Coordinates &) + { + const auto in_ptr = reinterpret_cast<const T *>(src_ptr + src_it.offset()); + const auto out_ptr = reinterpret_cast<T *>(dst_ptr + dst_it.offset()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(out_ptr + x) = *(in_ptr + x); + } + }, + src_it, dst_it); + } +} + +Status validate_arguments(const ITensorInfo *src, unsigned int batch_offset, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions. + ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + + ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimX) != dst->dimension(Window::DimX)); + ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimY) != dst->dimension(Window::DimY)); + ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimZ) != dst->dimension(Window::DimZ)); + ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(3) + batch_offset > dst->dimension(3)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(4, src, dst); + + return Status{}; +} +} // namespace + +void CpuConcatenateBatchKernel::configure(const ITensorInfo *src, unsigned int batch_offset, ITensorInfo *dst) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, batch_offset, dst)); + + _func = nullptr; + _batch_offset = batch_offset; + + switch (src->data_type()) + { + case DataType::S8: + case DataType::U8: + case DataType::QASYMM8: + case DataType::QASYMM8_SIGNED: + _func = &batch_concat<uint8_t>; + break; + case DataType::S16: + case DataType::U16: + case DataType::F16: + _func = &batch_concat<uint16_t>; + break; + case DataType::S32: + case DataType::U32: + case DataType::F32: + _func = &batch_concat<uint32_t>; + break; + default: + ARM_COMPUTE_ERROR("Unsupported data type."); + } + + // Configure kernel window + Window win = calculate_max_window(*dst, Steps()); + ICpuKernel::configure(win); +} + +Status CpuConcatenateBatchKernel::validate(const arm_compute::ITensorInfo *src, + unsigned int batch_offset, + const arm_compute::ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, batch_offset, dst)); + return Status{}; +} + +void CpuConcatenateBatchKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_func == nullptr); + + (*_func)(tensors.get_const_tensor(TensorType::ACL_SRC), tensors.get_tensor(TensorType::ACL_DST), _batch_offset, + window); +} + +const char *CpuConcatenateBatchKernel::name() const +{ + return "CpuConcatenateBatchKernel"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuConcatenateBatchKernel.h b/src/cpu/kernels/CpuConcatenateBatchKernel.h new file mode 100644 index 0000000000..52ea553a7d --- /dev/null +++ b/src/cpu/kernels/CpuConcatenateBatchKernel.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2019-2022 Arm Limited. 
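When the source and destination QASYMM8/QASYMM8_SIGNED quantization infos differ, the branches above dequantize with the source parameters and requantize with the destination parameters, so the concatenated values keep representing the same real numbers. A scalar sketch of that per-element requantization (the helper name and rounding mode are illustrative assumptions, not ACL's API):

// Illustrative per-element requantization, done vectorized by the quantized branches above.
#include <algorithm>
#include <cmath>
#include <cstdint>

uint8_t requantize_qasymm8(uint8_t q, float src_scale, int src_offset, float dst_scale, int dst_offset)
{
    const float real        = src_scale * (static_cast<int>(q) - src_offset);  // dequantize
    const long  requantized = dst_offset + std::lround(real / dst_scale);      // quantize
    return static_cast<uint8_t>(std::clamp<long>(requantized, 0, 255));        // saturate to uint8
}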
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_CONCATENATE_BATCH_KERNEL_H +#define ARM_COMPUTE_CPU_CONCATENATE_BATCH_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Interface for the batch concatenate kernel. + * The input tensor will be concatenated into the output tensor. + */ +class CpuConcatenateBatchKernel : public ICpuKernel<CpuConcatenateBatchKernel> +{ +public: + CpuConcatenateBatchKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuConcatenateBatchKernel); + /** Configure kernel for a given list of arguments + * + * @param[in] src Source tensor info. Data types supported: All. + * @param[in] batch_offset The offset on axis # 3. + * @param[in,out] dst Destination tensor info. Data types supported: Same as @p src. + */ + void configure(const ITensorInfo *src, unsigned int batch_offset, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuConcatenateBatchKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, unsigned int batch_offset, const ITensorInfo *dst); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + +private: + using BatchConcatFunction = void(const ITensor *, ITensor *, unsigned int, const Window &); + +private: + BatchConcatFunction *_func{nullptr}; + unsigned int _batch_offset{0}; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_CONCATENATE_BATCH_KERNEL_H */ diff --git a/src/cpu/kernels/CpuConcatenateDepthKernel.cpp b/src/cpu/kernels/CpuConcatenateDepthKernel.cpp new file mode 100644 index 0000000000..c75e1e4477 --- /dev/null +++ b/src/cpu/kernels/CpuConcatenateDepthKernel.cpp @@ -0,0 +1,217 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuConcatenateDepthKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/NEFixedPoint.h" +#include "src/core/NEON/wrapper/wrapper.h" + +#include <cstdint> + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +template <typename T> +void depth_concat(const ITensor *src, ITensor *dst, unsigned int depth_offset, const Window &window) +{ + // Offset source + uint8_t *src_ptr = src->buffer() + src->info()->offset_first_element_in_bytes(); + + // Offset destination + uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() + + depth_offset * dst->info()->strides_in_bytes()[2]; + + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const int window_step_x = 16 / dst->info()->element_size(); + + Window win{window}; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + win.set(Window::DimZ, Window::Dimension(0, src->info()->tensor_shape().z(), 1)); + + Iterator src_it(src, win); + Iterator dst_it(dst, win); + + const DataType dt = src->info()->data_type(); + const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform(); + const UniformQuantizationInfo dst_qinfo = dst->info()->quantization_info().uniform(); + if (dt == DataType::QASYMM8 && src_qinfo != dst_qinfo) + { + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto in_ptr = reinterpret_cast<const uint8_t *>(src_ptr + src_it.offset()); + const auto out_ptr = reinterpret_cast<uint8_t *>(dst_ptr + dst_it.offset()); + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + wrapper::vstore(out_ptr + x, + vquantize(vdequantize(wrapper::vloadq(in_ptr + x), src_qinfo), dst_qinfo)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(out_ptr + x) = quantize_qasymm8(dequantize_qasymm8(*(in_ptr + x), src_qinfo), dst_qinfo); + } + }, + src_it, dst_it); + } + else if (dt == DataType::QASYMM8_SIGNED 
&& src_qinfo != dst_qinfo) + { + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto in_ptr = reinterpret_cast<const int8_t *>(src_ptr + src_it.offset()); + const auto out_ptr = reinterpret_cast<int8_t *>(dst_ptr + dst_it.offset()); + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + wrapper::vstore(out_ptr + x, + vquantize_signed(vdequantize(wrapper::vloadq(in_ptr + x), src_qinfo), dst_qinfo)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(out_ptr + x) = + quantize_qasymm8_signed(dequantize_qasymm8_signed(*(in_ptr + x), src_qinfo), dst_qinfo); + } + }, + src_it, dst_it); + } + else + { + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto in_ptr = reinterpret_cast<const T *>(src_ptr + src_it.offset()); + const auto out_ptr = reinterpret_cast<T *>(dst_ptr + dst_it.offset()); + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x)); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(out_ptr + x) = *(in_ptr + x); + } + }, + src_it, dst_it); + } +} + +Status validate_arguments(const ITensorInfo *input, unsigned int depth_offset, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use CPU FP16 instructions. + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(Window::DimX) != output->dimension(Window::DimX)); + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(Window::DimY) != output->dimension(Window::DimY)); + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(2) + depth_offset > output->dimension(2)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(3, input, output); + + return Status{}; +} +} // namespace + +void CpuConcatenateDepthKernel::configure(const ITensorInfo *src, unsigned int depth_offset, ITensorInfo *dst) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, depth_offset, dst)); + + _func = nullptr; + _depth_offset = depth_offset; + + switch (src->data_type()) + { + case DataType::QASYMM8: + _func = &depth_concat<uint8_t>; + break; + case DataType::QASYMM8_SIGNED: + _func = &depth_concat<int8_t>; + break; + case DataType::F16: + _func = &depth_concat<uint16_t>; + break; + case DataType::F32: + _func = &depth_concat<uint32_t>; + break; + default: + ARM_COMPUTE_ERROR("Unsupported data type."); + } + + // Configure kernel window + Window win = calculate_max_window(*dst, Steps()); + ICpuKernel::configure(win); +} + +Status CpuConcatenateDepthKernel::validate(const arm_compute::ITensorInfo *src, + unsigned int depth_offset, + const arm_compute::ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, depth_offset, dst)); + return Status{}; +} + +void CpuConcatenateDepthKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_func == nullptr); + + (*_func)(tensors.get_const_tensor(TensorType::ACL_SRC), tensors.get_tensor(TensorType::ACL_DST), 
_depth_offset, + window); +} + +const char *CpuConcatenateDepthKernel::name() const +{ + return "CpuConcatenateDepthKernel"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuConcatenateDepthKernel.h b/src/cpu/kernels/CpuConcatenateDepthKernel.h new file mode 100644 index 0000000000..54de9aff46 --- /dev/null +++ b/src/cpu/kernels/CpuConcatenateDepthKernel.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2017-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef ARM_COMPUTE_CPU_CONCATENATE_DEPTH_KERNEL_H +#define ARM_COMPUTE_CPU_CONCATENATE_DEPTH_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +// Forward declarations +class ITensor; + +namespace cpu +{ +namespace kernels +{ +/** Interface for the depth concatenate kernel. + * The input tensor will be concatenated into the output tensor. + */ +class CpuConcatenateDepthKernel : public ICpuKernel<CpuConcatenateDepthKernel> +{ +public: + CpuConcatenateDepthKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuConcatenateDepthKernel); + /** Configure kernel for a given list of arguments + * + * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[in] depth_offset The offset on the Z axis. + * @param[in,out] dst Destination tensor info. Data types supported: Same as @p src. + * + * @note: The output tensor's low two dimensions can't be smaller than the input one's. + * @note: The gaps between the two lowest dimensions of input and output need to be divisible by 2. 
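A caller that concatenates several tensors along the channel axis typically runs one instance of this kernel per input, where each depth_offset is the running sum of the channel counts of the inputs placed before it. A small sketch of that bookkeeping (illustrative only; the actual concatenation operators live elsewhere in the library):

// Illustrative: derive per-input depth offsets for channel-wise concatenation.
#include <cstddef>
#include <vector>

std::vector<unsigned int> make_depth_offsets(const std::vector<std::size_t> &channels_per_input)
{
    std::vector<unsigned int> offsets;
    unsigned int acc = 0;
    for (std::size_t channels : channels_per_input)
    {
        offsets.push_back(acc); // this input starts where the previous ones end
        acc += static_cast<unsigned int>(channels);
    }
    // e.g. inputs with {3, 5, 4} channels -> offsets {0, 3, 8}; dst needs at least 12 channels.
    return offsets;
}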
+ * + */ + void configure(const ITensorInfo *src, unsigned int depth_offset, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuConcatenateDepthKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, unsigned int depth_offset, const ITensorInfo *dst); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + +private: + using DepthConcatFunction = void(const ITensor *, ITensor *, unsigned int, const Window &); + +private: + DepthConcatFunction *_func{nullptr}; + unsigned int _depth_offset{0}; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_CONCATENATE_DEPTH_KERNEL_H */ diff --git a/src/cpu/kernels/CpuConcatenateHeightKernel.cpp b/src/cpu/kernels/CpuConcatenateHeightKernel.cpp new file mode 100644 index 0000000000..b6c11d948b --- /dev/null +++ b/src/cpu/kernels/CpuConcatenateHeightKernel.cpp @@ -0,0 +1,190 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuConcatenateHeightKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/wrapper/wrapper.h" + +#include <cstdint> + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *src, unsigned int height_offset, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions. 
+ ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimX) != dst->dimension(Window::DimX)); + ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimY) + height_offset > dst->dimension(Window::DimY)); + for (size_t i = 2; i < Coordinates::num_max_dimensions; ++i) + { + ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(i) != dst->dimension(i)); + } + + return Status{}; +} +} // namespace + +void CpuConcatenateHeightKernel::configure(const ITensorInfo *src, unsigned int height_offset, ITensorInfo *dst) +{ + ARM_COMPUTE_UNUSED(src); + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, height_offset, dst)); + + _height_offset = height_offset; + + // Configure kernel window + Window win = calculate_max_window(*dst, Steps()); + ICpuKernel::configure(win); +} + +Status CpuConcatenateHeightKernel::validate(const ITensorInfo *src, unsigned int height_offset, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, height_offset, dst)); + return Status{}; +} + +void CpuConcatenateHeightKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + + const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + // Offset destination pointer to the correct position + uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() + + _height_offset * dst->info()->strides_in_bytes()[Window::DimY]; + + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()) * static_cast<int>(dst->info()->element_size()); + const int window_step_x = 16; + + Window win{window}; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + win.set(Window::DimY, Window::Dimension(0, src->info()->tensor_shape().y(), 1)); + + // Create iterators + Iterator src_it(src, win); + Iterator dst_it(dst, win); + + const DataType dt = src->info()->data_type(); + const UniformQuantizationInfo &src_qinfo = src->info()->quantization_info().uniform(); + const UniformQuantizationInfo &dst_qinfo = dst->info()->quantization_info().uniform(); + if (dt == DataType::QASYMM8 && src_qinfo != dst_qinfo) + { + execute_window_loop( + win, + [&](const Coordinates &) + { + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + vst1q_u8(dst_ptr + dst_it.offset() + x, + vquantize(vdequantize(vld1q_u8(src_it.ptr() + x), src_qinfo), dst_qinfo)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + dst_it.offset() + x) = + quantize_qasymm8(dequantize_qasymm8(*(src_it.ptr() + x), src_qinfo), dst_qinfo); + } + }, + src_it, dst_it); + } + else if (dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo) + { + execute_window_loop( + win, + [&](const Coordinates &) + { + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + vst1q_s8( + reinterpret_cast<int8_t *>(dst_ptr + dst_it.offset() + x), + vquantize_signed(vdequantize(vld1q_s8(reinterpret_cast<int8_t *>(src_it.ptr()) + x), src_qinfo), + dst_qinfo)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + dst_it.offset() + x) = + 
quantize_qasymm8_signed(dequantize_qasymm8_signed(*(src_it.ptr() + x), src_qinfo), dst_qinfo); + } + }, + src_it, dst_it); + } + else + { + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto in_ptr = src_it.ptr(); + const auto out_ptr = dst_ptr + dst_it.offset(); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(out_ptr + x) = *(in_ptr + x); + } + }, + src_it, dst_it); + } +} + +const char *CpuConcatenateHeightKernel::name() const +{ + return "CpuConcatenateHeightKernel"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuConcatenateHeightKernel.h b/src/cpu/kernels/CpuConcatenateHeightKernel.h new file mode 100644 index 0000000000..df880c4878 --- /dev/null +++ b/src/cpu/kernels/CpuConcatenateHeightKernel.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2019-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_CONCATENATE_HEIGHT_KERNEL_H +#define ARM_COMPUTE_CPU_CONCATENATE_HEIGHT_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Interface for the height concatenate kernel. + * The source tensor will be concatenated into the destination tensor. + */ +class CpuConcatenateHeightKernel : public ICpuKernel<CpuConcatenateHeightKernel> +{ +public: + CpuConcatenateHeightKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuConcatenateHeightKernel); + /** Configure kernel for a given list of arguments + * + * @param[in] src Source tensor info. Data types supported: All + * @param[in] height_offset The starting offset on the Y axis for the output tensor. + * @param[in,out] dst Destination tensor info. Data types supported: Same as @p src. 
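In the generic branch above, the X loop bound is pre-multiplied by the element size, so the copy walks raw bytes and a single code path serves every supported data type. The equivalent scalar logic, sketched without ACL's iterators (names are illustrative):

// Illustrative byte-wise row copy: 16-byte chunks first, then the tail.
#include <cstdint>
#include <cstring>

void copy_row_bytes(const uint8_t *src_row, uint8_t *dst_row, int row_elements, int element_size)
{
    const int row_bytes = row_elements * element_size;
    int x = 0;
    for (; x <= row_bytes - 16; x += 16)
    {
        std::memcpy(dst_row + x, src_row + x, 16); // corresponds to a 128-bit load/store in the kernel
    }
    std::memcpy(dst_row + x, src_row + x, static_cast<std::size_t>(row_bytes - x)); // leftover bytes
}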
+ * + */ + void configure(const ITensorInfo *src, unsigned int height_offset, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuConcatenateHeightKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, unsigned int height_offset, const ITensorInfo *dst); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + +private: + unsigned int _height_offset{0}; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_CONCATENATE_HEIGHT_KERNEL_H */ diff --git a/src/cpu/kernels/CpuConcatenateWidthKernel.cpp b/src/cpu/kernels/CpuConcatenateWidthKernel.cpp new file mode 100644 index 0000000000..f6100cccca --- /dev/null +++ b/src/cpu/kernels/CpuConcatenateWidthKernel.cpp @@ -0,0 +1,181 @@ +/* + * Copyright (c) 2018-2021, 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuConcatenateWidthKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Steps.h" +#include "arm_compute/core/Validate.h" + +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NEAsymm.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *src, unsigned int width_offset, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions. 
+ ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) + width_offset > dst->dimension(0)); + + for (size_t i = 1; i < Coordinates::num_max_dimensions; ++i) + { + ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(i) != dst->dimension(i)); + } + + return Status{}; +} +} // namespace + +void CpuConcatenateWidthKernel::configure(const ITensorInfo *src, unsigned int width_offset, ITensorInfo *dst) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, width_offset, dst)); + ARM_COMPUTE_UNUSED(dst); + + _width_offset = width_offset; + + // Configure kernel window + Window win = calculate_max_window(*src, Steps()); + + ICpuKernel::configure(win); +} + +Status CpuConcatenateWidthKernel::validate(const ITensorInfo *src, unsigned int width_offset, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, width_offset, dst)); + return Status{}; +} + +void CpuConcatenateWidthKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + + const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + // Offset output pointer to the correct position + uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() + + _width_offset * dst->info()->strides_in_bytes()[0]; + + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()) * static_cast<int>(dst->info()->element_size()); + constexpr int window_step_x = 16; + + Window win{window}; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + // Create iterators + Iterator src_it(src, win); + Iterator dst_it(dst, win); + const DataType dt = src->info()->data_type(); + const UniformQuantizationInfo &src_qinfo = src->info()->quantization_info().uniform(); + const UniformQuantizationInfo &dst_qinfo = dst->info()->quantization_info().uniform(); + if (dt == DataType::QASYMM8 && src_qinfo != dst_qinfo) + { + execute_window_loop( + win, + [&](const Coordinates &) + { + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + vst1q_u8(dst_ptr + dst_it.offset() + x, + vquantize(vdequantize(vld1q_u8(src_it.ptr() + x), src_qinfo), dst_qinfo)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + dst_it.offset() + x) = + quantize_qasymm8(dequantize_qasymm8(*(src_it.ptr() + x), src_qinfo), dst_qinfo); + } + }, + src_it, dst_it); + } + else if (dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo) + { + execute_window_loop( + win, + [&](const Coordinates &) + { + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + vst1q_s8( + reinterpret_cast<int8_t *>(dst_ptr + dst_it.offset() + x), + vquantize_signed(vdequantize(vld1q_s8(reinterpret_cast<int8_t *>(src_it.ptr() + x)), src_qinfo), + dst_qinfo)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + dst_it.offset() + x) = + quantize_qasymm8_signed(dequantize_qasymm8_signed(*(src_it.ptr() + x), src_qinfo), dst_qinfo); + } + }, + src_it, dst_it); + } + else + { + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto in_ptr = src_it.ptr(); + const 
auto out_ptr = dst_ptr + dst_it.offset(); + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(out_ptr + x) = *(in_ptr + x); + } + }, + src_it, dst_it); + } +} + +const char *CpuConcatenateWidthKernel::name() const +{ + return "CpuConcatenateWidthKernel"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuConcatenateWidthKernel.h b/src/cpu/kernels/CpuConcatenateWidthKernel.h new file mode 100644 index 0000000000..560e44e35a --- /dev/null +++ b/src/cpu/kernels/CpuConcatenateWidthKernel.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2018-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef ARM_COMPUTE_CPU_CONCATENATE_WIDTH_KERNEL_H +#define ARM_COMPUTE_CPU_CONCATENATE_WIDTH_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Interface for the width concatenate kernel. + * The source tensor will be concatenated into the destination tensor. + */ +class CpuConcatenateWidthKernel : public ICpuKernel<CpuConcatenateWidthKernel> +{ +public: + CpuConcatenateWidthKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuConcatenateWidthKernel); + /** Configure kernel for a given list of arguments + * + * @param[in] src Source tensor info. Data types supported: All + * @param[in] width_offset The offset on the X axis. + * @param[in,out] dst Destination tensor info. Data types supported: Same as @p src. 
+ */ + void configure(const ITensorInfo *src, unsigned int width_offset, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuConcatenateWidthKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, unsigned int width_offset, const ITensorInfo *dst); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + +private: + unsigned int _width_offset{0}; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_CONCATENATE_WIDTH_KERNEL_H */ diff --git a/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp b/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp new file mode 100644 index 0000000000..87703ec631 --- /dev/null +++ b/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Types.h" + +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +void CpuConvertFullyConnectedWeightsKernel::configure(const ITensorInfo *src, + ITensorInfo *dst, + const TensorShape &original_input_shape, + DataLayout data_layout) + +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + + // Output tensor auto initialisation if not yet initialized + auto_init_if_empty(*dst, *src->clone()); + + ARM_COMPUTE_ERROR_THROW_ON( + CpuConvertFullyConnectedWeightsKernel::validate(src, dst, original_input_shape, data_layout)); + + const DataLayout input_data_layout = (data_layout == DataLayout::NCHW) ? 
DataLayout::NHWC : DataLayout::NCHW; + + const int width_idx = get_data_layout_dimension_index(input_data_layout, DataLayoutDimension::WIDTH); + const int height_idx = get_data_layout_dimension_index(input_data_layout, DataLayoutDimension::HEIGHT); + const int channel_idx = get_data_layout_dimension_index(input_data_layout, DataLayoutDimension::CHANNEL); + + const unsigned int num_elems_per_input_plane = original_input_shape[width_idx] * original_input_shape[height_idx]; + const unsigned int num_channels = original_input_shape[channel_idx]; + + _factor1 = (data_layout == DataLayout::NCHW) ? num_elems_per_input_plane : num_channels; + _factor2 = (data_layout == DataLayout::NCHW) ? num_channels : num_elems_per_input_plane; + + // Configure kernel window + Window win = calculate_max_window(*src, Steps()); + ICpuKernel::configure(win); +} + +Status CpuConvertFullyConnectedWeightsKernel::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const TensorShape &original_input_shape, + DataLayout data_layout) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); + ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); + ARM_COMPUTE_RETURN_ERROR_ON(src->num_dimensions() != 2); + ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(1) != original_input_shape.total_size_lower(3)); + ARM_COMPUTE_RETURN_ERROR_ON(data_layout == DataLayout::UNKNOWN); + + // Checks performed when dst is configured + if ((dst != nullptr) && (dst->total_size() != 0)) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); + } + + return Status{}; +} + +void CpuConvertFullyConnectedWeightsKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + + const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + const unsigned int dst_stride_x = dst->info()->strides_in_bytes().x(); + const unsigned int dst_stride_y = dst->info()->strides_in_bytes().y(); + const unsigned int element_size = src->info()->element_size(); + + Iterator input(src, window); + Iterator output(dst, window); + + execute_window_loop( + window, + [&](const Coordinates &id) + { + memcpy(output.ptr() + id.x() * dst_stride_x + + (id.y() % _factor1 * _factor2 + id.y() / _factor1) * dst_stride_y, + input.ptr(), element_size); + }, + input); +} + +const char *CpuConvertFullyConnectedWeightsKernel::name() const +{ + return "CpuConvertFullyConnectedWeightsKernel"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h b/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h new file mode 100644 index 0000000000..2253889e69 --- /dev/null +++ b/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2018-2022 Arm Limited. 
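The memcpy above relocates row y of the 2D weight matrix to row (y % _factor1) * _factor2 + y / _factor1. With weights trained against NCHW-flattened inputs (_factor1 = H*W, _factor2 = C), this moves the weight for input element (c, h, w) from row c*H*W + p to row p*C + c, i.e. the NHWC flattening of the same element. A scalar sketch of that row mapping (the function name is illustrative):

// Illustrative: destination row for weight row y when converting NCHW-trained
// fully connected weights for use with an NHWC-flattened input.
#include <cstddef>

std::size_t converted_row_nchw_to_nhwc(std::size_t y, std::size_t plane_elems /* H*W */, std::size_t channels /* C */)
{
    const std::size_t p = y % plane_elems; // spatial position within one input plane
    const std::size_t c = y / plane_elems; // channel of the original input element
    return p * channels + c;               // row index under NHWC flattening
}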
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_CONVERT_FULLYCONNECTED_WEIGHTS_KERNEL_H +#define ARM_COMPUTE_CPU_CONVERT_FULLYCONNECTED_WEIGHTS_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Interface to convert the 2D Fully Connected weights from NCHW to NHWC or vice versa. + * + * @note This function can be applied to the 2D weights used by a Fully Connected layer if: + * - It follows a Convolution layer + * - The data layout used by the network does not match the one the model has been trained in. + * + * @note This function assumes the weights are already reshaped (transposed) + */ +class CpuConvertFullyConnectedWeightsKernel : public ICpuKernel<CpuConvertFullyConnectedWeightsKernel> +{ +public: + CpuConvertFullyConnectedWeightsKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuConvertFullyConnectedWeightsKernel); + /** Set the src and dst tensor. + * + * @param[in] src Source weights tensor info to convert. Must be 2 dimensional. Data types supported: All. + * @param[in] dst The converted weights tensor info. Shape and Data Type: Same as @p src. + * @param[in] original_input_shape Shape of the original src tensor (the one entering fully connected layer). + * @param[in] data_layout The data layout the weights have been trained in. 
+ */ + void configure(const ITensorInfo *src, + ITensorInfo *dst, + const TensorShape &original_input_shape, + DataLayout data_layout); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuConvertFullyConnectedWeightsKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, + const ITensorInfo *dst, + const TensorShape &original_input_shape, + DataLayout data_layout); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + +private: + unsigned int _factor1{ + 0}; /* equals to the number of elements per original src plane if @p data_layout == NCHW; its number of channels otherwise */ + unsigned int _factor2{ + 0}; /* equals to the number of elements per original src plane if @p data_layout == NHWC; its number of channels otherwise */ +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_CONVERT_FULLYCONNECTED_WEIGHTS_KERNEL_H */ diff --git a/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.cpp b/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.cpp new file mode 100644 index 0000000000..745b1566c2 --- /dev/null +++ b/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.cpp @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/wrapper.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); + + // Validate output if initialized + if (dst->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(src->tensor_shape(), dst->tensor_shape()); + } + + return Status{}; +} + +std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src, ITensorInfo *dst) +{ + // Output auto inizialitation if not yet initialized + { + const bool is_input_signed = src->data_type() == DataType::QASYMM8_SIGNED; + const DataType dt = is_input_signed ? DataType::QASYMM8 : DataType::QASYMM8_SIGNED; + const UniformQuantizationInfo qinfo = src->quantization_info().uniform(); + const int offset_correction = is_input_signed ? -128 : 128; + const QuantizationInfo corrected_qinfo = QuantizationInfo(qinfo.scale, qinfo.offset + offset_correction); + + auto_init_if_empty(*dst, src->clone()->set_data_type(dt).set_quantization_info(corrected_qinfo)); + } + + return std::make_pair(Status{}, calculate_max_window(*dst)); +} +} // namespace + +void CpuConvertQuantizedSignednessKernel::configure(const ITensorInfo *src, ITensorInfo *dst) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst)); + + std::pair<Status, Window> win_config = validate_and_configure_window(src, dst); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICpuKernel::configure(win_config.second); +} + +Status CpuConvertQuantizedSignednessKernel::validate(const ITensorInfo *src, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst)); + return Status{}; +} + +void CpuConvertQuantizedSignednessKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + auto src = tensors.get_const_tensor(TensorType::ACL_SRC); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(src, win_collapsed); + Iterator output(dst, win_collapsed); + + const int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + const uint8_t mask = 128; + const auto vmask = wrapper::vdup_n(mask, wrapper::traits::vector_128_tag{}); + + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr()); + + // Compute S elements per iteration + int x = 
window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vin = wrapper::vloadq(input_ptr + x); + wrapper::vstore(output_ptr + x, wrapper::veor(vin, vmask)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const uint8_t in = *(reinterpret_cast<const uint8_t *>(input_ptr + x)); + *(output_ptr + x) = in ^ mask; + } + }, + input, output); +} + +const char *CpuConvertQuantizedSignednessKernel::name() const +{ + return "CpuConvertQuantizedSignednessKernel"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h b/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h new file mode 100644 index 0000000000..e94d3d5ef2 --- /dev/null +++ b/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2019-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_CONVERTQUANTIZEDSIGNEDNESS_KERNEL_H +#define ARM_COMPUTE_CPU_CONVERTQUANTIZEDSIGNEDNESS_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Kernel to convert asymmetric signed to asymmetric signed and vice-versa */ +class CpuConvertQuantizedSignednessKernel : public ICpuKernel<CpuConvertQuantizedSignednessKernel> +{ +public: + CpuConvertQuantizedSignednessKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuConvertQuantizedSignednessKernel); + /** Initialize the kernel input and output info. + * + * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED. + * @param[out] dst Destination tensor info. Data types supported: opposite of @p src. 
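The kernel above flips the top bit of every byte (XOR with 128) while the configuration step shifts the zero point by 128, which together flip QASYMM8 to QASYMM8_SIGNED (or back) without changing the real values being represented. A small stand-alone check of that identity (illustrative values, not ACL code):

// Illustrative: XOR-ing the top bit and moving the zero point by 128 preserve the real value.
#include <cassert>
#include <cstdint>

int main()
{
    const float scale    = 0.05f;
    const int   u_offset = 130;            // example QASYMM8 zero point
    const int   s_offset = u_offset - 128; // corresponding QASYMM8_SIGNED zero point

    for (int q = 0; q <= 255; ++q)
    {
        const uint8_t u = static_cast<uint8_t>(q);
        const int8_t  s = static_cast<int8_t>(u ^ 0x80); // same bit trick as run_op above
        assert(scale * (static_cast<int>(u) - u_offset) ==
               scale * (static_cast<int>(s) - s_offset)); // identical real value either way
    }
    return 0;
}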
+ */ + void configure(const ITensorInfo *src, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuConvertQuantizedSignednessKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /*ARM_COMPUTE_CPU_CONVERTQUANTIZEDSIGNEDNESS_KERNEL_H */ diff --git a/src/cpu/kernels/CpuCopyKernel.cpp b/src/cpu/kernels/CpuCopyKernel.cpp new file mode 100644 index 0000000000..1b693d7a3a --- /dev/null +++ b/src/cpu/kernels/CpuCopyKernel.cpp @@ -0,0 +1,175 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
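[Editor's aside, not part of the diff] The signedness-conversion kernel above relies on a simple identity: flipping the top bit of a QASYMM8 value (the veor with a 0x80 mask in run_op) yields the QASYMM8_SIGNED value that represents the same real number, provided the zero point is shifted by 128 while the scale stays unchanged. A minimal standalone check of that identity, with illustrative values that are not taken from the library:

#include <cassert>
#include <cstdint>

int main()
{
    const float scale     = 0.05f;
    const int   u8_offset = 130;             // hypothetical QASYMM8 zero point
    const int   s8_offset = u8_offset - 128; // corresponding QASYMM8_SIGNED zero point

    for (int q = 0; q < 256; ++q)
    {
        const uint8_t u = static_cast<uint8_t>(q);
        const int8_t  s = static_cast<int8_t>(u ^ 0x80); // the XOR the kernel performs

        const float real_from_u8 = scale * (static_cast<int>(u) - u8_offset);
        const float real_from_s8 = scale * (static_cast<int>(s) - s8_offset);
        assert(real_from_u8 == real_from_s8); // same real value under the shifted offset
    }
    return 0;
}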
+ */ +#include "src/cpu/kernels/CpuCopyKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const PaddingList &padding = PaddingList()) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); + ARM_COMPUTE_RETURN_ERROR_ON(padding.size() > 4); + + // Validate destination if initialized + if (dst->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS( + misc::shape_calculator::compute_padded_shape(src->tensor_shape(), padding), dst->tensor_shape()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + } + + return Status{}; +} + +std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src, ITensorInfo *dst) +{ + // Destination auto inizialitation if not yet initialized + auto_init_if_empty(*dst, *src); + return std::make_pair(Status{}, calculate_max_window(*dst)); +} + +std::pair<Status, Window> +validate_and_configure_window_with_padding(const ITensorInfo *src, ITensorInfo *dst, const PaddingList &padding) +{ + const TensorShape src_shape = src->tensor_shape(); + const TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(src_shape, padding); + auto_init_if_empty(*dst, src->clone()->set_tensor_shape(padded_shape)); + // Configure window + const Window win = calculate_max_window(*dst, dst->dimension(0)); + return std::make_pair(Status{}, win); +} + +} // namespace + +void CpuCopyKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const PaddingList &padding) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, padding)); + + _padding = padding; + + std::pair<Status, Window> win_config; + if (padding.empty()) + { + win_config = validate_and_configure_window(src, dst); + } + else + { + win_config = validate_and_configure_window_with_padding(src, dst, padding); + } + + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICpuKernel::configure(win_config.second); +} + +Status CpuCopyKernel::validate(const arm_compute::ITensorInfo *src, + const arm_compute::ITensorInfo *dst, + const PaddingList &padding) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, padding)); + + if (padding.empty()) + { + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get()).first); + } + else + { + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window_with_padding(src->clone().get(), dst->clone().get(), padding).first); + } + + return Status{}; +} + +void CpuCopyKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + + const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + if (_padding.empty()) + { + Window dst_window{window}; + dst_window.set(Window::DimX, + Window::Dimension(dst_window.x().start(), dst_window.x().end(), 
src->info()->dimension(0))); + Window out_slice = dst_window.first_slice_window_1D(); + do + { + Iterator src_it(src, out_slice); + Iterator dst_it(dst, out_slice); + + execute_window_loop( + out_slice, + [&](const Coordinates &) + { memcpy(dst_it.ptr(), src_it.ptr(), dst->info()->dimension(0) * dst->info()->element_size()); }, + src_it, dst_it); + } while (dst_window.slide_window_slice_1D(out_slice)); + } + else + { + Window src_window{window}; + src_window.set(Window::DimX, + Window::Dimension(0, window.x().end() - _padding[0].first, src->info()->dimension(0))); + + Iterator src_it(src, src_window); + Iterator dst_it(dst, window); + const size_t row_size_in_bytes = src->info()->dimension(0) * src->info()->element_size(); + execute_window_loop( + window, + [&](const Coordinates &) + { + auto dst_ptr = dst_it.ptr() + _padding[0].first * dst->info()->element_size(); + std::memcpy(dst_ptr, src_it.ptr(), row_size_in_bytes); + }, + src_it, dst_it); + } +} + +const char *CpuCopyKernel::name() const +{ + return "CpuCopyKernel"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuCopyKernel.h b/src/cpu/kernels/CpuCopyKernel.h new file mode 100644 index 0000000000..a05053f07e --- /dev/null +++ b/src/cpu/kernels/CpuCopyKernel.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2018-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_COPY_KERNEL_H +#define ARM_COMPUTE_CPU_COPY_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Kernel to perform a copy between two tensors */ +class CpuCopyKernel : public ICpuKernel<CpuCopyKernel> +{ +public: + CpuCopyKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuCopyKernel); + /** Configure kernel for a given list of arguments + * + * @param[in] src Source tensor. Data types supported: All + * @param[out] dst Destination tensor. Data types supported: same as @p src. 
+ * @param[in] padding (Optional) Padding to be applied to the input tensor + */ + void configure(const ITensorInfo *src, ITensorInfo *dst, const PaddingList &padding = PaddingList()); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuCopyKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PaddingList &padding = PaddingList()); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + +private: + PaddingList _padding{}; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_COPY_KERNEL_H */ diff --git a/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp b/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp new file mode 100644 index 0000000000..82e3a5ce00 --- /dev/null +++ b/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp @@ -0,0 +1,192 @@ +/* + * Copyright (c) 2019-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
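[Editor's aside, not part of the diff] The padded branch of CpuCopyKernel::run_op above copies one source row per window iteration into a destination row shifted right by the left padding, using a single memcpy of row_size_in_bytes. A rough standalone illustration of that row-wise copy, with the tensor and window machinery reduced to plain index arithmetic:

#include <cstdio>
#include <cstring>
#include <vector>

int main()
{
    const size_t src_w = 4, src_h = 2;
    const size_t pad_left = 2, pad_right = 1;
    const size_t dst_w = pad_left + src_w + pad_right;

    std::vector<float> src = {1, 2, 3, 4, 5, 6, 7, 8};
    std::vector<float> dst(dst_w * src_h, 0.0f); // padded destination, zero-filled

    // One memcpy per row, offset by the left padding; this mirrors the padded branch above.
    for (size_t y = 0; y < src_h; ++y)
    {
        std::memcpy(dst.data() + y * dst_w + pad_left,
                    src.data() + y * src_w,
                    src_w * sizeof(float));
    }

    for (size_t y = 0; y < src_h; ++y)
    {
        for (size_t x = 0; x < dst_w; ++x)
            std::printf("%g ", dst[y * dst_w + x]);
        std::printf("\n");
    }
    return 0;
}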
+ */ +#include "src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h" + +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/ITensorInfo.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" + +#include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/traits.h" +#include "src/cpu/kernels/depthwiseconv2d/list.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +static const std::vector<CpuDepthwiseConv2dNativeKernel::DepthwiseConv2dNativeKernel> available_kernels = { + {"neon_qu8_deptwiseconv2dnative", + [](const DepthwiseConv2dNativeDataTypeISASelectorData &data) { return (data.weights_dt == DataType::QASYMM8); }, + REGISTER_QASYMM8_NEON(neon_qu8_deptwiseconv2dnative)}, + {"neon_qs8_deptwiseconv2dnative", + [](const DepthwiseConv2dNativeDataTypeISASelectorData &data) + { return (data.weights_dt == DataType::QASYMM8_SIGNED); }, + REGISTER_QASYMM8_SIGNED_NEON(neon_qs8_deptwiseconv2dnative)}, + {"neon_fp16_deptwiseconv2dnative", + [](const DepthwiseConv2dNativeDataTypeISASelectorData &data) + { return (data.weights_dt == DataType::F16 && data.isa.fp16); }, + REGISTER_FP16_NEON(neon_fp16_deptwiseconv2dnative)}, + {"neon_fp32_deptwiseconv2dnative", + [](const DepthwiseConv2dNativeDataTypeISASelectorData &data) { return (data.weights_dt == DataType::F32); }, + REGISTER_FP32_NEON(neon_fp32_deptwiseconv2dnative)}, + {"neon_qp8_qu8_deptwiseconv2dnative", + [](const DepthwiseConv2dNativeDataTypeISASelectorData &data) + { return (data.weights_dt == DataType::QSYMM8_PER_CHANNEL && data.source_dt == DataType::QASYMM8); }, + REGISTER_QASYMM8_NEON(neon_qp8_qu8_deptwiseconv2dnative)}, + {"neon_qp8_qs8_deptwiseconv2dnative", + [](const DepthwiseConv2dNativeDataTypeISASelectorData &data) + { return (data.weights_dt == DataType::QSYMM8_PER_CHANNEL && data.source_dt != DataType::QASYMM8); }, + REGISTER_QASYMM8_SIGNED_NEON(neon_qp8_qs8_deptwiseconv2dnative)}, +}; + +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const ConvolutionInfo &info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); + ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(info.depth_multiplier == 0); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(1) + (weights->dimension(1) - 1) * (info.dilation.x() - 1) > + src->dimension(1) + info.pad_stride_info.pad_left() + info.pad_stride_info.pad_right()); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) + (weights->dimension(2) - 1) * (info.dilation.y() - 1) > + src->dimension(2) + info.pad_stride_info.pad_top() + info.pad_stride_info.pad_bottom()); + ARM_COMPUTE_RETURN_ERROR_ON((src->dimension(0) * info.depth_multiplier) != weights->dimension(0)); + ARM_COMPUTE_RETURN_ERROR_ON((info.dilation.x() < 1) || (info.dilation.y() < 1)); + ARM_COMPUTE_RETURN_ERROR_ON((info.pad_stride_info.stride().first < 1) || + (info.pad_stride_info.stride().second < 1)); + + if (is_data_type_quantized_per_channel(weights->data_type())) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL); + 
ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->quantization_info().scale().size()); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); + } + + if (biases != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(0)); + + if (is_data_type_quantized_asymmetric(src->data_type())) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases); + } + } + + if (dst->total_size() != 0) + { + const TensorShape output_shape = + misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), output_shape); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + } + + return Status{}; +} +} // namespace + +void CpuDepthwiseConv2dNativeKernel::configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ConvolutionInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, (biases != nullptr) ? biases : nullptr, dst, info)); + + _has_biases = (biases != nullptr); + _conv_info = info; + + const auto uk = CpuDepthwiseConv2dNativeKernel::get_implementation( + DepthwiseConv2dNativeDataTypeISASelectorData{weights->data_type(), src->data_type(), CPUInfo::get().get_isa()}); + ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); + _func = uk->ukernel; + + const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info); + auto_init_if_empty(*dst, src->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(output_shape) + .set_quantization_info(dst->quantization_info())); + + Window win = calculate_max_window(*dst, Steps()); + ICpuKernel::configure(win); +} + +Status CpuDepthwiseConv2dNativeKernel::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const ConvolutionInfo &info) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, info)); + return Status{}; +} + +void CpuDepthwiseConv2dNativeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_func == nullptr); + + const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0); + const auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); + const auto biases = tensors.get_const_tensor(TensorType::ACL_SRC_2); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + _func(src, weights, biases, dst, window, _has_biases, _conv_info); +} + +const char *CpuDepthwiseConv2dNativeKernel::name() const +{ + return "CpuDepthwiseConv2dNativeKernel"; +} + +const std::vector<CpuDepthwiseConv2dNativeKernel::DepthwiseConv2dNativeKernel> & +CpuDepthwiseConv2dNativeKernel::get_available_kernels() +{ + return available_kernels; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h b/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h new file mode 100644 index 0000000000..7e78f52e13 --- /dev/null +++ b/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h 
@@ -0,0 +1,109 @@ +/* + * Copyright (c) 2019-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_NATIVE_KERNEL_H +#define ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_NATIVE_KERNEL_H + +#include "arm_compute/core/utils/misc/Traits.h" +#include "arm_compute/function_info/ConvolutionInfo.h" + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" +#include "support/AclRequires.h" + +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#include <arm_neon.h> +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Interface for the kernel to run a depthwise convolution native on a tensor. */ +class CpuDepthwiseConv2dNativeKernel : public ICpuKernel<CpuDepthwiseConv2dNativeKernel> +{ +private: + using DepthwiseConv2dNativeKernelPtr = std::add_pointer<void( + const ITensor *, const ITensor *, const ITensor *, ITensor *, const Window &, bool, const ConvolutionInfo &)>:: + type; + +public: + CpuDepthwiseConv2dNativeKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDepthwiseConv2dNativeKernel); + + /** Initialize the function's source, destination and parameters. + * + * @note Supported data layouts: NHWC + * + * @param[in] src Source tensor. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[in] weights Weights tensor. This is a 3D tensor with dimensions [IFM, W, H]. + * Data type supported: Same as @p src or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p src is QASYMM8/QASYMM8_SIGNED. + * @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed. + * Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED. + * @param[out] dst Destination tensor. Data type supported: Same as @p src. + * @param[in] info Depthwise convolution meta-data. 
+ * + */ + void configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ConvolutionInfo &info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuDepthwiseConv2dNativeKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const ConvolutionInfo &info); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + struct DepthwiseConv2dNativeKernel + { + const char *name; + const DepthwiseConv2dNativeDataTypeISASelectorPtr is_selected; + DepthwiseConv2dNativeKernelPtr ukernel; + }; + static const std::vector<DepthwiseConv2dNativeKernel> &get_available_kernels(); + +private: + /** Common signature for all the specialised depthwise convolution native functions + * + * @param[in] window Region on which to execute the kernel. + */ + DepthwiseConv2dNativeKernelPtr _func{nullptr}; + ConvolutionInfo _conv_info{}; + bool _has_biases{false}; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_NATIVE_KERNEL_H */ diff --git a/src/cpu/kernels/CpuDequantizeKernel.cpp b/src/cpu/kernels/CpuDequantizeKernel.cpp new file mode 100644 index 0000000000..5595ace998 --- /dev/null +++ b/src/cpu/kernels/CpuDequantizeKernel.cpp @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2017-2021, 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
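[Editor's aside, not part of the diff] Two of the checks in the depthwise validate_arguments above are worth spelling out: the effective extent of a dilated kernel along one axis is kernel + (kernel - 1) * (dilation - 1), and it must fit inside the padded input; and with a depth multiplier m, the weights and output carry src_channels * m channels. A small hedged helper that restates both formulas with illustrative numbers:

#include <cstdio>

// Effective size of a dilated kernel along one axis.
int dilated_extent(int kernel, int dilation)
{
    return kernel + (kernel - 1) * (dilation - 1);
}

int main()
{
    const int kernel_w = 3, dilation_x = 2;
    const int src_w = 8, pad_left = 1, pad_right = 1;

    const int extent = dilated_extent(kernel_w, dilation_x); // 3 + 2*1 = 5
    std::printf("dilated extent: %d (must be <= %d)\n", extent, src_w + pad_left + pad_right);

    // Depth multiplier: each input channel produces 'depth_multiplier' output channels.
    const int src_channels = 16, depth_multiplier = 2;
    std::printf("expected weight/output channels: %d\n", src_channels * depth_multiplier);
    return 0;
}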
+ */ +#include "src/cpu/kernels/CpuDequantizeKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/NESymm.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/kernels/dequantize/generic/neon/list.h" + +#include <arm_neon.h> + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::QSYMM8_PER_CHANNEL, DataType::QSYMM8, + DataType::QSYMM16); + + if (dst->tensor_shape().total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(dst); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); + } + + return Status{}; +} +} // namespace + +void CpuDequantizeKernel::configure(const ITensorInfo *src, ITensorInfo *dst) +{ + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst)); + + // Configure kernel window + Window win = calculate_max_window(*src, Steps()); + + // Output tensor auto initialization if not yet initialized + auto_init_if_empty(*dst, src->tensor_shape(), 1, DataType::F32); + + ICpuKernel::configure(win); + + switch (dst->data_type()) + { + case DataType::F32: + _func = REGISTER_FP32_NEON(fp32_run_dequantization_core); + break; +#ifdef ARM_COMPUTE_ENABLE_FP16 + case DataType::F16: + _func = REGISTER_FP16_NEON(fp16_run_dequantization_core); + break; +#endif /* ARM_COMPUTE_ENABLE_FP16 */ + default: + ARM_COMPUTE_ERROR("Unsupported data type."); + } +} + +Status CpuDequantizeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst)); + return Status{}; +} + +void CpuDequantizeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + + const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + _func(src, dst, window); +} +const char *CpuDequantizeKernel::name() const +{ + return "CpuDequantizeKernel"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuDequantizeKernel.h b/src/cpu/kernels/CpuDequantizeKernel.h new file mode 100644 index 0000000000..d8b6444f0a --- /dev/null +++ b/src/cpu/kernels/CpuDequantizeKernel.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2017-2022, 2024 Arm Limited. 
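[Editor's aside, not part of the diff] For reference, the arithmetic the dequantize kernel above ultimately dispatches to is the usual affine mapping: real = scale * (q - offset) for the asymmetric types, real = scale * q for the symmetric ones, with QSYMM8_PER_CHANNEL carrying one scale per channel. A scalar sketch under those assumptions (not the library's vectorized implementation):

#include <cstdint>
#include <cstdio>
#include <vector>

// Asymmetric (per-tensor): real = scale * (q - offset).
float dequantize_qasymm8(uint8_t q, float scale, int32_t offset)
{
    return scale * (static_cast<int32_t>(q) - offset);
}

// Symmetric per-channel: one scale per channel, zero offset.
float dequantize_qsymm8_per_channel(int8_t q, const std::vector<float> &scales, size_t channel)
{
    return scales[channel] * static_cast<float>(q);
}

int main()
{
    std::printf("%f\n", dequantize_qasymm8(200, 0.1f, 128));                  // 7.2
    std::printf("%f\n", dequantize_qsymm8_per_channel(-50, {0.2f, 0.5f}, 1)); // -25.0
    return 0;
}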
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_CPU_KERNELS_CPUDEQUANTIZEKERNEL_H +#define ACL_SRC_CPU_KERNELS_CPUDEQUANTIZEKERNEL_H + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Interface for the dequantization layer kernel. */ +class CpuDequantizeKernel : public ICpuKernel<CpuDequantizeKernel> +{ +public: + CpuDequantizeKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDequantizeKernel); + /** Set input, output tensors. + * + * @param[in] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16. + * @param[out] dst Destination tensor info with the same dimensions of input. Data type supported: F16/F32. + */ + void configure(const ITensorInfo *src, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuDequantizeKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + +private: + /** Common signature for all the specialised @ref CpuDequantizeKernel functions + * + * @param[in] window Region on which to execute the kernel. + */ + using DequantizeFunctionExecutorPtr = void (*)(const ITensor *input, ITensor *output, const Window &window); + DequantizeFunctionExecutorPtr _func{nullptr}; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif // ACL_SRC_CPU_KERNELS_CPUDEQUANTIZEKERNEL_H diff --git a/src/cpu/kernels/CpuDirectConv2dKernel.cpp b/src/cpu/kernels/CpuDirectConv2dKernel.cpp new file mode 100644 index 0000000000..4cb0fb1c40 --- /dev/null +++ b/src/cpu/kernels/CpuDirectConv2dKernel.cpp @@ -0,0 +1,175 @@ +/* + * Copyright (c) 2017-2022 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuDirectConv2dKernel.h" + +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" + +#include "src/core/CPP/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/cpu/kernels/directconv2d/list.h" + +using namespace arm_compute::detail; + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +static const std::vector<CpuDirectConv2dKernel::DirectConv2dKernel> available_kernels = { + {"neon_fp32_nhwc_directconv2d", + [](const DataTypeDataLayoutISASelectorData &data) + { return data.dt == DataType::F32 && data.dl == DataLayout::NHWC; }, + REGISTER_FP32_NEON(arm_compute::cpu::kernels::neon_fp32_nhwc_directconv2d)}, + {"neon_fp32_nchw_directconv2d", + [](const DataTypeDataLayoutISASelectorData &data) + { return data.dt == DataType::F32 && data.dl == DataLayout::NCHW; }, + REGISTER_FP32_NEON(arm_compute::cpu::kernels::neon_fp32_nchw_directconv2d)}, + {"neon_fp16_nchw_directconv2d", + [](const DataTypeDataLayoutISASelectorData &data) + { return data.dt == DataType::F16 && data.dl == DataLayout::NCHW && data.isa.fp16; }, + REGISTER_FP16_NEON(arm_compute::cpu::kernels::neon_fp16_nchw_directconv2d)}, +}; + +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const PadStrideInfo &conv_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); + ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); + + const DataLayout data_layout = src->data_layout(); + const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(channel_idx) != src->dimension(channel_idx)); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) != weights->dimension(height_idx)); + ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4); + ARM_COMPUTE_RETURN_ERROR_ON(data_layout == 
DataLayout::NHWC && src->data_type() != DataType::F32); + ARM_COMPUTE_UNUSED(width_idx); + // Checks performed when output is configured + if (dst->total_size() != 0) + { + TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info); + + DataType data_type = src->data_type(); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), output_shape); + ARM_COMPUTE_RETURN_ERROR_ON(dst->data_type() != data_type); + } + + return Status{}; +} + +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst) +{ + ARM_COMPUTE_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN); + ARM_COMPUTE_UNUSED(src); + + Window win{}; + bool window_changed = false; + + // Configure window without any padding + win = calculate_max_window(*dst, Steps()); + + Status err = + (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + return std::make_pair(err, win); +} + +void CpuDirectConv2dKernel::configure(ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *dst, + const PadStrideInfo &conv_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); + + _conv_info = conv_info; + _data_layout = src->data_layout(); + _kernel_size = weights->dimension(get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH)); + + // Get convolved dimensions + TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info); + + DataType data_type = src->data_type(); + + // Output auto inizialitation if not yet initialized + auto_init_if_empty(*dst, output_shape, 1, data_type); + + // Perform validation step + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, dst, conv_info)); + + // Configure kernel window + auto win_config = validate_and_configure_window(src, dst); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICpuKernel::configure(win_config.second); +} + +Status CpuDirectConv2dKernel::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const PadStrideInfo &conv_info) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, dst, conv_info)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get()).first); + + return Status{}; +} + +void CpuDirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + + auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0); + auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + const auto *uk = CpuDirectConv2dKernel::get_implementation( + DataTypeDataLayoutISASelectorData{src->info()->data_type(), _data_layout, CPUInfo::get().get_isa()}); + ARM_COMPUTE_ERROR_ON(uk == nullptr); + + uk->ukernel(window, src, weights, dst, _conv_info); +} +const char *CpuDirectConv2dKernel::name() const +{ + return "CpuDirectConvolutionLayerKernel"; +} + +const std::vector<CpuDirectConv2dKernel::DirectConv2dKernel> &CpuDirectConv2dKernel::get_available_kernels() +{ + return available_kernels; +} + +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuDirectConv2dKernel.h b/src/cpu/kernels/CpuDirectConv2dKernel.h new file mode 100644 index 0000000000..ad4caea193 --- /dev/null +++ 
b/src/cpu/kernels/CpuDirectConv2dKernel.h @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2017-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_DIRECT_CONV2D_KERNEL_H +#define ARM_COMPUTE_CPU_DIRECT_CONV2D_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Interface for the kernel to perform Direct Convolution Layer. */ +class CpuDirectConv2dKernel : public ICpuKernel<CpuDirectConv2dKernel> +{ +private: + using DirectConv2dKernel_Ptr = std::add_pointer<void( + const Window &, const ITensor *, const ITensor *, ITensor *, const PadStrideInfo &)>::type; + +public: + CpuDirectConv2dKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDirectConv2dKernel); + /** Set the src, weights, and dst tensors. + * + * @note: DirectConvolution only works in the following configurations: + * 1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3 + * 3x3 convolution with stride_x = 1/2/3, stride_y = 1/2/3 + * + * @param[in] src The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: F16/F32. + * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. + * The 3rd dimension must be the same as the input's volume 3rd dimension. + * Data type supported:Same as @p input. + * @param[out] dst Output tensor. + * The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: F16/F32 + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. 
+ */ + void configure(ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, const PadStrideInfo &conv_info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuDirectConv2dKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const PadStrideInfo &conv_info); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + + struct DirectConv2dKernel + { + const char *name; + const DataTypeDataLayoutSelectorPtr is_selected; + DirectConv2dKernel_Ptr ukernel; + }; + + static const std::vector<DirectConv2dKernel> &get_available_kernels(); + +private: + PadStrideInfo _conv_info{}; + unsigned int _kernel_size{0}; + DataLayout _data_layout{DataLayout::UNKNOWN}; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /*ARM_COMPUTE_CPU_DIRECTCONV2D_KERNEL_H */ diff --git a/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp b/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp new file mode 100644 index 0000000000..d4af8bedaf --- /dev/null +++ b/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp @@ -0,0 +1,541 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
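[Editor's aside, not part of the diff] The compute_deep_convolution_shape call used in configure and validate above is, to a first approximation, the standard convolution output-size formula applied per spatial axis: out = (in + pad_before + pad_after - kernel) / stride + 1 with floor division, while the channel dimension becomes the number of kernels (OFM). A short hedged sketch of that arithmetic:

#include <cstdio>

// Output extent of a convolution along one spatial axis (floor semantics).
int conv_out_dim(int in, int kernel, int stride, int pad_before, int pad_after)
{
    return (in + pad_before + pad_after - kernel) / stride + 1;
}

int main()
{
    // e.g. a 3x3 kernel over a 224x224 input, stride 2, padding of 1 on each side.
    const int out_w = conv_out_dim(224, 3, 2, 1, 1); // (224 + 2 - 3) / 2 + 1 = 112
    const int out_h = conv_out_dim(224, 3, 2, 1, 1);
    std::printf("output: %dx%d\n", out_w, out_h);
    return 0;
}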
+ */ +#include "src/cpu/kernels/CpuDirectConv2dOutputStageKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/Traits.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include "src/core/CPP/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/NEFixedPoint.h" +#include "src/core/NEON/wrapper/wrapper.h" + +#include <arm_neon.h> +#include <cstddef> +#include <cstdint> + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const DirectConvolutionLayerOutputStageKernelInfo &info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); + ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::S32, DataType::F32); + + if (bias != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); + ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != src->dimension(get_data_layout_dimension_index( + src->data_layout(), DataLayoutDimension::CHANNEL))); + ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); + } + + if (src->data_type() == DataType::S32) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst == nullptr, "In-place computation not allowed for quantized output"); + } + + // Checks performed when output is configured + if ((dst != nullptr) && (dst->total_size() != 0)) + { + if (is_data_type_float(src->data_type())) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); + } + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); + } + else if (src->data_type() == DataType::S32) + { + // In case of quantized computation and unconfigured output, the output data type must be provided through DirectConvolutionLayerOutputStageKernelInfo + ARM_COMPUTE_RETURN_ERROR_ON((info.output_data_type != DataType::QASYMM8) && + (info.output_data_type != DataType::QASYMM8_SIGNED)); + } + + return Status{}; +} + +template <typename T> +typename std::enable_if<arm_compute::utils::traits::is_floating_point<T>::value, void>::type +output_stage_nchw(ITensor *src, + const ITensor *bias, + const Window &window, + ITensor *dst, + int result_fixedpoint_multiplier, + int result_shift, + int result_offset_after_shift) +{ + const bool has_bias = bias != nullptr; + /** SIMD vector tag type. 
*/ + using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>; + + ARM_COMPUTE_ERROR_ON(src->info()->data_layout() == DataLayout::UNKNOWN); + ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier); + ARM_COMPUTE_UNUSED(result_shift); + ARM_COMPUTE_UNUSED(result_offset_after_shift); + + const int window_start_x = window.x().start(); + const int window_end_x = window.x().end(); + const int window_step_x = 16 / src->info()->element_size(); + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator in(src, win); + Iterator out(dst, win); + execute_window_loop( + win, + [&](const Coordinates &id) + { + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + // Get bias and pointer to input + const auto in_ptr = reinterpret_cast<const T *>(in.ptr()) + x; + auto v_in = wrapper::vloadq(in_ptr); + + // Accumulate bias + if (has_bias) + { + const auto vb = wrapper::vdup_n( + *reinterpret_cast<const T *>(bias->ptr_to_element(Coordinates(id.z()))), ExactTagType{}); + v_in = wrapper::vadd(v_in, vb); + } + + const auto out_ptr = reinterpret_cast<T *>(out.ptr()) + x; + wrapper::vstore(out_ptr, v_in); + } + + // Left-overs loop + for (; x < window_end_x; ++x) + { + // Get bias and pointer to input + auto s_in = *(reinterpret_cast<const T *>(in.ptr()) + x); + + // Accumulate bias + if (has_bias) + { + const auto b = *reinterpret_cast<const T *>(bias->ptr_to_element(Coordinates(id.z()))); + s_in += b; + } + + *(reinterpret_cast<T *>(out.ptr()) + x) = s_in; + } + }, + in, out); +} + +template <typename T> +typename std::enable_if<arm_compute::utils::traits::is_floating_point<T>::value, void>::type +output_stage_nhwc(ITensor *src, + const ITensor *bias, + const Window &window, + ITensor *dst, + int result_fixedpoint_multiplier, + int result_shift, + int result_offset_after_shift) +{ + const bool has_bias = bias != nullptr; + ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier); + ARM_COMPUTE_UNUSED(result_shift); + ARM_COMPUTE_UNUSED(result_offset_after_shift); + + Window window_bias = window; + window_bias.set(Window::DimX, Window::Dimension(0, 1, 1)); + window_bias.set(Window::DimY, Window::Dimension(0, 0, 0)); + window_bias.set(Window::DimZ, Window::Dimension(0, 0, 0)); + window_bias.set(3, Window::Dimension(0, 0, 0)); + + const int window_start_x = window.x().start(); + const int window_end_x = window.x().end(); + const int window_step_x = 16 / src->info()->element_size(); + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator in(src, win); + Iterator bi(bias, window_bias); + Iterator out(dst, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + // Get bias and pointer to input + const auto in_ptr = reinterpret_cast<const T *>(in.ptr()); + auto v_in = wrapper::vloadq(in_ptr + x); + + // Accumulate bias + if (has_bias) + { + const auto bias_ptr = reinterpret_cast<T *>(bi.ptr()) + x; + v_in = wrapper::vadd(v_in, wrapper::vloadq(bias_ptr)); + } + + const auto out_ptr = reinterpret_cast<T *>(out.ptr()); + wrapper::vstore(out_ptr + x, v_in); + } + + // Left-overs loop + for (; x < window_end_x; ++x) + { + // Get bias and pointer to input + auto s_in = *(reinterpret_cast<const T *>(in.ptr()) + x); + + // Accumulate bias + if (has_bias) + { + const auto bias_ptr = reinterpret_cast<T *>(bi.ptr()) + x; + s_in += *bias_ptr; + } + + const auto out_ptr = 
reinterpret_cast<T *>(out.ptr()); + *(out_ptr + x) = s_in; + } + }, + in, bi, out); +} + +// Quantized case +template < + typename TOut, + typename std::enable_if<std::is_same<TOut, uint8_t>::value || std::is_same<TOut, int8_t>::value, int>::type = 0> +void output_stage_nchw(ITensor *src, + const ITensor *bias, + const Window &window, + ITensor *dst, + int result_fixedpoint_multiplier, + int result_shift, + int result_offset_after_shift) +{ + const bool has_bias = bias != nullptr; + using VectorType = typename wrapper::traits::neon_bitvector_t<TOut, wrapper::traits::BitWidth::W128>; + using TagType = typename wrapper::traits::neon_bitvector_tag_t<TOut, wrapper::traits::BitWidth::W128>; + + const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift); + + const VectorType min = wrapper::vdup_n(std::numeric_limits<TOut>::lowest(), TagType{}); + const VectorType max = wrapper::vdup_n(std::numeric_limits<TOut>::max(), TagType{}); + + const int window_start_x = window.x().start(); + const int window_end_x = window.x().end(); + const int window_step_x = 16 / src->info()->element_size(); + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator in(src, win); + Iterator out(dst, win); + + execute_window_loop( + win, + [&](const Coordinates &id) + { + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + // Get bias and pointer to input + const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr()) + x; + int32x4x4_t v_in = {{wrapper::vloadq(in_ptr), wrapper::vloadq(in_ptr + 4), wrapper::vloadq(in_ptr + 8), + wrapper::vloadq(in_ptr + 12)}}; + + // Accumulate bias + if (has_bias) + { + const auto vb = wrapper::vdup_n( + *reinterpret_cast<const int32_t *>(bias->ptr_to_element(Coordinates(id.z()))), TagType{}); + v_in = {{wrapper::vadd(v_in.val[0], vb), wrapper::vadd(v_in.val[1], vb), + wrapper::vadd(v_in.val[2], vb), wrapper::vadd(v_in.val[3], vb)}}; + } + + const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x; + wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, + result_offset_after_shift_s32, min, max, false)); + } + + // Left-overs loop + for (; x < window_end_x; ++x) + { + // Get bias and pointer to input + int32_t s_in = *(reinterpret_cast<const int32_t *>(in.ptr()) + x); + + // Accumulate bias + if (has_bias) + { + const auto b = *reinterpret_cast<const int32_t *>(bias->ptr_to_element(Coordinates(id.z()))); + s_in += b; + } + + const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x; + *out_ptr = + finalize_quantization(s_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, + std::numeric_limits<TOut>::lowest(), std::numeric_limits<TOut>::max(), false); + } + }, + in, out); +} +template < + typename TOut, + typename std::enable_if<std::is_same<TOut, uint8_t>::value || std::is_same<TOut, int8_t>::value, int>::type = 0> +void output_stage_nhwc(ITensor *src, + const ITensor *bias, + const Window &window, + ITensor *dst, + int result_fixedpoint_multiplier, + int result_shift, + int result_offset_after_shift) +{ + const bool has_bias = bias != nullptr; + using VectorType = typename wrapper::traits::neon_bitvector_t<TOut, wrapper::traits::BitWidth::W128>; + using TagType = typename wrapper::traits::neon_bitvector_tag_t<TOut, wrapper::traits::BitWidth::W128>; + + const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift); + + const VectorType min = 
wrapper::vdup_n(std::numeric_limits<TOut>::lowest(), TagType{}); + const VectorType max = wrapper::vdup_n(std::numeric_limits<TOut>::max(), TagType{}); + + Window window_bias = window; + window_bias.set(Window::DimX, Window::Dimension(0, 1, 1)); + window_bias.set(Window::DimY, Window::Dimension(0, 0, 0)); + window_bias.set(Window::DimZ, Window::Dimension(0, 0, 0)); + window_bias.set(3, Window::Dimension(0, 0, 0)); + + const int window_start_x = window.x().start(); + const int window_end_x = window.x().end(); + const int window_step_x = 16 / src->info()->element_size(); + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator in(src, win); + Iterator bi(bias, window_bias); + Iterator out(dst, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + // Get bias and pointer to input + const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr()) + x; + int32x4x4_t v_in = {{ + wrapper::vloadq(in_ptr), + wrapper::vloadq(in_ptr + 4), + wrapper::vloadq(in_ptr + 8), + wrapper::vloadq(in_ptr + 12), + }}; + + // Accumulate bias + if (has_bias) + { + const auto bias_ptr = reinterpret_cast<int32_t *>(bi.ptr()) + x; + + wrapper::vadd(v_in.val[0], wrapper::vloadq(bias_ptr)); + wrapper::vadd(v_in.val[1], wrapper::vloadq(bias_ptr + 4)); + wrapper::vadd(v_in.val[2], wrapper::vloadq(bias_ptr + 8)); + wrapper::vadd(v_in.val[3], wrapper::vloadq(bias_ptr + 12)); + } + + const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x; + wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, + result_offset_after_shift_s32, min, max, false)); + } + + // Left-overs loop + for (; x < window_end_x; ++x) + { + // Get bias and pointer to input + const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr()) + x; + int32_t s_in = *in_ptr; + + // Accumulate bias + if (has_bias) + { + const auto bias_ptr = reinterpret_cast<int32_t *>(bi.ptr()) + x; + s_in += *bias_ptr; + } + + const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x; + *out_ptr = + finalize_quantization(s_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, + std::numeric_limits<TOut>::lowest(), std::numeric_limits<TOut>::max(), false); + } + }, + in, bi, out); +} +} // namespace + +void CpuDirectConv2dOutputStageKernel::configure(ITensorInfo *src, + const ITensorInfo *bias, + ITensorInfo *dst, + const DirectConvolutionLayerOutputStageKernelInfo &info) +{ + ARM_COMPUTE_UNUSED(bias); + // Perform validation step + ARM_COMPUTE_ERROR_ON_NULLPTR(src); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, info)); + + _func = nullptr; + _result_fixedpoint_multiplier = info.result_fixedpoint_multiplier; + _result_shift = info.result_shift; + _result_offset_after_shift = info.result_offset_after_shift; + + // Auto-initialize output output if required + if (dst != nullptr) + { + // Work out expected output data type + const DataType output_dt = (src->data_type() == DataType::S32) ? info.output_data_type : DataType::S32; + // Output tensor auto initialization if not yet initialized + auto_init_if_empty(*dst, src->clone()->set_data_type(output_dt)); + } + + Window win = calculate_max_window(*src, Steps()); + + ICpuKernel::configure(win); + + const bool is_qasymm8_signed = + (dst != nullptr) ? 
is_data_type_quantized_asymmetric_signed(dst->data_type()) : false; + + // Set appropriate function + if (src->data_layout() == DataLayout::NCHW) + { + switch (src->data_type()) + { + case DataType::S32: + { + if (is_qasymm8_signed) + { + _func = &output_stage_nchw<int8_t>; + } + else + { + _func = &output_stage_nchw<uint8_t>; + } + break; + } +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: + { + _func = &output_stage_nchw<float16_t>; + break; + } +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + case DataType::F32: + { + _func = &output_stage_nchw<float>; + break; + } + default: + { + ARM_COMPUTE_ERROR("Unsupported combination of types among the inputs."); + } + } + } + else + { + switch (src->data_type()) + { + case DataType::S32: + { + if (is_qasymm8_signed) + { + _func = &output_stage_nhwc<int8_t>; + } + else + { + _func = &output_stage_nhwc<uint8_t>; + } + break; + } +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: + { + _func = &output_stage_nhwc<float16_t>; + break; + } +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + case DataType::F32: + { + _func = &output_stage_nhwc<float>; + break; + } + default: + { + ARM_COMPUTE_ERROR("Unsupported combination of types among the inputs."); + } + } + } +} + +Status CpuDirectConv2dOutputStageKernel::validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const DirectConvolutionLayerOutputStageKernelInfo &info) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, info)); + return Status{}; +} + +void CpuDirectConv2dOutputStageKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_func == nullptr); + + auto src = tensors.get_tensor(TensorType::ACL_SRC_0); + auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_1); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + (*_func)(src, bias, window, dst, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift); +} + +const char *CpuDirectConv2dOutputStageKernel::name() const +{ + return "CpuDirectConv2dOutputStageKernel"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.h b/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.h new file mode 100644 index 0000000000..ce84f49cf6 --- /dev/null +++ b/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2017-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
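[Editor's aside, not part of the diff] In the quantized (S32 accumulator) paths of the output stage above, finalize_quantization performs a GEMM-style requantization: add the bias, scale by a fixed-point multiplier, shift right, add the output offset, then clamp to the output type's range; the float paths simply add the bias and ignore the fixed-point parameters. A plain scalar approximation of that pipeline, using a double in place of the library's fixed-point helpers, so the rounding is only indicative:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Approximate requantization of one S32 accumulator to QASYMM8.
// 'multiplier' stands in for result_fixedpoint_multiplier / 2^31 and 'shift'
// for result_shift; the exact rounding of the NEON helpers differs slightly.
uint8_t requantize(int32_t acc, int32_t bias, double multiplier, int shift, int32_t offset)
{
    const double scaled = (acc + bias) * multiplier / std::ldexp(1.0, shift);
    const int32_t q     = static_cast<int32_t>(std::lround(scaled)) + offset;
    return static_cast<uint8_t>(std::clamp<int32_t>(q, 0, 255));
}

int main()
{
    std::printf("%u\n", static_cast<unsigned>(requantize(12345, 100, 0.42, 8, 10)));
    return 0;
}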
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_DIRECT_CONV2D_OUTPUT_STAGE_KERNEL_H +#define ARM_COMPUTE_CPU_DIRECT_CONV2D_OUTPUT_STAGE_KERNEL_H + +#include "arm_compute/core/KernelDescriptors.h" + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Kernel to accumulate the biases, if provided, or downscale in case of quantized input. + * + * @note We assume bias to be shared + * @note For quantized computations (i.e. @p src of S32 type) the output data type for auto-initialization must be passed as part + * of the @ref DirectConvolutionLayerOutputStageKernelInfo. + */ +class CpuDirectConv2dOutputStageKernel : public ICpuKernel<CpuDirectConv2dOutputStageKernel> +{ +public: + CpuDirectConv2dOutputStageKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDirectConv2dOutputStageKernel); + /** Set the accumulate buffer and the biases of the kernel. + * + * @param[in, out] src Input to add the bias to. If @p dst is not specified then accumulation is done in-place. + * Data type supported: F16/F32/S32 + * @param[in] bias (Optional) The shared bias tensor to add. It must be 1D Tensor. Data type supported: Same as @p src + * @param[out] dst (Optional) If the dst tensor is specified the accumulation is done out-of-place. (Defaults to nullptr) + * Note that in-place computation is only supported for F16/F32. For S32 this must not be nullptr. + * Data type supported: F16/F32 or QASYMM8/QASYMM8_SIGNED if @p src is S32 + * @param[in] info (Optional) DirectConvolutionLayerOutputStageKernel descriptor metadata + */ + void + configure(ITensorInfo *src, + const ITensorInfo *bias = nullptr, + ITensorInfo *dst = nullptr, + const DirectConvolutionLayerOutputStageKernelInfo &info = DirectConvolutionLayerOutputStageKernelInfo()); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuDirectConv2dOutputStageKernel::configure() + * + * @return a status + */ + static Status + validate(const ITensorInfo *src, + const ITensorInfo *bias = nullptr, + const ITensorInfo *dst = nullptr, + const DirectConvolutionLayerOutputStageKernelInfo &info = DirectConvolutionLayerOutputStageKernelInfo()); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + +private: + using OutputStageKernel = void(ITensor *src, + const ITensor *bias, + const Window &window, + ITensor *dst, + int result_fixedpoint_multiplier, + int result_shift, + int result_offset_after_shift); + + OutputStageKernel *_func{nullptr}; + int _result_fixedpoint_multiplier{0}; + int _result_shift{0}; + int _result_offset_after_shift{0}; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_DIRECT_CONV2D_OUTPUT_STAGE_KERNEL_H */ diff --git a/src/cpu/kernels/CpuDirectConv3dKernel.cpp b/src/cpu/kernels/CpuDirectConv3dKernel.cpp new file mode 100644 index 0000000000..b5b2aed1ba --- /dev/null +++ b/src/cpu/kernels/CpuDirectConv3dKernel.cpp @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2021-2022 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuDirectConv3dKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" + +#include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/kernels/conv3d/neon/list.h" + +#include <algorithm> + +using namespace arm_compute::detail; + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +static const std::vector<CpuDirectConv3dKernel::DirectConv3dKernel> available_kernels = { +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + {"neon_fp16_directconv3d", + [](const DataTypeISASelectorData &data) { return data.dt == DataType::F16 && data.isa.fp16; }, + REGISTER_FP16_NEON(arm_compute::cpu::directconv3d_float_neon_ndhwc<float16_t>)}, +#endif /* !defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ + {"neon_fp32_directconv3d", [](const DataTypeISASelectorData &data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(arm_compute::cpu::directconv3d_float_neon_ndhwc<float>)}, + {"neon_qasymm8_directconv3d", [](const DataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8; }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::directconv3d_quantized_neon_ndhwc<uint8_t>)}, + {"neon_qasymm8_signed_directconv3d", + [](const DataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8_SIGNED; }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::directconv3d_quantized_neon_ndhwc<int8_t>)}}; + +Status validate_arguments(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const Conv3dInfo &conv_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); + ARM_COMPUTE_RETURN_ERROR_ON(src0->data_layout() != DataLayout::NDHWC); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src0, src1, dst); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src0); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32, DataType::QASYMM8, + DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); + 
ARM_COMPUTE_RETURN_ERROR_ON(conv_info.dilation != Size3D(1U, 1U, 1U)); + + const auto *uk = + CpuDirectConv3dKernel::get_implementation(DataTypeISASelectorData{src0->data_type(), CPUInfo::get().get_isa()}); + + ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); + + const DataLayout data_layout = src0->data_layout(); + const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + + // Weight layout is D, H, W, Cin, Cout + ARM_COMPUTE_RETURN_ERROR_ON(src1->num_dimensions() > 5); + ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(1) != src0->dimension(channel_idx)); + + if (src2 != nullptr) + { + if (is_data_type_quantized(src0->data_type())) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 1, DataType::S32); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2); + } + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src2->dimension(0) != src1->dimension(0), + "Biases size and number of dst feature maps should match"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src2->num_dimensions() > 1, "Biases should be one dimensional"); + } + + // Checks performed when output is configured + if (dst->total_size() != 0) + { + TensorShape output_shape = + misc::shape_calculator::compute_conv3d_shape(src0->tensor_shape(), src1->tensor_shape(), conv_info); + + DataType data_type = src0->data_type(); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), output_shape); + ARM_COMPUTE_RETURN_ERROR_ON(dst->data_type() != data_type); + } + + return Status{}; +} +} // namespace + +void CpuDirectConv3dKernel::configure(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + ITensorInfo *dst, + const Conv3dInfo &conv_info) +{ + ARM_COMPUTE_UNUSED(src2); + ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); + + const auto *uk = + CpuDirectConv3dKernel::get_implementation(DataTypeISASelectorData{src0->data_type(), CPUInfo::get().get_isa()}); + + ARM_COMPUTE_ERROR_ON_NULLPTR(uk); + + _conv_info = conv_info; + _run_method = uk->ukernel; + _name = std::string("CpuDirectConv3dKernel").append("/").append(uk->name); + + // Get convolved dimensions + TensorShape output_shape = + misc::shape_calculator::compute_conv3d_shape(src0->tensor_shape(), src1->tensor_shape(), conv_info); + + DataType data_type = src0->data_type(); + + // Output auto inizialitation if not yet initialized + auto_init_if_empty(*dst, output_shape, 1, data_type); + + // Perform validation step + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, src2, dst, conv_info)); + + // Configure kernel window + Window win = calculate_max_window(*dst, Steps()); + ICpuKernel::configure(win); +} + +Status CpuDirectConv3dKernel::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const Conv3dInfo &conv_info) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, conv_info)); + + return Status{}; +} + +void CpuDirectConv3dKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_run_method == nullptr); + + auto src0 = tensors.get_const_tensor(TensorType::ACL_SRC_0); + auto src1 = tensors.get_const_tensor(TensorType::ACL_SRC_1); + auto src2 = tensors.get_const_tensor(TensorType::ACL_SRC_2); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + _run_method(src0, src1, 
src2, dst, _conv_info, window); +} + +const char *CpuDirectConv3dKernel::name() const +{ + return _name.c_str(); +} + +const std::vector<CpuDirectConv3dKernel::DirectConv3dKernel> &CpuDirectConv3dKernel::get_available_kernels() +{ + return available_kernels; +} + +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuDirectConv3dKernel.h b/src/cpu/kernels/CpuDirectConv3dKernel.h new file mode 100644 index 0000000000..8e6f564679 --- /dev/null +++ b/src/cpu/kernels/CpuDirectConv3dKernel.h @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2021-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_DIRECT_CONV3D_KERNEL_H +#define ARM_COMPUTE_CPU_DIRECT_CONV3D_KERNEL_H + +#include "arm_compute/runtime/FunctionDescriptors.h" + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Interface for the kernel to perform 3D Direct Convolution Layer. */ +class CpuDirectConv3dKernel : public ICpuKernel<CpuDirectConv3dKernel> +{ +private: + /* Template function for convolution 3d NDHWC */ + using DirectConv3dKernelPtr = std::add_pointer<void( + const ITensor *, const ITensor *, const ITensor *, ITensor *, const Conv3dInfo &, const Window &)>::type; + +public: + CpuDirectConv3dKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDirectConv3dKernel); + /** Set the src, weights, biases and dst tensor info. + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:--------------|:------------------|:------|:--------------| + * |F16 |F16 |F16 |F16 | + * |F32 |F32 |F32 |F32 | + * |QASYMM8 |QASYMM8 |S32 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED | + * + * @param[in, out] src0 Input tensor info. + * @param[in] src1 Set of kernels to convolve the input volume. + * The 2nd dimension must be the same as the input's volume 1st dimension. + * @param[in] src2 Set of biases. Can be nullptr. + * @param[out] dst Output tensor info. + * The 1st dimensions must be equal to the 1st dimension of the @p kernels tensor. + * @param[in] conv_info Contains padding, stride, acitvation information. 
+ * + */ + void configure(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + ITensorInfo *dst, + const Conv3dInfo &conv_info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuDirectConv3dKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const Conv3dInfo &conv_info); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + + struct DirectConv3dKernel + { + const char *name; + const DataTypeISASelectorPtr is_selected; + DirectConv3dKernelPtr ukernel; + }; + + static const std::vector<DirectConv3dKernel> &get_available_kernels(); + +private: + Conv3dInfo _conv_info{}; + DirectConv3dKernelPtr _run_method{nullptr}; + std::string _name{}; +}; + +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /*ARM_COMPUTE_CPU_DIRECTCONV3D_KERNEL_H */ diff --git a/src/cpu/kernels/CpuElementwiseKernel.cpp b/src/cpu/kernels/CpuElementwiseKernel.cpp new file mode 100644 index 0000000000..57a3f39822 --- /dev/null +++ b/src/cpu/kernels/CpuElementwiseKernel.cpp @@ -0,0 +1,511 @@ +/* + * Copyright (c) 2018-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/cpu/kernels/CpuElementwiseKernel.h" + +#include "arm_compute/core/Helpers.h" + +#include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/cpu/kernels/elementwise_binary/list.h" + +#include <arm_neon.h> + +#if defined(ENABLE_FP32_KERNELS) +namespace +{ +static constexpr size_t default_min_max_mws_N1_fp32_neon = 25308; +static constexpr size_t default_min_max_mws_V1_fp32_neon = 34772; +static constexpr size_t default_div_mws_N1_fp32_neon = 19043; +static constexpr size_t default_div_mws_V1_fp32_neon = 25511; +} // namespace +#endif /* ENABLE_FP32_KERNELS */ + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +template <ArithmeticOperation op> +const std::vector<CpuElementwiseKernel<CpuArithmeticKernel>::ElementwiseKernel> available_kernels_arithmetic = { + {"sve2_qu8_arithmetic", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::QASYMM8 && data.isa.sve2 && static_cast<ArithmeticOperation>(data.op) == op; }, + REGISTER_QASYMM8_SVE2(sve2_qasymm8_elementwise_binary<op>)}, + {"sve2_qs8_arithmetic", + [](const ElementwiseDataTypeISASelectorData &data) { + return data.dt == DataType::QASYMM8_SIGNED && data.isa.sve2 && static_cast<ArithmeticOperation>(data.op) == op; + }, + REGISTER_QASYMM8_SIGNED_SVE2(sve2_qasymm8_signed_elementwise_binary<op>)}, + {"sve_fp32_arithmetic", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::F32 && data.isa.sve && static_cast<ArithmeticOperation>(data.op) == op; }, + REGISTER_FP32_SVE(sve_fp32_elementwise_binary<op>)}, + {"sve_s32_arithmetic", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::S32 && data.isa.sve && static_cast<ArithmeticOperation>(data.op) == op; }, + REGISTER_INTEGER_SVE(sve_s32_elementwise_binary<op>)}, + {"sve_s16_arithmetic", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::S16 && data.isa.sve && static_cast<ArithmeticOperation>(data.op) == op; }, + REGISTER_INTEGER_SVE(sve_s16_elementwise_binary<op>)}, + {"sve_fp16_arithmetic", + [](const ElementwiseDataTypeISASelectorData &data) + { + return data.dt == DataType::F16 && data.isa.sve && data.isa.fp16 && + static_cast<ArithmeticOperation>(data.op) == op; + }, + REGISTER_FP16_SVE(sve_fp16_elementwise_binary<op>)}, + {"neon_fp32_arithmetic", + + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::F32 && static_cast<ArithmeticOperation>(data.op) == op; }, + REGISTER_FP32_NEON(neon_fp32_elementwise_binary<op>)}, + {"neon_s32_arithmetic", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::S32 && static_cast<ArithmeticOperation>(data.op) == op; }, + REGISTER_INTEGER_NEON(neon_s32_elementwise_binary<op>)}, + {"neon_fp16_arithmetic", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::F16 && data.isa.fp16 && static_cast<ArithmeticOperation>(data.op) == op; }, + REGISTER_FP16_NEON(neon_fp16_elementwise_binary<op>)}, + {"neon_s16_arithmetic", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::S16 && static_cast<ArithmeticOperation>(data.op) == op; }, + REGISTER_INTEGER_NEON(neon_s16_elementwise_binary<op>)}, + {"neon_qu8_arithmetic", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::QASYMM8 && 
static_cast<ArithmeticOperation>(data.op) == op; }, + REGISTER_QASYMM8_NEON(neon_qasymm8_elementwise_binary<op>)}, + {"neon_qs8_arithmetic", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::QASYMM8_SIGNED && static_cast<ArithmeticOperation>(data.op) == op; }, + REGISTER_QASYMM8_SIGNED_NEON(neon_qasymm8_signed_elementwise_binary<op>)}, +}; +template <ComparisonOperation op> +const std::vector<CpuElementwiseKernel<CpuComparisonKernel>::ElementwiseKernel> available_kernels_comperison = { + {"sve2_qu8_comparison", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::QASYMM8 && data.isa.sve2 && static_cast<ComparisonOperation>(data.op) == op; }, + REGISTER_QASYMM8_SVE2(sve2_qasymm8_comparison_elementwise_binary<op>)}, + {"sve2_qs8_comparison", + [](const ElementwiseDataTypeISASelectorData &data) { + return data.dt == DataType::QASYMM8_SIGNED && data.isa.sve2 && static_cast<ComparisonOperation>(data.op) == op; + }, + REGISTER_QASYMM8_SIGNED_SVE2(sve2_qasymm8_signed_comparison_elementwise_binary<op>)}, + {"sve_u8_comparison", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::U8 && data.isa.sve && static_cast<ComparisonOperation>(data.op) == op; }, + REGISTER_INTEGER_SVE(sve_u8_comparison_elementwise_binary<op>)}, + {"sve_fp32_comparison", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::F32 && data.isa.sve && static_cast<ComparisonOperation>(data.op) == op; }, + REGISTER_FP32_SVE(sve_fp32_comparison_elementwise_binary<op>)}, + {"sve_s16_comparison", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::S16 && data.isa.sve && static_cast<ComparisonOperation>(data.op) == op; }, + REGISTER_INTEGER_SVE(sve_s16_comparison_elementwise_binary<op>)}, + {"sve_s32_comparison", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::S32 && data.isa.sve && static_cast<ComparisonOperation>(data.op) == op; }, + REGISTER_INTEGER_SVE(sve_s32_comparison_elementwise_binary<op>)}, + {"sve_fp16_comparison", + [](const ElementwiseDataTypeISASelectorData &data) + { + return data.dt == DataType::F16 && data.isa.sve && data.isa.fp16 && + static_cast<ComparisonOperation>(data.op) == op; + }, + REGISTER_FP16_SVE(sve_fp16_comparison_elementwise_binary<op>)}, + {"neon_u8_comparison", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::U8 && static_cast<ComparisonOperation>(data.op) == op; }, + REGISTER_INTEGER_NEON(neon_u8_comparison_elementwise_binary<op>)}, + {"neon_fp32_comparison", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::F32 && static_cast<ComparisonOperation>(data.op) == op; }, + REGISTER_FP32_NEON(neon_fp32_comparison_elementwise_binary<op>)}, + {"neon_s16_comparison", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::S16 && static_cast<ComparisonOperation>(data.op) == op; }, + REGISTER_INTEGER_NEON(neon_s16_comparison_elementwise_binary<op>)}, + {"neon_s32_comparison", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::S32 && static_cast<ComparisonOperation>(data.op) == op; }, + REGISTER_INTEGER_NEON(neon_s32_comparison_elementwise_binary<op>)}, + {"neon_qu8_comparison", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::QASYMM8 && static_cast<ComparisonOperation>(data.op) == op; }, + 
REGISTER_QASYMM8_NEON(neon_qasymm8_comparison_elementwise_binary<op>)}, + {"neon_qs8_comparison", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::QASYMM8_SIGNED && static_cast<ComparisonOperation>(data.op) == op; }, + REGISTER_QASYMM8_SIGNED_NEON(neon_qasymm8_signed_comparison_elementwise_binary<op>)}, + {"neon_fp16_comparison", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::F16 && data.isa.fp16 && static_cast<ComparisonOperation>(data.op) == op; }, + REGISTER_FP16_NEON(neon_fp16_comparison_elementwise_binary<op>)}, +}; +} // namespace + +const std::vector<CpuElementwiseKernel<CpuArithmeticKernel>::ElementwiseKernel> & +CpuArithmeticKernel::get_available_kernels() +{ + static std::vector<CpuElementwiseKernel<CpuArithmeticKernel>::ElementwiseKernel> available_kernels; + std::move(available_kernels_arithmetic<ArithmeticOperation::ADD>.begin(), + available_kernels_arithmetic<ArithmeticOperation::ADD>.end(), std::back_inserter(available_kernels)); + std::move(available_kernels_arithmetic<ArithmeticOperation::SUB>.begin(), + available_kernels_arithmetic<ArithmeticOperation::SUB>.end(), std::back_inserter(available_kernels)); + std::move(available_kernels_arithmetic<ArithmeticOperation::DIV>.begin(), + available_kernels_arithmetic<ArithmeticOperation::DIV>.end(), std::back_inserter(available_kernels)); + std::move(available_kernels_arithmetic<ArithmeticOperation::MIN>.begin(), + available_kernels_arithmetic<ArithmeticOperation::MIN>.end(), std::back_inserter(available_kernels)); + std::move(available_kernels_arithmetic<ArithmeticOperation::MAX>.begin(), + available_kernels_arithmetic<ArithmeticOperation::MAX>.end(), std::back_inserter(available_kernels)); + std::move(available_kernels_arithmetic<ArithmeticOperation::SQUARED_DIFF>.begin(), + available_kernels_arithmetic<ArithmeticOperation::SQUARED_DIFF>.end(), + std::back_inserter(available_kernels)); + std::move(available_kernels_arithmetic<ArithmeticOperation::POWER>.begin(), + available_kernels_arithmetic<ArithmeticOperation::POWER>.end(), std::back_inserter(available_kernels)); + std::move(available_kernels_arithmetic<ArithmeticOperation::PRELU>.begin(), + available_kernels_arithmetic<ArithmeticOperation::PRELU>.end(), std::back_inserter(available_kernels)); + + return available_kernels; +} + +const std::vector<CpuElementwiseKernel<CpuComparisonKernel>::ElementwiseKernel> & +CpuComparisonKernel::get_available_kernels() +{ + static std::vector<CpuElementwiseKernel<CpuComparisonKernel>::ElementwiseKernel> available_kernels; + std::move(available_kernels_comperison<ComparisonOperation::Equal>.begin(), + available_kernels_comperison<ComparisonOperation::Equal>.end(), std::back_inserter(available_kernels)); + std::move(available_kernels_comperison<ComparisonOperation::NotEqual>.begin(), + available_kernels_comperison<ComparisonOperation::NotEqual>.end(), std::back_inserter(available_kernels)); + std::move(available_kernels_comperison<ComparisonOperation::Greater>.begin(), + available_kernels_comperison<ComparisonOperation::Greater>.end(), std::back_inserter(available_kernels)); + std::move(available_kernels_comperison<ComparisonOperation::GreaterEqual>.begin(), + available_kernels_comperison<ComparisonOperation::GreaterEqual>.end(), + std::back_inserter(available_kernels)); + std::move(available_kernels_comperison<ComparisonOperation::Less>.begin(), + available_kernels_comperison<ComparisonOperation::Less>.end(), std::back_inserter(available_kernels)); + 
std::move(available_kernels_comperison<ComparisonOperation::LessEqual>.begin(), + available_kernels_comperison<ComparisonOperation::LessEqual>.end(), + std::back_inserter(available_kernels)); + + return available_kernels; +} + +template <class Derived> +Status CpuElementwiseKernel<Derived>::validate_arguments_common(const ITensorInfo &src0, + const ITensorInfo &src1, + const ITensorInfo &dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src0); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &src1); + + const TensorShape out_shape = TensorShape::broadcast_shape(src0.tensor_shape(), src1.tensor_shape()); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); + + // Validate in case of configured dst + if (dst.total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0), + "Wrong shape for output"); + } + + return Status{}; +} + +void CpuArithmeticKernel::configure_common(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); + + const auto *uk = CpuArithmeticKernel::get_implementation( + ElementwiseDataTypeISASelectorData{src0->data_type(), CPUInfo::get().get_isa(), static_cast<int>(_op)}); + + ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); + + _run_method = uk->ukernel; + _name = std::string("CpuArithmeticKernel").append("/").append(uk->name); + + // If any of shapes is dynamic, expect a configured window and dst at run-time. + if (src0->is_dynamic() || src1->is_dynamic()) + { + return; + } + + auto shape_and_window = compute_output_shape_and_window(src0->tensor_shape(), src1->tensor_shape()); + auto_init_if_empty(*dst, shape_and_window.first, 1, src0->data_type()); + ICpuKernel::configure(shape_and_window.second); +} + +void CpuComparisonKernel::configure_common(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); + + const auto *uk = CpuComparisonKernel::get_implementation( + ElementwiseDataTypeISASelectorData{src0->data_type(), CPUInfo::get().get_isa(), static_cast<int>(_op)}); + + ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); + + _run_method = uk->ukernel; + _name = std::string("CpuComparisonKernel").append("/").append(uk->name); + + // If any of shapes is dynamic, expect a configured window and dst at run-time. 
+ if (src0->is_dynamic() || src1->is_dynamic()) + { + return; + } + + auto shape_and_window = compute_output_shape_and_window(src0->tensor_shape(), src1->tensor_shape()); + auto_init_if_empty(*dst, shape_and_window.first, 1, src0->data_type()); + ICpuKernel::configure(shape_and_window.second); +} + +template <class Derived> +void CpuElementwiseKernel<Derived>::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON(_run_method == nullptr); + + auto src0 = tensors.get_const_tensor(TensorType::ACL_SRC_0); + auto src1 = tensors.get_const_tensor(TensorType::ACL_SRC_1); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + _run_method(src0, src1, dst, window); +} +template void +CpuElementwiseKernel<CpuArithmeticKernel>::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info); +template void +CpuElementwiseKernel<CpuComparisonKernel>::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info); + +template <class Derived> +const char *CpuElementwiseKernel<Derived>::name() const +{ + return _name.c_str(); +} +template const char *CpuElementwiseKernel<CpuArithmeticKernel>::name() const; +template const char *CpuElementwiseKernel<CpuComparisonKernel>::name() const; + +/** Arithmetic operators (min, max, squared_diff) */ +void CpuArithmeticKernel::configure(ArithmeticOperation op, + const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst) +{ + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst)); + _op = op; + CpuArithmeticKernel::configure_common(src0, src1, dst); +} + +Status CpuArithmeticKernel::validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::S16, DataType::F16, DataType::S32, DataType::F32); + // Validate in case of configured dst + if (dst.total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &dst); + } + return validate_arguments_common(src0, src1, dst); +} + +Status CpuArithmeticKernel::validate(ArithmeticOperation op, + const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst) +{ + ARM_COMPUTE_UNUSED(op); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*src0, *src1, *dst)); + return Status{}; +} + +size_t CpuArithmeticKernel::get_mws(const CPUInfo &platform, size_t thread_count) const +{ + ARM_COMPUTE_UNUSED(thread_count); + +#if defined(ENABLE_FP32_KERNELS) + if (this->_run_method == &neon_fp32_elementwise_binary<ArithmeticOperation::MIN> || + this->_run_method == &neon_fp32_elementwise_binary<ArithmeticOperation::MAX>) + { + size_t mws = ICPPKernel::default_mws; + if (platform.get_cpu_model() == CPUModel::N1) + { + mws = default_min_max_mws_N1_fp32_neon; + } + else if (platform.get_cpu_model() == CPUModel::V1) + { + mws = default_min_max_mws_V1_fp32_neon; + } + else + { + return ICPPKernel::default_mws; + } + + // tensor is 1D or was re-interpreted as 1D + if (this->window().shape().num_dimensions() == 1) + { + return mws; + } + else + { + // scale mws down by the number of elements along all the dimensions (x, z, w, etc) except the one + // that we parallelize along (the y dimension). This allows for parallelization when the Y_SIZE is small + // but the other sizes are large, which boosts performance. 
+ mws = static_cast<size_t>(mws / (this->window().num_iterations_total() / this->window().num_iterations(1))); + return std::max(static_cast<size_t>(1), mws); + } + } +#else /* ENABLE_FP32_KERNELS */ + ARM_COMPUTE_UNUSED(platform); +#endif /* ENABLE_FP32_KERNELS */ + return ICPPKernel::default_mws; +} + +/** The division operator */ + +void CpuDivisionKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) +{ + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst)); + _op = ArithmeticOperation::DIV; + CpuArithmeticKernel::configure_common(src0, src1, dst); +} + +size_t CpuDivisionKernel::get_mws(const CPUInfo &platform, size_t thread_count) const +{ + ARM_COMPUTE_UNUSED(thread_count); + +#if defined(ENABLE_FP32_KERNELS) + if (this->_run_method == &neon_fp32_elementwise_binary<ArithmeticOperation::DIV>) + { + size_t mws = ICPPKernel::default_mws; + if (platform.get_cpu_model() == CPUModel::N1) + { + mws = default_div_mws_N1_fp32_neon; + } + else if (platform.get_cpu_model() == CPUModel::V1) + { + mws = default_div_mws_V1_fp32_neon; + } + else + { + return ICPPKernel::default_mws; + } + + // tensor is 1D or was re-interpreted as 1D + if (this->window().shape().num_dimensions() == 1) + { + return mws; + } + else + { + // scale mws down by the number of elements along all the dimensions (x, z, w, etc) except the one + // that we parallelize along (the y dimension). This allows for parallelization when the Y_SIZE is small + // but the other sizes are large, which boosts performance. + mws = static_cast<size_t>(mws / (this->window().num_iterations_total() / this->window().num_iterations(1))); + return std::max(static_cast<size_t>(1), mws); + } + } +#else /* ENABLE_FP32_KERNELS */ + ARM_COMPUTE_UNUSED(platform); +#endif /* ENABLE_FP32_KERNELS */ + return ICPPKernel::default_mws; +} + +Status CpuDivisionKernel::validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::S32, DataType::F16, DataType::F32); + return CpuArithmeticKernel::validate_arguments(src0, src1, dst); +} + +Status CpuDivisionKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*src0, *src1, *dst)); + return Status{}; +} + +/** The power operator */ +void CpuPowerKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) +{ + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst)); + _op = ArithmeticOperation::POWER; + CpuArithmeticKernel::configure_common(src0, src1, dst); +} + +Status CpuPowerKernel::validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::F16, DataType::F32); + return CpuArithmeticKernel::validate_arguments(src0, src1, dst); +} + +Status CpuPowerKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*src0, *src1, *dst)); + return Status{}; +} + +/** Comparison operators (equal, not equal, less than, greater than, less than or equal, greater than or equal) */ +void CpuComparisonKernel::configure(ComparisonOperation op, + const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst) +{ + 
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst)); + _op = op; + CpuComparisonKernel::configure_common(src0, src1, dst); +} + +Status CpuComparisonKernel::validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::U8, DataType::QASYMM8, + DataType::QASYMM8_SIGNED, DataType::S16, DataType::F16, + DataType::S32, DataType::F32); + // Validate in case of configured dst + if (dst.total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&dst, 1, DataType::U8); + } + return validate_arguments_common(src0, src1, dst); +} + +Status CpuComparisonKernel::validate(ComparisonOperation op, + const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst) +{ + ARM_COMPUTE_UNUSED(op); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*src0, *src1, *dst)); + return Status{}; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuElementwiseKernel.h b/src/cpu/kernels/CpuElementwiseKernel.h new file mode 100644 index 0000000000..1f3e613b80 --- /dev/null +++ b/src/cpu/kernels/CpuElementwiseKernel.h @@ -0,0 +1,232 @@ +/* + * Copyright (c) 2021-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ARM_COMPUTE_CPU_ELEMENTWISE_KERNEL_H +#define ARM_COMPUTE_CPU_ELEMENTWISE_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Interface for an element-wise operation kernel + * + * Element-wise operation is computed by: + * @f[ dst(x,y) = OP(src0(x,y), src1(x,y))@f] + * + */ +template <class Derived> +class CpuElementwiseKernel : public ICpuKernel<Derived> +{ +private: + using ElementwiseKernelPtr = + std::add_pointer<void(const ITensor *, const ITensor *, ITensor *, const Window &)>::type; + +public: + CpuElementwiseKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuElementwiseKernel); + + using ElementwiseFunction = void(const ITensor *, const ITensor *, ITensor *, const Window &); + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + + const char *name() const override; + + struct ElementwiseKernel + { + const char *name; + const ElementwiseDataTypeISASelectorPtr is_selected; + ElementwiseKernelPtr ukernel; + }; + +protected: + /** Validate the argument passed to the kernel + * + * @param[in] src0 First tensor input. Data types supported: QASYMM8/S16/F16/S32/F32. + * @param[in] src1 Second tensor input. Data types supported: Same as @p src0. + * @param[in] dst Output tensor. Data types supported: Dependent on subclass. + */ + static Status validate_arguments_common(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst); + +protected: + ElementwiseKernelPtr _run_method{nullptr}; + std::string _name{}; +}; + +class CpuArithmeticKernel : public CpuElementwiseKernel<CpuArithmeticKernel> +{ +public: + CpuArithmeticKernel() = default; + + /** Configure kernel + * + * @param[in] op Arithmetic operation to be executed. + * @param[in] src0 First tensor input info. Data types supported: QASYMM8/S16/F16/S32/F32. + * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0. + * @param[out] dst Output tensor info. Data types supported: Same as @p src0. + */ + void configure(ArithmeticOperation op, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst); + + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuArithmeticKernel::configure() + * + * @return a status + */ + static Status + validate(ArithmeticOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst); + + static const std::vector<CpuElementwiseKernel<CpuArithmeticKernel>::ElementwiseKernel> &get_available_kernels(); + + /** Return minimum workload size of the relevant kernel + * + * @param[in] platform The CPU platform used to create the context. + * @param[in] thread_count Number of threads in the execution. + * + * @return[out] mws Minimum workload size for requested configuration. + */ + size_t get_mws(const CPUInfo &platform, size_t thread_count) const override; + +protected: + /** Commmon configure function for element-wise operators with no additional options (e.g. 
Min, Max, SquaredDiff) + */ + void configure_common(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst); + // Inherited methods overridden: + static Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst); + + ArithmeticOperation _op{}; +}; + +class CpuDivisionKernel : public CpuArithmeticKernel +{ +public: + CpuDivisionKernel() = default; + + /** Configure kernel + * + * @param[in] src0 First tensor input info. Data types supported: S32/F16/F32. + * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0. + * @param[out] dst Output tensor info. Data types supported: Same as @p src0. + */ + void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst); + + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuDivisionKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst); + + /** Return minimum workload size of the relevant kernel + * + * @param[in] platform The CPU platform used to create the context. + * @param[in] thread_count Number of threads in the execution. + * + * @return[out] mws Minimum workload size for requested configuration. + */ + size_t get_mws(const CPUInfo &platform, size_t thread_count) const override; + +protected: + // Inherited methods overridden: + static Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst); +}; + +class CpuPowerKernel : public CpuArithmeticKernel +{ +public: + CpuPowerKernel() = default; + + /** Configure kernel + * + * @param[in] src0 First tensor input info. Data types supported: F16/F32. + * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0. + * @param[out] dst Output tensor info. Data types supported: Same as @p src0. + */ + void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst); + + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuPowerKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst); + +protected: + // Inherited methods overridden: + static Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst); +}; + +class CpuComparisonKernel : public CpuElementwiseKernel<CpuComparisonKernel> +{ +public: + CpuComparisonKernel() = default; + + /** Configure kernel + * + * @param[in] op Comparison operation to be executed. + * @param[in] src0 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. + * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0. + * @param[out] dst Output tensor info. Data types supported: U8. + */ + void configure(ComparisonOperation op, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst); + + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuComparisonKernel::configure() + * + * @return a status + */ + static Status + validate(ComparisonOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst); + + static const std::vector<CpuElementwiseKernel<CpuComparisonKernel>::ElementwiseKernel> &get_available_kernels(); + +protected: + /** Commmon configure function for element-wise operators with no additional options (e.g. 
Min, Max, SquaredDiff) + */ + void configure_common(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst); + // Inherited methods overridden: + static Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst); + +private: + /** Function to get the micro kernel implementation + * + * @param[in] src0 First input tensor information + * @param[in] src1 Second input tensor information + * @param[in] dst Output tensor information + * + * @return the function instance for the micro kernel + */ + + ComparisonOperation _op{}; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_KERNEL_H */ diff --git a/src/cpu/kernels/CpuElementwiseUnaryKernel.cpp b/src/cpu/kernels/CpuElementwiseUnaryKernel.cpp new file mode 100644 index 0000000000..88545ee756 --- /dev/null +++ b/src/cpu/kernels/CpuElementwiseUnaryKernel.cpp @@ -0,0 +1,270 @@ +/* + * Copyright (c) 2018-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuElementwiseUnaryKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" + +#include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/cpu/kernels/elementwise_unary/list.h" +#include "support/ToolchainSupport.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +#ifdef __aarch64__ + +std::unique_ptr<uint8_t[]> q8_prepare_lut(ElementWiseUnary op, const ITensorInfo *src, const ITensorInfo *dst) +{ + ARM_COMPUTE_ERROR_ON(src->data_type() != dst->data_type()); + ARM_COMPUTE_ERROR_ON(!is_data_type_quantized(src->data_type())); + ARM_COMPUTE_ERROR_ON(src->element_size() != 1); + + auto lut = std::unique_ptr<uint8_t[]>(new uint8_t[256]); + const auto is_signed = src->data_type() == DataType::QASYMM8_SIGNED; + const auto src_qi = src->quantization_info().uniform(); + const auto dst_qi = dst->quantization_info().uniform(); + + const auto dst_min_fp = (((is_signed) ? -128 : 0) - dst_qi.offset) * dst_qi.scale; + const auto dst_max_fp = (((is_signed) ? 127 : 255) - dst_qi.offset) * dst_qi.scale; + + for (int i = 0; i < 256; ++i) + { + const auto in = + (is_signed) ? 
dequantize_qasymm8_signed(static_cast<int8_t>(i), src_qi) : dequantize_qasymm8(i, src_qi); + float result = 0; + + switch (op) + { + case ElementWiseUnary::RSQRT: + result = 1 / sqrt(in); + break; + + case ElementWiseUnary::EXP: + result = std::exp(in); + break; + + case ElementWiseUnary::NEG: + result = -in; + break; + + case ElementWiseUnary::LOG: + result = std::log(in); + break; + + case ElementWiseUnary::ABS: + result = std::abs(in); + break; + + case ElementWiseUnary::ROUND: + result = support::cpp11::nearbyint(in); + break; + + case ElementWiseUnary::SIN: + result = std::sin(in); + break; + + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } + + result = utility::clamp(result, dst_min_fp, dst_max_fp); + + const auto out = (is_signed) ? static_cast<uint8_t>(quantize_qasymm8_signed(result, dst_qi)) + : quantize_qasymm8(result, dst_qi); + lut[i] = out; + } + + return lut; +} + +#endif // __aarch64__ + +static const std::vector<CpuElementwiseUnaryKernel::ElementwiseUnaryKernel> available_kernels = { + { + "sve_fp32_elementwise_unary", + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32 && data.isa.sve); }, + REGISTER_FP32_SVE(sve_fp32_elementwise_unary), + nullptr, + }, + { + "sve_fp16_elementwise_unary", + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16 && data.isa.sve && data.isa.fp16); }, + REGISTER_FP16_SVE(sve_fp16_elementwise_unary), + nullptr, + }, + { + "sve_s32_elementwise_unary", + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::S32 && data.isa.sve); }, + REGISTER_INTEGER_SVE(sve_s32_elementwise_unary), + nullptr, + }, + { + "neon_fp32_elementwise_unary", + [](const DataTypeISASelectorData &data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(neon_fp32_elementwise_unary), + nullptr, + }, + { + "neon_fp16_elementwise_unary", + [](const DataTypeISASelectorData &data) { return data.dt == DataType::F16 && data.isa.fp16; }, + REGISTER_FP16_NEON(neon_fp16_elementwise_unary), + nullptr, + }, + { + "neon_s32_elementwise_unary", + [](const DataTypeISASelectorData &data) { return data.dt == DataType::S32; }, + REGISTER_INTEGER_NEON(neon_s32_elementwise_unary), + nullptr, + }, +#ifdef __aarch64__ + { + "sve2_q8_elementwise_unary", + [](const DataTypeISASelectorData &data) + { return (data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve2; }, + REGISTER_QASYMM8_SVE2(sve2_q8_elementwise_unary), + &q8_prepare_lut, + }, + { + "neon_q8_elementwise_unary", + [](const DataTypeISASelectorData &data) + { return data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED; }, + REGISTER_QASYMM8_NEON(neon_q8_elementwise_unary), + &q8_prepare_lut, + }, +#else // __aarch64__ + { + "neon_qasymm8_signed_elementwise_unary", + [](const DataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8_SIGNED; }, + REGISTER_QASYMM8_SIGNED_NEON(neon_qasymm8_signed_elementwise_unary), + nullptr, + }, + { + "neon_qasymm8_elementwise_unary", + [](const DataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8; }, + REGISTER_QASYMM8_NEON(neon_qasymm8_elementwise_unary), + nullptr, + }, +#endif // __aarch64__ +}; + +} // namespace + +void CpuElementwiseUnaryKernel::configure(ElementWiseUnary op, const ITensorInfo &src, ITensorInfo &dst) +{ + ARM_COMPUTE_ERROR_THROW_ON(validate(op, src, dst)); + const auto uk = CpuElementwiseUnaryKernel::get_implementation( + DataTypeISASelectorData{src.data_type(), CPUInfo::get().get_isa()}); + ARM_COMPUTE_ERROR_ON(uk == nullptr || 
uk->ukernel == nullptr); + + _op = op; + _run_method = uk->ukernel; + _name = std::string("CpuElementwiseUnaryKernel").append("/").append(uk->name); + + // If input shape is dynamic, expect a configured window and dst at run-time. + if (src.is_dynamic()) + { + return; + } + + if (uk->prepare_func != nullptr) + { + _lut = uk->prepare_func(op, &src, &dst); + } + + auto shape_and_window = compute_output_shape_and_window(src.tensor_shape()); + auto_init_if_empty(dst, shape_and_window.first, 1, src.data_type()); + ICpuKernel::configure(shape_and_window.second); +} + +Status CpuElementwiseUnaryKernel::validate(ElementWiseUnary op, const ITensorInfo &src, const ITensorInfo &dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src); + + const auto *uk = CpuElementwiseUnaryKernel::get_implementation( + DataTypeISASelectorData{src.data_type(), CPUInfo::get().get_isa()}); + + ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); + + switch (op) + { + case ElementWiseUnary::EXP: + case ElementWiseUnary::RSQRT: + case ElementWiseUnary::LOG: + case ElementWiseUnary::ROUND: + case ElementWiseUnary::SIN: + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32, + DataType::QASYMM8, DataType::QASYMM8_SIGNED); + break; + case ElementWiseUnary::NEG: + case ElementWiseUnary::ABS: + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32, DataType::S32, + DataType::QASYMM8, DataType::QASYMM8_SIGNED); + break; + default: + ARM_COMPUTE_ERROR("ElementWiseUnary operation not supported"); + } + // Validate in case of configured dst + if (dst.total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst); + } + + return Status{}; +} + +void CpuElementwiseUnaryKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + + auto src = tensors.get_const_tensor(TensorType::ACL_SRC); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + _run_method(src, dst, window, _op, _lut.get()); +} + +const char *CpuElementwiseUnaryKernel::name() const +{ + return _name.c_str(); +} + +const std::vector<CpuElementwiseUnaryKernel::ElementwiseUnaryKernel> &CpuElementwiseUnaryKernel::get_available_kernels() +{ + return available_kernels; +} + +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuElementwiseUnaryKernel.h b/src/cpu/kernels/CpuElementwiseUnaryKernel.h new file mode 100644 index 0000000000..249909854e --- /dev/null +++ b/src/cpu/kernels/CpuElementwiseUnaryKernel.h @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2018-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_KERNEL_H +#define ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_KERNEL_H + +#include "arm_compute/core/Types.h" + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Interface for an element-wise unary operation kernel + * + * Element-wise operation is computed by: + * @f[ dst(x) = OP(src(x))@f] + */ +class CpuElementwiseUnaryKernel : public ICpuKernel<CpuElementwiseUnaryKernel> +{ +private: + using ElementwiseUnaryUkernelPtr = + std::add_pointer<void(const ITensor *, ITensor *, const Window &, ElementWiseUnary, const uint8_t *)>::type; + using ElementwiseUnaryPreparePtr = std::add_pointer<std::unique_ptr<uint8_t[]>( + ElementWiseUnary op, const ITensorInfo *, const ITensorInfo *)>::type; + +public: + CpuElementwiseUnaryKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuElementwiseUnaryKernel); + + /** Function to configure the @ref CpuElementwiseUnaryKernel + * + * @param[in] op Arithmetic operation to be executed. + * @param[in] src First tensor input. Data types supported: F16/F32, F16/F32/S32 for NEG/ABS operations. + * @param[out] dst Output tensor. Data types supported: Same as @p src. + */ + void configure(ElementWiseUnary op, const ITensorInfo &src, ITensorInfo &dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuElementwiseUnaryKernel::configure() + * + * @return a status + */ + static Status validate(ElementWiseUnary op, const ITensorInfo &src, const ITensorInfo &dst); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + + struct ElementwiseUnaryKernel + { + const char *name; + const DataTypeISASelectorPtr is_selected; + ElementwiseUnaryUkernelPtr ukernel; + ElementwiseUnaryPreparePtr prepare_func; + }; + + static const std::vector<ElementwiseUnaryKernel> &get_available_kernels(); + +private: + ElementWiseUnary _op{}; + ElementwiseUnaryUkernelPtr _run_method{nullptr}; + std::string _name{}; + std::unique_ptr<uint8_t[]> _lut{}; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_KERNEL_H */ diff --git a/src/cpu/kernels/CpuFillKernel.cpp b/src/cpu/kernels/CpuFillKernel.cpp new file mode 100644 index 0000000000..754da97ae1 --- /dev/null +++ b/src/cpu/kernels/CpuFillKernel.cpp @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuFillKernel.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +void CpuFillKernel::configure(const ITensorInfo *tensor, const PixelValue &constant_value) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(tensor); + _constant_value = constant_value; + + // Configure kernel window + Window win = calculate_max_window(*tensor, Steps()); + ICpuKernel::configure(win); +} + +void CpuFillKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + + auto inout = tensors.get_tensor(TensorType::ACL_SRC_DST); + + // Collapse all the batches on the third dimension + bool has_collapsed = true; + Window collapsed = window.collapse_if_possible(window, Window::DimZ, &has_collapsed); + ARM_COMPUTE_ERROR_ON(!has_collapsed); + + uint8_t *const start_valid_region = inout->ptr_to_element(inout->info()->valid_region().anchor); + const auto window_width = static_cast<int>(collapsed.x().end()) - static_cast<int>(collapsed.x().start()); + const size_t element_size = inout->info()->element_size(); + + // Unroll X dimension + collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator tensor_it(inout, collapsed); + execute_window_loop( + collapsed, + [&](const Coordinates &) + { + uint8_t *base_addr = start_valid_region + tensor_it.offset(); + // Set memory + for (int i = 0; i < window_width; ++i) + { + std::memcpy(base_addr + i * element_size, &_constant_value.value, element_size); + } + }, + tensor_it); +} + +const char *CpuFillKernel::name() const +{ + return "CpuFillKernel"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuFillKernel.h b/src/cpu/kernels/CpuFillKernel.h new file mode 100644 index 0000000000..7c200c9b59 --- /dev/null +++ b/src/cpu/kernels/CpuFillKernel.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2018-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_FILL_KERNEL_H +#define ARM_COMPUTE_CPU_FILL_KERNEL_H + +#include "arm_compute/core/PixelValue.h" + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Kernel for filling a tensor with a given constant value */ +class CpuFillKernel : public ICpuKernel<CpuFillKernel> +{ +public: + CpuFillKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuFillKernel); + /** Configure kernel for a given list of arguments + * + * @param[in,out] tensor Tensor to fill. Supported data types: All + * @param[in] constant_value The value used to fill the planes of the tensor + */ + void configure(const ITensorInfo *tensor, const PixelValue &constant_value); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + +private: + PixelValue _constant_value{}; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_FILL_KERNEL_H */ diff --git a/src/cpu/kernels/CpuFloorKernel.cpp b/src/cpu/kernels/CpuFloorKernel.cpp new file mode 100644 index 0000000000..df7e6aad46 --- /dev/null +++ b/src/cpu/kernels/CpuFloorKernel.cpp @@ -0,0 +1,141 @@ +/* + * Copyright (c) 2017-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/cpu/kernels/CpuFloorKernel.h" + +#include "arm_compute/core/Coordinates.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Validate.h" + +#include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/cpu/kernels/floor/list.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +static const std::vector<CpuFloorKernel::FloorKernel> available_kernels = { + {"neon_fp16_floor", [](const DataTypeISASelectorData &data) { return data.dt == DataType::F16 && data.isa.fp16; }, + REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_floor)}, + {"neon_fp32_floor", [](const DataTypeISASelectorData &data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_floor)}}; + +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + + const auto *uk = + CpuFloorKernel::get_implementation(DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()}); + ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); + + // Validate in case of configured output + if (dst->total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); + } + + return Status{}; +} +} // namespace + +void CpuFloorKernel::configure(const ITensorInfo *src, ITensorInfo *dst) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst)); + + auto_init_if_empty(*dst, src->tensor_shape(), 1, src->data_type()); + + const auto *uk = + CpuFloorKernel::get_implementation(DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()}); + ARM_COMPUTE_ERROR_ON_NULLPTR(uk); + + _run_method = uk->ukernel; + _name = std::string("CpuFloorKernel").append("/").append(uk->name); + + // Configure kernel window + const Window win = calculate_max_window(*src, Steps()); + + ICPPKernel::configure(win); +} + +Window CpuFloorKernel::infer_window(const ITensorInfo *src, const ITensorInfo *dst) +{ + ARM_COMPUTE_UNUSED(dst); + ARM_COMPUTE_ERROR_ON(!bool(validate_arguments(src, dst))); + + Window win; + win.use_tensor_dimensions(src->tensor_shape()); + return win; +} + +Status CpuFloorKernel::validate(const ITensorInfo *input, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output)); + return Status{}; +} + +void CpuFloorKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + ARM_COMPUTE_ERROR_ON(tensors.empty()); + ARM_COMPUTE_ERROR_ON(_run_method == nullptr); + + const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC); + ITensor *dst = tensors.get_tensor(TensorType::ACL_DST); + const auto len = static_cast<int>(window.x().end()) - static_cast<int>(window.x().start()); + + Window win{window}; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator src_it(src, win); + Iterator dst_it(dst, win); + + execute_window_loop( + win, [&](const Coordinates &) { _run_method(src_it.ptr(), dst_it.ptr(), len); }, src_it, dst_it); +} + +const char *CpuFloorKernel::name() const +{ + return _name.c_str(); +} + +const std::vector<CpuFloorKernel::FloorKernel> 
&CpuFloorKernel::get_available_kernels() +{ + return available_kernels; +} + +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuFloorKernel.h b/src/cpu/kernels/CpuFloorKernel.h new file mode 100644 index 0000000000..57107d0532 --- /dev/null +++ b/src/cpu/kernels/CpuFloorKernel.h @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2017-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_FLOOR_KERNEL_H +#define ARM_COMPUTE_CPU_FLOOR_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Cpu accelarated kernel to perform a floor operation */ +class CpuFloorKernel : public ICpuKernel<CpuFloorKernel> +{ +private: + using FloorKernelPtr = std::add_pointer<void(const void *, void *, int)>::type; + +public: + CpuFloorKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuFloorKernel); + /** Configure kernel for a given list of arguments + * + * @param[in] src Source tensor. Data type supported: F16/F32. + * @param[out] dst Destination tensor. Same as @p src + */ + void configure(const ITensorInfo *src, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuFloorKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst); + /** Infer execution window + * + * @param[in] src Source tensor info. Data type supported: F16/F32. + * @param[in] dst Destination tensor info. 
Same as @p src + * + * @return an execution Window + */ + Window infer_window(const ITensorInfo *src, const ITensorInfo *dst); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + + struct FloorKernel + { + const char *name; + const DataTypeISASelectorPtr is_selected; + FloorKernelPtr ukernel; + }; + + static const std::vector<FloorKernel> &get_available_kernels(); + +private: + FloorKernelPtr _run_method{nullptr}; + std::string _name{}; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_FLOOR_KERNEL_H */ diff --git a/src/cpu/kernels/CpuGemmInterleave4x4Kernel.cpp b/src/cpu/kernels/CpuGemmInterleave4x4Kernel.cpp new file mode 100644 index 0000000000..db433c99a8 --- /dev/null +++ b/src/cpu/kernels/CpuGemmInterleave4x4Kernel.cpp @@ -0,0 +1,159 @@ +/* + * Copyright (c) 2016-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuGemmInterleave4x4Kernel.h" + +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +#include <arm_neon.h> + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +using namespace arm_compute::misc::shape_calculator; + +void CpuGemmInterleave4x4Kernel::configure(const ITensorInfo *src, ITensorInfo *dst) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + + // dst auto inizialitation if not yet initialized + auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_interleaved_shape(*src))); + + // Perform validate step + ARM_COMPUTE_ERROR_THROW_ON(CpuGemmInterleave4x4Kernel::validate(src, dst)); + + Window win = calculate_max_window(*src, Steps(1, 4)); + ICPPKernel::configure(win); +} + +Status CpuGemmInterleave4x4Kernel::validate(const ITensorInfo *src, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions. 
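    // The interleave itself is a byte-wise memcpy driven by element_size(), so any known data type is
    // acceptable here; the checks below only reject DataType::UNKNOWN and, when dst is already
    // configured, compare it against the interleaved shape. As a rough sketch (assuming
    // compute_interleaved_shape() keeps the 4x4 block behaviour documented for this kernel), a src of
    // shape (W, H) maps to a dst of shape (W * 4, ceil(H / 4)), e.g. (6, 10) -> (24, 3).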
+ ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); + + if (dst->total_size() != 0) + { + const TensorShape dst_shape = compute_interleaved_shape(*src); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); + } + + return Status{}; +} + +void CpuGemmInterleave4x4Kernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + ARM_COMPUTE_ERROR_ON(tensors.empty()); + /* + * This kernel puts the values in a 4x4 block of Matrix A on the same row (Interleaved values) + * |a00 a01 a02 a03| + * |a10 a11 a12 a13| + * |a20 a21 a22 a23| = | a00 a10 a20 a30 || a01 a11 a21 a31 || a02 a12 a22 a32 || a03 a13 a23 a33 | + * |a30 a31 a32 a33| + * + * After this operation, the dst matrix will have the following shape: [ height * 4, ceil(width / 4.0f) ] + */ + const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC); + ITensor *dst = tensors.get_tensor(TensorType::ACL_DST); + + const size_t window_start_x = window.x().start(); + const size_t window_end_x = window.x().end(); + + const size_t in_height = src->info()->dimension(1); + const size_t in_stride = src->info()->strides_in_bytes()[1]; + + const size_t partial_y = in_height % 4; + + const size_t element_size = src->info()->element_size(); + + // Set window for the src tensor + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + // Set window for the dst tensor + Window win_out(window); + win_out.set(Window::DimX, Window::Dimension(0, 1, 1)); + win_out.scale(Window::DimY, 0.25f); + + Iterator in(src, win); + Iterator out(dst, win_out); + + execute_window_loop( + win, + [&](const Coordinates &id) + { + if (id.y() + 4 <= static_cast<int>(in_height)) + { + for (size_t x = window_start_x; x < window_end_x; ++x) + { + std::memcpy(out.ptr() + (x * 4 + 0) * element_size, (in.ptr() + 0 * in_stride) + x * element_size, + element_size); + std::memcpy(out.ptr() + (x * 4 + 1) * element_size, (in.ptr() + 1 * in_stride) + x * element_size, + element_size); + std::memcpy(out.ptr() + (x * 4 + 2) * element_size, (in.ptr() + 2 * in_stride) + x * element_size, + element_size); + std::memcpy(out.ptr() + (x * 4 + 3) * element_size, (in.ptr() + 3 * in_stride) + x * element_size, + element_size); + } + } + else + { + for (size_t x = window_start_x; x < window_end_x; ++x) + { + size_t y = 0; + for (; y < partial_y; ++y) + { + std::memcpy(out.ptr() + (x * 4 + y) * element_size, + (in.ptr() + y * in_stride) + x * element_size, element_size); + } + for (; y < 4; ++y) + { + std::memset(out.ptr() + (x * 4 + y) * element_size, 0, element_size); + } + } + } + }, + in, out); +} + +const char *CpuGemmInterleave4x4Kernel::name() const +{ + return "CpuGemmInterleave4x4Kernel"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuGemmInterleave4x4Kernel.h b/src/cpu/kernels/CpuGemmInterleave4x4Kernel.h new file mode 100644 index 0000000000..2ce34bc4bc --- /dev/null +++ b/src/cpu/kernels/CpuGemmInterleave4x4Kernel.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2016-2022 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_GEMM_INTERLEAVE4x4_KERNEL_H +#define ARM_COMPUTE_CPU_GEMM_INTERLEAVE4x4_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Kernel to interleave the elements of a matrix + * + * This function puts the values in a 4x4 block of Matrix A on the same row (Interleaved values) + * + * @f[ + * \left( \begin{array}{cccc} + * a00 & a01 & a02 & a03 \\ + * a10 & a11 & a12 & a13 \\ + * a20 & a21 & a22 & a23 \\ + * a30 & a31 & a32 & a33 \\ + * \end{array} \right) + * \rightarrow + * \left( \begin{array}{ccccccccccccccccc} + * a00 & a10 & a20 & a30 & a01 & a11 & a21 & a31 & a02 & a12 & a22 & a32 & a03 & a13 & a23 & a33 \\ + * \end{array} \right) + * @f] + * + * After this operation, the dst matrix will have the following shape: [ height * 4, ceil(width / 4.0f) ] + */ +class CpuGemmInterleave4x4Kernel : public ICpuKernel<CpuGemmInterleave4x4Kernel> +{ +public: + CpuGemmInterleave4x4Kernel() = default; + /** Initialise the kernel's src and dst. + * + * @param[in] src Input tensor info. Data types supported: All + * @param[out] dst Output tensor info which stores the interleaved matrix. Data type supported: same as @p src. + */ + void configure(const ITensorInfo *src, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration of @ref CpuGemmInterleave4x4Kernel + * + * Similar to @ref CpuGemmInterleave4x4Kernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_GEMM_INTERLEAVE4x4_KERNEL_H */ diff --git a/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.cpp b/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.cpp new file mode 100644 index 0000000000..a3ed2cd171 --- /dev/null +++ b/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.cpp @@ -0,0 +1,877 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +#include <arm_neon.h> + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +void inline vector_matrix_multiply_u8(Iterator &ina, + Iterator &inb, + Iterator &out, + int width_a, + int width_b, + int width_out, + size_t stride_b, + const Window &window) +{ + execute_window_loop( + window, + [&](const Coordinates &id) + { + if (id.x() > width_b) + { + return; + } + + // Note: Since the input are all positives, we can use uint32_t + // Accumulators for the block 0 + uint32x4x4_t c0 = {{vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0)}}; + + auto vec_a = reinterpret_cast<const uint8_t *>(ina.ptr()); + auto matrix_b = reinterpret_cast<const uint8_t *>(inb.ptr()); + auto vec_a_end_addr = vec_a + width_a; + + // This for loop performs 8 accumulations + for (; vec_a <= (vec_a_end_addr - 8);) + { + const uint8x8_t a00_u8 = vld1_u8(vec_a); + const uint8x16_t b00_u8 = vld1q_u8(matrix_b + 0 * stride_b); + const uint8x16_t b10_u8 = vld1q_u8(matrix_b + 1 * stride_b); + const uint8x16_t b20_u8 = vld1q_u8(matrix_b + 2 * stride_b); + const uint8x16_t b30_u8 = vld1q_u8(matrix_b + 3 * stride_b); + const uint8x16_t b40_u8 = vld1q_u8(matrix_b + 4 * stride_b); + const uint8x16_t b50_u8 = vld1q_u8(matrix_b + 5 * stride_b); + const uint8x16_t b60_u8 = vld1q_u8(matrix_b + 6 * stride_b); + const uint8x16_t b70_u8 = vld1q_u8(matrix_b + 7 * stride_b); + + // Convert a00_u8 to uint16_t and get the lower part + const uint16x4x2_t a00_u16 = {{vget_low_u16(vmovl_u8(a00_u8)), vget_high_u16(vmovl_u8(a00_u8))}}; + + const uint16x4x4_t b00_u16 = { + {vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b00_u8)))}}; + + const uint16x4x4_t b10_u16 = { + {vget_low_u16(vmovl_u8(vget_low_u8(b10_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b10_u8))), + 
vget_low_u16(vmovl_u8(vget_high_u8(b10_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b10_u8)))}}; + + const uint16x4x4_t b20_u16 = { + {vget_low_u16(vmovl_u8(vget_low_u8(b20_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b20_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b20_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b20_u8)))}}; + + const uint16x4x4_t b30_u16 = { + {vget_low_u16(vmovl_u8(vget_low_u8(b30_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b30_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b30_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b30_u8)))}}; + + const uint16x4x4_t b40_u16 = { + {vget_low_u16(vmovl_u8(vget_low_u8(b40_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b40_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b40_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b40_u8)))}}; + + const uint16x4x4_t b50_u16 = { + {vget_low_u16(vmovl_u8(vget_low_u8(b50_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b50_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b50_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b50_u8)))}}; + + const uint16x4x4_t b60_u16 = { + {vget_low_u16(vmovl_u8(vget_low_u8(b60_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b60_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b60_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b60_u8)))}}; + + const uint16x4x4_t b70_u16 = { + {vget_low_u16(vmovl_u8(vget_low_u8(b70_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b70_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b70_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b70_u8)))}}; + + // Accumulate 0: + c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16.val[0], 0); + c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16.val[0], 0); + c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16.val[0], 0); + c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16.val[0], 0); + + // Accumulate 1: + c0.val[0] = vmlal_lane_u16(c0.val[0], b10_u16.val[0], a00_u16.val[0], 1); + c0.val[1] = vmlal_lane_u16(c0.val[1], b10_u16.val[1], a00_u16.val[0], 1); + c0.val[2] = vmlal_lane_u16(c0.val[2], b10_u16.val[2], a00_u16.val[0], 1); + c0.val[3] = vmlal_lane_u16(c0.val[3], b10_u16.val[3], a00_u16.val[0], 1); + + // Accumulate 2: + c0.val[0] = vmlal_lane_u16(c0.val[0], b20_u16.val[0], a00_u16.val[0], 2); + c0.val[1] = vmlal_lane_u16(c0.val[1], b20_u16.val[1], a00_u16.val[0], 2); + c0.val[2] = vmlal_lane_u16(c0.val[2], b20_u16.val[2], a00_u16.val[0], 2); + c0.val[3] = vmlal_lane_u16(c0.val[3], b20_u16.val[3], a00_u16.val[0], 2); + + // Accumulate 3: + c0.val[0] = vmlal_lane_u16(c0.val[0], b30_u16.val[0], a00_u16.val[0], 3); + c0.val[1] = vmlal_lane_u16(c0.val[1], b30_u16.val[1], a00_u16.val[0], 3); + c0.val[2] = vmlal_lane_u16(c0.val[2], b30_u16.val[2], a00_u16.val[0], 3); + c0.val[3] = vmlal_lane_u16(c0.val[3], b30_u16.val[3], a00_u16.val[0], 3); + + // Accumulate 4: + c0.val[0] = vmlal_lane_u16(c0.val[0], b40_u16.val[0], a00_u16.val[1], 0); + c0.val[1] = vmlal_lane_u16(c0.val[1], b40_u16.val[1], a00_u16.val[1], 0); + c0.val[2] = vmlal_lane_u16(c0.val[2], b40_u16.val[2], a00_u16.val[1], 0); + c0.val[3] = vmlal_lane_u16(c0.val[3], b40_u16.val[3], a00_u16.val[1], 0); + + // Accumulate 5: + c0.val[0] = vmlal_lane_u16(c0.val[0], b50_u16.val[0], a00_u16.val[1], 1); + c0.val[1] = vmlal_lane_u16(c0.val[1], b50_u16.val[1], a00_u16.val[1], 1); + c0.val[2] = vmlal_lane_u16(c0.val[2], b50_u16.val[2], a00_u16.val[1], 1); + c0.val[3] = vmlal_lane_u16(c0.val[3], b50_u16.val[3], a00_u16.val[1], 1); + + // Accumulate 6: + c0.val[0] = vmlal_lane_u16(c0.val[0], b60_u16.val[0], a00_u16.val[1], 2); + c0.val[1] = vmlal_lane_u16(c0.val[1], b60_u16.val[1], 
a00_u16.val[1], 2); + c0.val[2] = vmlal_lane_u16(c0.val[2], b60_u16.val[2], a00_u16.val[1], 2); + c0.val[3] = vmlal_lane_u16(c0.val[3], b60_u16.val[3], a00_u16.val[1], 2); + + // Accumulate 7: + c0.val[0] = vmlal_lane_u16(c0.val[0], b70_u16.val[0], a00_u16.val[1], 3); + c0.val[1] = vmlal_lane_u16(c0.val[1], b70_u16.val[1], a00_u16.val[1], 3); + c0.val[2] = vmlal_lane_u16(c0.val[2], b70_u16.val[2], a00_u16.val[1], 3); + c0.val[3] = vmlal_lane_u16(c0.val[3], b70_u16.val[3], a00_u16.val[1], 3); + + vec_a += 8; + matrix_b += 8 * stride_b; + } + + // This for loop performs the left-over accumulations + for (; vec_a < vec_a_end_addr;) + { + const uint8x8_t a00_u8 = vld1_dup_u8(vec_a); + const uint8x16_t b00_u8 = vld1q_u8(matrix_b); + + const uint16x4x4_t b00_u16 = { + {vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b00_u8)))}}; + + // Convert a00_u8 to uint16_t and get the lower part + const uint16x4_t a00_u16 = vget_low_u16(vmovl_u8(a00_u8)); + + // Accumulate 0: + c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16, 0); + c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16, 0); + c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16, 0); + c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16, 0); + + vec_a += 1; + matrix_b += stride_b; + } + + auto vec_out = reinterpret_cast<int32_t *>(out.ptr()); + if (id.x() < (width_out - 16)) + { + vst1q_s32(vec_out + 0, vreinterpretq_s32_u32(c0.val[0])); + vst1q_s32(vec_out + 4, vreinterpretq_s32_u32(c0.val[1])); + vst1q_s32(vec_out + 8, vreinterpretq_s32_u32(c0.val[2])); + vst1q_s32(vec_out + 12, vreinterpretq_s32_u32(c0.val[3])); + } + else + { + auto left_over = width_out - id.x(); + for (auto k = 0; k < 4 && left_over; ++k) + { + for (auto j = 0; j < 4 && left_over; ++j, --left_over) + { + *(vec_out + k * 4 + j) = c0.val[k][j]; + } + } + } + }, + ina, inb, out); +} + +void inline vector_matrix_multiply_s8(Iterator &ina, + Iterator &inb, + Iterator &out, + int width_a, + int width_b, + int width_out, + size_t stride_b, + const Window &window) +{ + execute_window_loop( + window, + [&](const Coordinates &id) + { + if (id.x() > width_b) + { + return; + } + + // Accumulators for the block 0 + int32x4x4_t c0 = {{vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0)}}; + + auto vec_a = reinterpret_cast<const int8_t *>(ina.ptr()); + auto matrix_b = reinterpret_cast<const int8_t *>(inb.ptr()); + auto vec_a_end_addr = vec_a + width_a; + + // This for loop performs 8 accumulations + for (; vec_a <= (vec_a_end_addr - 8);) + { + const int8x8_t a00_s8 = vld1_s8(vec_a); + const int8x16_t b00_s8 = vld1q_s8(matrix_b + 0 * stride_b); + const int8x16_t b10_s8 = vld1q_s8(matrix_b + 1 * stride_b); + const int8x16_t b20_s8 = vld1q_s8(matrix_b + 2 * stride_b); + const int8x16_t b30_s8 = vld1q_s8(matrix_b + 3 * stride_b); + const int8x16_t b40_s8 = vld1q_s8(matrix_b + 4 * stride_b); + const int8x16_t b50_s8 = vld1q_s8(matrix_b + 5 * stride_b); + const int8x16_t b60_s8 = vld1q_s8(matrix_b + 6 * stride_b); + const int8x16_t b70_s8 = vld1q_s8(matrix_b + 7 * stride_b); + + // Convert a00_s8 to int16_t and get the lower part + const int16x4x2_t a00_s16 = {{vget_low_s16(vmovl_s8(a00_s8)), vget_high_s16(vmovl_s8(a00_s8))}}; + + const int16x4x4_t b00_s16 = { + {vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))), 
vget_high_s16(vmovl_s8(vget_high_s8(b00_s8)))}}; + + const int16x4x4_t b10_s16 = { + {vget_low_s16(vmovl_s8(vget_low_s8(b10_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b10_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b10_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b10_s8)))}}; + + const int16x4x4_t b20_s16 = { + {vget_low_s16(vmovl_s8(vget_low_s8(b20_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b20_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b20_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b20_s8)))}}; + + const int16x4x4_t b30_s16 = { + {vget_low_s16(vmovl_s8(vget_low_s8(b30_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b30_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b30_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b30_s8)))}}; + + const int16x4x4_t b40_s16 = { + {vget_low_s16(vmovl_s8(vget_low_s8(b40_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b40_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b40_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b40_s8)))}}; + + const int16x4x4_t b50_s16 = { + {vget_low_s16(vmovl_s8(vget_low_s8(b50_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b50_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b50_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b50_s8)))}}; + + const int16x4x4_t b60_s16 = { + {vget_low_s16(vmovl_s8(vget_low_s8(b60_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b60_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b60_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b60_s8)))}}; + + const int16x4x4_t b70_s16 = { + {vget_low_s16(vmovl_s8(vget_low_s8(b70_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b70_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b70_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b70_s8)))}}; + + // Accumulate 0: + c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16.val[0], 0); + c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16.val[0], 0); + c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16.val[0], 0); + c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16.val[0], 0); + + // Accumulate 1: + c0.val[0] = vmlal_lane_s16(c0.val[0], b10_s16.val[0], a00_s16.val[0], 1); + c0.val[1] = vmlal_lane_s16(c0.val[1], b10_s16.val[1], a00_s16.val[0], 1); + c0.val[2] = vmlal_lane_s16(c0.val[2], b10_s16.val[2], a00_s16.val[0], 1); + c0.val[3] = vmlal_lane_s16(c0.val[3], b10_s16.val[3], a00_s16.val[0], 1); + + // Accumulate 2: + c0.val[0] = vmlal_lane_s16(c0.val[0], b20_s16.val[0], a00_s16.val[0], 2); + c0.val[1] = vmlal_lane_s16(c0.val[1], b20_s16.val[1], a00_s16.val[0], 2); + c0.val[2] = vmlal_lane_s16(c0.val[2], b20_s16.val[2], a00_s16.val[0], 2); + c0.val[3] = vmlal_lane_s16(c0.val[3], b20_s16.val[3], a00_s16.val[0], 2); + + // Accumulate 3: + c0.val[0] = vmlal_lane_s16(c0.val[0], b30_s16.val[0], a00_s16.val[0], 3); + c0.val[1] = vmlal_lane_s16(c0.val[1], b30_s16.val[1], a00_s16.val[0], 3); + c0.val[2] = vmlal_lane_s16(c0.val[2], b30_s16.val[2], a00_s16.val[0], 3); + c0.val[3] = vmlal_lane_s16(c0.val[3], b30_s16.val[3], a00_s16.val[0], 3); + + // Accumulate 4: + c0.val[0] = vmlal_lane_s16(c0.val[0], b40_s16.val[0], a00_s16.val[1], 0); + c0.val[1] = vmlal_lane_s16(c0.val[1], b40_s16.val[1], a00_s16.val[1], 0); + c0.val[2] = vmlal_lane_s16(c0.val[2], b40_s16.val[2], a00_s16.val[1], 0); + c0.val[3] = vmlal_lane_s16(c0.val[3], b40_s16.val[3], a00_s16.val[1], 0); + + // Accumulate 5: + c0.val[0] = vmlal_lane_s16(c0.val[0], b50_s16.val[0], a00_s16.val[1], 1); + c0.val[1] = vmlal_lane_s16(c0.val[1], b50_s16.val[1], a00_s16.val[1], 1); + c0.val[2] = vmlal_lane_s16(c0.val[2], b50_s16.val[2], a00_s16.val[1], 1); + c0.val[3] = vmlal_lane_s16(c0.val[3], b50_s16.val[3], 
a00_s16.val[1], 1); + + // Accumulate 6: + c0.val[0] = vmlal_lane_s16(c0.val[0], b60_s16.val[0], a00_s16.val[1], 2); + c0.val[1] = vmlal_lane_s16(c0.val[1], b60_s16.val[1], a00_s16.val[1], 2); + c0.val[2] = vmlal_lane_s16(c0.val[2], b60_s16.val[2], a00_s16.val[1], 2); + c0.val[3] = vmlal_lane_s16(c0.val[3], b60_s16.val[3], a00_s16.val[1], 2); + + // Accumulate 7: + c0.val[0] = vmlal_lane_s16(c0.val[0], b70_s16.val[0], a00_s16.val[1], 3); + c0.val[1] = vmlal_lane_s16(c0.val[1], b70_s16.val[1], a00_s16.val[1], 3); + c0.val[2] = vmlal_lane_s16(c0.val[2], b70_s16.val[2], a00_s16.val[1], 3); + c0.val[3] = vmlal_lane_s16(c0.val[3], b70_s16.val[3], a00_s16.val[1], 3); + + vec_a += 8; + matrix_b += 8 * stride_b; + } + + // This for loop performs the left-over accumulations + for (; vec_a < vec_a_end_addr;) + { + const int8x8_t a00_s8 = vld1_dup_s8(vec_a); + const int8x16_t b00_s8 = vld1q_s8(matrix_b); + + const int16x4x4_t b00_s16 = { + {vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b00_s8)))}}; + + // Convert a00_s8 to uint16_t and get the lower part + const int16x4_t a00_s16 = vget_low_s16(vmovl_s8(a00_s8)); + + // Accumulate 0: + c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16, 0); + c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16, 0); + c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16, 0); + c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16, 0); + + vec_a += 1; + matrix_b += stride_b; + } + + auto vec_out = reinterpret_cast<int32_t *>(out.ptr()); + if (id.x() < (width_out - 16)) + { + vst1q_s32(vec_out + 0, c0.val[0]); + vst1q_s32(vec_out + 4, c0.val[1]); + vst1q_s32(vec_out + 8, c0.val[2]); + vst1q_s32(vec_out + 12, c0.val[3]); + } + else + { + auto left_over = width_out - id.x(); + for (auto k = 0; k < 4 && left_over; ++k) + { + for (auto j = 0; j < 4 && left_over; ++j, --left_over) + { + *(vec_out + k * 4 + j) = c0.val[k][j]; + } + } + } + }, + ina, inb, out); +} + +void inline matrix_multiply_u8( + Iterator &ina, Iterator &inb, Iterator &out, int width_b, const TensorInfo &out_info, const Window &window) +{ + const auto width_out = static_cast<int>(out_info.dimension(0)); + const auto height_out = static_cast<int>(out_info.dimension(1)); + const size_t out_stride = out_info.strides_in_bytes()[1] / out_info.element_size(); + execute_window_loop( + window, + [&](const Coordinates &id) + { + const uint8_t *mtx_a0 = ina.ptr(); + const uint8_t *mtx_b0 = inb.ptr(); + + // Note: Since the input are all positives, we can use uint32_t + // Accumulators for the block 0 + uint32x4x4_t c0 = {{vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0)}}; + + // Accumulators for the block 1 + uint32x4x4_t c1 = {{vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0)}}; + + // Accumulators for the block 2 + uint32x4x4_t c2 = {{vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0)}}; + + // Accumulators for the block 3 + uint32x4x4_t c3 = {{vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0)}}; + + for (int k = 0; k < width_b; k += 16, mtx_a0 += 4, mtx_b0 += 16) + { + const uint8x8_t a00_u8 = vld1_u8(mtx_a0); + const uint8x16_t b00_u8 = vld1q_u8(mtx_b0); + + // Convert a00_u8 to uint16_t and get the lower part + const uint16x4_t a00_u16 = vget_low_u16(vmovl_u8(a00_u8)); + + // Convert b00_s8 to uint16_t + const uint16x4x4_t b00_u16 = { + {vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))), 
vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b00_u8)))}}; + + // 4x4 block 0 + c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16, 0); + c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16, 0); + c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16, 0); + c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16, 0); + + // 4x4 block 1 + c1.val[0] = vmlal_lane_u16(c1.val[0], b00_u16.val[0], a00_u16, 1); + c1.val[1] = vmlal_lane_u16(c1.val[1], b00_u16.val[1], a00_u16, 1); + c1.val[2] = vmlal_lane_u16(c1.val[2], b00_u16.val[2], a00_u16, 1); + c1.val[3] = vmlal_lane_u16(c1.val[3], b00_u16.val[3], a00_u16, 1); + + // 4x4 block 2 + c2.val[0] = vmlal_lane_u16(c2.val[0], b00_u16.val[0], a00_u16, 2); + c2.val[1] = vmlal_lane_u16(c2.val[1], b00_u16.val[1], a00_u16, 2); + c2.val[2] = vmlal_lane_u16(c2.val[2], b00_u16.val[2], a00_u16, 2); + c2.val[3] = vmlal_lane_u16(c2.val[3], b00_u16.val[3], a00_u16, 2); + + // 4x4 block 3 + c3.val[0] = vmlal_lane_u16(c3.val[0], b00_u16.val[0], a00_u16, 3); + c3.val[1] = vmlal_lane_u16(c3.val[1], b00_u16.val[1], a00_u16, 3); + c3.val[2] = vmlal_lane_u16(c3.val[2], b00_u16.val[2], a00_u16, 3); + c3.val[3] = vmlal_lane_u16(c3.val[3], b00_u16.val[3], a00_u16, 3); + } + + auto mtx_out = reinterpret_cast<int32_t *>(out.ptr()); + + if (id.y() < height_out && id.x() < (width_out - 16)) + { + vst1q_s32(mtx_out + 0 * out_stride + 0, vreinterpretq_s32_u32(c0.val[0])); + vst1q_s32(mtx_out + 0 * out_stride + 4, vreinterpretq_s32_u32(c0.val[1])); + vst1q_s32(mtx_out + 0 * out_stride + 8, vreinterpretq_s32_u32(c0.val[2])); + vst1q_s32(mtx_out + 0 * out_stride + 12, vreinterpretq_s32_u32(c0.val[3])); + if (id.y() + 1 < height_out) + { + vst1q_s32(mtx_out + 1 * out_stride + 0, vreinterpretq_s32_u32(c1.val[0])); + vst1q_s32(mtx_out + 1 * out_stride + 4, vreinterpretq_s32_u32(c1.val[1])); + vst1q_s32(mtx_out + 1 * out_stride + 8, vreinterpretq_s32_u32(c1.val[2])); + vst1q_s32(mtx_out + 1 * out_stride + 12, vreinterpretq_s32_u32(c1.val[3])); + if (id.y() + 2 < height_out) + { + vst1q_s32(mtx_out + 2 * out_stride + 0, vreinterpretq_s32_u32(c2.val[0])); + vst1q_s32(mtx_out + 2 * out_stride + 4, vreinterpretq_s32_u32(c2.val[1])); + vst1q_s32(mtx_out + 2 * out_stride + 8, vreinterpretq_s32_u32(c2.val[2])); + vst1q_s32(mtx_out + 2 * out_stride + 12, vreinterpretq_s32_u32(c2.val[3])); + if (id.y() + 3 < height_out) + { + vst1q_s32(mtx_out + 3 * out_stride + 0, vreinterpretq_s32_u32(c3.val[0])); + vst1q_s32(mtx_out + 3 * out_stride + 4, vreinterpretq_s32_u32(c3.val[1])); + vst1q_s32(mtx_out + 3 * out_stride + 8, vreinterpretq_s32_u32(c3.val[2])); + vst1q_s32(mtx_out + 3 * out_stride + 12, vreinterpretq_s32_u32(c3.val[3])); + } + } + } + } + else + { + const auto left_over_value = width_out - id.x(); + auto left_over = left_over_value; + for (auto k = 0; k < 4 && left_over; ++k) + { + for (auto j = 0; j < 4 && left_over; ++j, --left_over) + { + *(mtx_out + k * 4 + j) = c0.val[k][j]; + } + } + if (id.y() + 1 < height_out) + { + left_over = left_over_value; + for (auto k = 0; k < 4 && left_over; ++k) + { + for (auto j = 0; j < 4 && left_over; ++j, --left_over) + { + *(mtx_out + out_stride + k * 4 + j) = c1.val[k][j]; + } + } + if (id.y() + 2 < height_out) + { + left_over = left_over_value; + for (auto k = 0; k < 4 && left_over; ++k) + { + for (auto j = 0; j < 4 && left_over; ++j, --left_over) + { + *(mtx_out + out_stride * 2 + k * 4 + j) = c2.val[k][j]; + } + } + if (id.y() + 
3 < height_out) + { + left_over = left_over_value; + for (auto k = 0; k < 4 && left_over; ++k) + { + for (auto j = 0; j < 4 && left_over; ++j, --left_over) + { + *(mtx_out + out_stride * 3 + k * 4 + j) = c3.val[k][j]; + } + } + } + } + } + } + }, + ina, inb, out); +} + +void inline matrix_multiply_s8( + Iterator &ina, Iterator &inb, Iterator &out, int width_b, const TensorInfo &out_info, const Window &window) +{ + const auto width_out = static_cast<int>(out_info.dimension(0)); + const auto height_out = static_cast<int>(out_info.dimension(1)); + const size_t out_stride = out_info.strides_in_bytes()[1] / out_info.element_size(); + // The implementation assumes that the matrix A and Matrix B have been reshaped respectively with CpuGemmInterleave4x4 and CpuGemmTranspose1xW + // The reshaping of the matrices helps to have a cache friendly implementation and helps to avoid the data re-arrangements needed for computing 16x4 elements per iteration + // All the values needed for computing a single 4x4 block will be read from consecutive memory positions + execute_window_loop( + window, + [&](const Coordinates &id) + { + auto *mtx_a0 = reinterpret_cast<const int8_t *>(ina.ptr()); + auto *mtx_b0 = reinterpret_cast<const int8_t *>(inb.ptr()); + + // Note: Since the input are all positives, we can use uint32_t + // Accumulators for the block 0 + int32x4x4_t c0 = {{vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0)}}; + + // Accumulators for the block 1 + int32x4x4_t c1 = {{vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0)}}; + + // Accumulators for the block 2 + int32x4x4_t c2 = {{vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0)}}; + + // Accumulators for the block 3 + int32x4x4_t c3 = {{vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0)}}; + + for (int k = 0; k < width_b; k += 16, mtx_a0 += 4, mtx_b0 += 16) + { + const int8x8_t a00_s8 = vld1_s8(mtx_a0); + const int8x16_t b00_s8 = vld1q_s8(mtx_b0); + + // Convert a00_s8 to uint16_t and get the lower part + const int16x4_t a00_s16 = vget_low_s16(vmovl_s8(a00_s8)); + + // Convert b00_s8 to int16_t + const int16x4x4_t b00_s16 = { + {vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b00_s8)))}}; + + // 4x4 block 0 + c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16, 0); + c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16, 0); + c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16, 0); + c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16, 0); + + // 4x4 block 1 + c1.val[0] = vmlal_lane_s16(c1.val[0], b00_s16.val[0], a00_s16, 1); + c1.val[1] = vmlal_lane_s16(c1.val[1], b00_s16.val[1], a00_s16, 1); + c1.val[2] = vmlal_lane_s16(c1.val[2], b00_s16.val[2], a00_s16, 1); + c1.val[3] = vmlal_lane_s16(c1.val[3], b00_s16.val[3], a00_s16, 1); + + // 4x4 block 2 + c2.val[0] = vmlal_lane_s16(c2.val[0], b00_s16.val[0], a00_s16, 2); + c2.val[1] = vmlal_lane_s16(c2.val[1], b00_s16.val[1], a00_s16, 2); + c2.val[2] = vmlal_lane_s16(c2.val[2], b00_s16.val[2], a00_s16, 2); + c2.val[3] = vmlal_lane_s16(c2.val[3], b00_s16.val[3], a00_s16, 2); + + // 4x4 block 3 + c3.val[0] = vmlal_lane_s16(c3.val[0], b00_s16.val[0], a00_s16, 3); + c3.val[1] = vmlal_lane_s16(c3.val[1], b00_s16.val[1], a00_s16, 3); + c3.val[2] = vmlal_lane_s16(c3.val[2], b00_s16.val[2], a00_s16, 3); + c3.val[3] = vmlal_lane_s16(c3.val[3], b00_s16.val[3], a00_s16, 3); + } + auto mtx_out = 
reinterpret_cast<int32_t *>(out.ptr()); + if (id.y() < height_out && id.x() < (width_out - 16)) + { + vst1q_s32(mtx_out + 0 * out_stride + 0, c0.val[0]); + vst1q_s32(mtx_out + 0 * out_stride + 4, c0.val[1]); + vst1q_s32(mtx_out + 0 * out_stride + 8, c0.val[2]); + vst1q_s32(mtx_out + 0 * out_stride + 12, c0.val[3]); + if (id.y() + 1 < height_out) + { + vst1q_s32(mtx_out + 1 * out_stride + 0, c1.val[0]); + vst1q_s32(mtx_out + 1 * out_stride + 4, c1.val[1]); + vst1q_s32(mtx_out + 1 * out_stride + 8, c1.val[2]); + vst1q_s32(mtx_out + 1 * out_stride + 12, c1.val[3]); + if (id.y() + 2 < height_out) + { + vst1q_s32(mtx_out + 2 * out_stride + 0, c2.val[0]); + vst1q_s32(mtx_out + 2 * out_stride + 4, c2.val[1]); + vst1q_s32(mtx_out + 2 * out_stride + 8, c2.val[2]); + vst1q_s32(mtx_out + 2 * out_stride + 12, c2.val[3]); + if (id.y() + 3 < height_out) + { + vst1q_s32(mtx_out + 3 * out_stride + 0, c3.val[0]); + vst1q_s32(mtx_out + 3 * out_stride + 4, c3.val[1]); + vst1q_s32(mtx_out + 3 * out_stride + 8, c3.val[2]); + vst1q_s32(mtx_out + 3 * out_stride + 12, c3.val[3]); + } + } + } + } + else if (id.y() < height_out) + { + const auto left_over_value = width_out - id.x(); + auto left_over = left_over_value; + for (auto k = 0; k < 4 && left_over; ++k) + { + for (auto j = 0; j < 4 && left_over; ++j, --left_over) + { + *(mtx_out + k * 4 + j) = c0.val[k][j]; + } + } + if (id.y() + 1 < height_out) + { + left_over = left_over_value; + for (auto k = 0; k < 4 && left_over; ++k) + { + for (auto j = 0; j < 4 && left_over; ++j, --left_over) + { + *(mtx_out + out_stride + k * 4 + j) = c1.val[k][j]; + } + } + if (id.y() + 2 < height_out) + { + left_over = left_over_value; + for (auto k = 0; k < 4 && left_over; ++k) + { + for (auto j = 0; j < 4 && left_over; ++j, --left_over) + { + *(mtx_out + out_stride * 2 + k * 4 + j) = c2.val[k][j]; + } + } + if (id.y() + 3 < height_out) + { + left_over = left_over_value; + for (auto k = 0; k < 4 && left_over; ++k) + { + for (auto j = 0; j < 4 && left_over; ++j, --left_over) + { + *(mtx_out + out_stride * 3 + k * 4 + j) = c3.val[k][j]; + } + } + } + } + } + } + }, + ina, inb, out); +} + +Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::S8, DataType::U8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::S8, + DataType::U8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); + + TensorShape in0_shape = src0->tensor_shape(); + TensorShape in1_shape = src1->tensor_shape(); + TensorShape out_shape = dst->tensor_shape(); + + // Check vector-by-matrix case + if (out_shape[1] == 1) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(in0_shape[0] != in1_shape[1], + "The number of input0's columns must be equal to input1's rows"); + } + else + { + in0_shape.collapse(2); + in1_shape.collapse(2); + out_shape.collapse(2); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(in0_shape[2] != out_shape[2], + "Output tensor must have the same number of batches of input0 tensor"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + in1_shape[2] != 1 && in0_shape[2] != in1_shape[2], + "Input1 tensor must have the same number of batches of input0 or the number of batches must be set to 1"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(in1_shape[0] % 16, "Input1's width must be a multiple of 16"); + } + + return Status{}; +} +} // namespace 
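// A minimal usage sketch (illustrative only; the reshape kernels and the scheduler call are
// assumptions based on the references in this kernel's documentation, not something this file
// defines). A is expected already interleaved 4x4, B already transposed 1xW, and the result is S32:
//
//   CpuGemmLowpMatrixMultiplyKernel mm_kernel;
//   mm_kernel.configure(&a_interleaved_info, &b_transposed_info, &dst_s32_info);
//
//   ITensorPack pack;
//   pack.add_const_tensor(TensorType::ACL_SRC_0, &a_interleaved);
//   pack.add_const_tensor(TensorType::ACL_SRC_1, &b_transposed);
//   pack.add_tensor(TensorType::ACL_DST, &dst_s32);
//   NEScheduler::get().schedule_op(&mm_kernel, IScheduler::Hints(Window::DimY), mm_kernel.window(), pack);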
+ +void CpuGemmLowpMatrixMultiplyKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) +{ + ARM_COMPUTE_UNUSED(src0); + ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst)); + + TensorShape in1_shape = src1->tensor_shape(); + in1_shape.collapse(2); + + _slide_matrix_b = in1_shape[2] != 1; + + constexpr unsigned int num_elems_processed_per_iteration_x = 16; + constexpr unsigned int num_elems_processed_per_iteration_y = 4; + + Window win; + // Check if the output tensor is a vector. If so,the kernel runs the vector-matrix multiplication + if ((dst->dimension(1) == 1)) + { + // Configure kernel window + win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x)); + } + else + { + win = + calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + } + + ICpuKernel::configure(win); +} + +Status +CpuGemmLowpMatrixMultiplyKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst)); + return Status{}; +} + +void CpuGemmLowpMatrixMultiplyKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + + auto src0 = tensors.get_const_tensor(TensorType::ACL_SRC_0); + auto src1 = tensors.get_const_tensor(TensorType::ACL_SRC_1); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + // Check if the output tensor is a vector. If so,the kernel runs the vector-matrix multiplication path + if ((dst->info()->dimension(1) == 1)) + { + const auto width_matrix_a = static_cast<int>(src0->info()->dimension(0)); + const auto width_matrix_b = static_cast<int>(src1->info()->dimension(0)); + const auto width_out = static_cast<int>(dst->info()->dimension(0)); + const auto in_b_stride = + static_cast<int>(src1->info()->strides_in_bytes()[1] / data_size_from_type(src1->info()->data_type())); + + // The implementation computes 16 elements per iteration + const int window_start_x = 16 * info.thread_id; + const int window_step_x = 16 * info.num_threads; + // Make sure (window_end_x - window_start_x) is a multiple of window_step_x + const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x; + + Window win_out(window); + win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x)); + win_out.set(Window::DimY, Window::Dimension(0, 1, 1)); + + Window win_a(window); + win_a.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_a.set(Window::DimY, Window::Dimension(0, 0, 0)); + + Window win_b; + // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 + // This scenario can happen when the the matrix multiplication is used to perform a convolution operation + if (src1->info()->num_dimensions() >= 3) + { + win_b = window; + } + win_b.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x)); + win_b.set(Window::DimY, Window::Dimension(0, 1, 1)); + + Iterator ina(src0, win_a); + Iterator inb(src1, win_b); + Iterator out(dst, win_out); + + switch (src0->info()->data_type()) + { + case DataType::S8: + case DataType::QASYMM8_SIGNED: + { + vector_matrix_multiply_s8(ina, inb, out, width_matrix_a, width_matrix_b, width_out, in_b_stride, + window); + break; + } 
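                // The U8/QASYMM8 path below mirrors the signed path above: since the inputs are
                // non-negative, vector_matrix_multiply_u8() accumulates in uint32 lanes and only
                // reinterprets to int32 when storing the result.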
+ case DataType::U8: + case DataType::QASYMM8: + { + vector_matrix_multiply_u8(ina, inb, out, width_matrix_a, width_matrix_b, width_out, in_b_stride, + window); + break; + } + default: + { + ARM_COMPUTE_ERROR("Not supported"); + break; + } + } + } + else + { + const size_t in_b_stride = src1->info()->strides_in_bytes()[1]; + const int width_b = src1->info()->dimension(0); + + // Set step_x and step_y for matrix A. Scale by a factor of 4 the Y range as the input interleaved matrix A has 4 times less the rows of the output matrix + Window win_a(window); + win_a.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_a.set(Window::DimY, Window::Dimension(window.y().start() / 4, window.y().end() / 4, 1)); + + // Set step_x and step_y for matrix B. Scale by a factor of 16 the X range as the input transposed matrix A has 16 times less the columns of the output matrix + Window win_b; + // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 + // This scenario can happen when the the matrix multiplication is used to perform a convolution operation + if (_slide_matrix_b) + { + win_b = window; + } + win_b.set(Window::DimX, Window::Dimension(window.x().start() / 16, window.x().end() / 16, in_b_stride)); + win_b.set(Window::DimY, Window::Dimension(0, 0, 0)); + + // The step x and step y for the output matrix has been already set using in configure() + Iterator ina(src0, win_a); + Iterator inb(src1, win_b); + Iterator out(dst, window); + + switch (src0->info()->data_type()) + { + case DataType::S8: + case DataType::QASYMM8_SIGNED: + { + matrix_multiply_s8(ina, inb, out, width_b, *dst->info(), window); + break; + } + case DataType::U8: + case DataType::QASYMM8: + { + matrix_multiply_u8(ina, inb, out, width_b, *dst->info(), window); + break; + } + default: + { + ARM_COMPUTE_ERROR("Not supported"); + break; + } + } + } +} + +const char *CpuGemmLowpMatrixMultiplyKernel::name() const +{ + return "CpuGemmLowpMatrixMultiplyKernel"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h b/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h new file mode 100644 index 0000000000..439ada1b47 --- /dev/null +++ b/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2017-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ARM_COMPUTE_CPU_GEMMLOWP_MATRIXMULTIPLY_KERNEL_H +#define ARM_COMPUTE_CPU_GEMMLOWP_MATRIXMULTIPLY_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Kernel to multiply matrices + * + * @note @ref CpuGemmLowpMatrixMultiplyKernel low precision matrix product kernel + * This kernel performs the following computation: + * + * -# Convert a values from int8 to int32 + * -# Convert b values from int8 to int32 + * -# Compute the int32 matrix product of the resulting a * b and store the result as int32 + * + */ +class CpuGemmLowpMatrixMultiplyKernel : public ICpuKernel<CpuGemmLowpMatrixMultiplyKernel> +{ +public: + /** Default constructor */ + CpuGemmLowpMatrixMultiplyKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmLowpMatrixMultiplyKernel); + /** Initialise the kernel's input and output. + * + * The input matrices @p src0 and @p src1 must be the output of the kernels: @ref CpuGemmInterleave4x4Kernel and @ref CpuGemmTranspose1xWKernel. These two + * kernels change the layout of the original matrices to be more cache-friendly. + * + * @param[in] src0 Input tensor info containing the interleaved Matrix A. Data type supported: U8/QASYMM8/S8/QASYMM8_SIGNED + * @param[in] src1 Input tensor info containing the transposed1xW Matrix B. Data type supported: U8/QASYMM8/S8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL + * @param[out] dst Output tensor info to store the result of matrix multiplication. Data type supported: S32 + */ + void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuGemmLowpMatrixMultiplyKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + +private: + bool _slide_matrix_b{true}; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /*ARM_COMPUTE_CPU_GEMMLOWP_MATRIXMULTIPLY_KERNEL_H*/ diff --git a/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.cpp b/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.cpp new file mode 100644 index 0000000000..9a099bd1b6 --- /dev/null +++ b/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.cpp @@ -0,0 +1,449 @@ +/* + * Copyright (c) 2017-2021,2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h" + +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/KernelDescriptors.h" +#include "arm_compute/core/TensorInfo.h" + +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/wrapper.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +Status validate_arguments_matrix_a_reduction(const ITensorInfo *src, + const ITensorInfo *dst, + const GEMMLowpReductionKernelInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_ON_MSG(info.is_reshaped == true, "Not supported"); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); + + if (dst->total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + dst->dimension(0) != src->dimension(1), + "Output vector must have length equal to the number of rows of the input matrix"); + } + return Status{}; +} +Status validate_arguments_matrix_b_reduction(const ITensorInfo *src, + const ITensorInfo *dst, + const GEMMLowpReductionKernelInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_ON_MSG(info.is_reshaped == true, "Not supported"); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); + + if (dst->total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + dst->dimension(0) != src->dimension(0), + "Output vector must have length equal to the number of columns of the input matrix"); + } + return Status{}; +} +} // namespace + +void CpuGemmLowpMatrixAReductionKernel::configure(const ITensorInfo *src, + ITensorInfo *dst, + const GEMMLowpReductionKernelInfo &info) +{ + // Perform validate step + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_a_reduction(src, dst, info)); + _k = info.k; + _scalar = info.scalar; + _mul_by_scalar = info.mul_by_scalar; + + switch (src->data_type()) + { + case DataType::QASYMM8: + _func = &CpuGemmLowpMatrixAReductionKernel::run_internal<uint8_t>; + break; + case DataType::QASYMM8_SIGNED: + case DataType::QSYMM8: + case DataType::QSYMM8_PER_CHANNEL: + _func = &CpuGemmLowpMatrixAReductionKernel::run_internal<int8_t>; + break; + default: + ARM_COMPUTE_ERROR("Unsupported data type"); + } + + // Output auto initialization if not yet initialized + auto_init_if_empty(*dst, TensorShape(src->dimension(1)), 1, DataType::S32); + + Window win = calculate_max_window(*dst, Steps(1)); + ICpuKernel::configure(win); +} + +Status CpuGemmLowpMatrixAReductionKernel::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const GEMMLowpReductionKernelInfo &info) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_a_reduction(src, dst, info)); + return Status{}; +} + +template <typename T> +void CpuGemmLowpMatrixAReductionKernel::run_internal(const ITensor *src, + ITensor 
*dst, + const arm_compute::Window &window) +{ + // Intermediate and final accumulator types + using TIAcc = wrapper::traits::promote_t<T>; + using TAcc = wrapper::traits::promote_t<TIAcc>; + + Window collapsed_window = window.collapse_if_possible(IKernel::window(), Window::DimY); + + Window win_input(collapsed_window); + win_input.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_input.set(Window::DimY, Window::Dimension(0, 0, 0)); + win_input.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + Iterator in(src, win_input); + Iterator out(dst, collapsed_window); + + execute_window_loop( + collapsed_window, + [&](const Coordinates &id) + { + auto vsum_row = wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}); + TAcc sum_row = 0; + + const T *matrix_a = reinterpret_cast<const T *>( + (in.ptr() + id.x() * src->info()->strides_in_bytes()[1] + id.y() * src->info()->strides_in_bytes()[2])); + +#if __arm__ + asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a)); +#endif /* __arm__ */ + + int i = 0; + // This for loop performs 16 accumulations + for (; i <= (_k - 16); i += 16) + { + const auto a0_d8 = wrapper::vloadq(matrix_a + i); + + // Partial accumulations in U16 + const auto tmp_sum0 = wrapper::vaddl(wrapper::vgetlow(a0_d8), wrapper::vgethigh(a0_d8)); + + // Accumulate to U32 + vsum_row = wrapper::vadd(vsum_row, wrapper::vpaddl(tmp_sum0)); + } + + // This for loop performs the leftover accumulations + for (; i < _k; ++i) + { + sum_row += static_cast<TAcc>(matrix_a[i]); + } + +#if defined(__aarch64__) + // Reduction operation available on 64 bit architectures only + sum_row += wrapper::vaddv(vsum_row); +#else // __aarch64__ + auto tmp = wrapper::vpadd(wrapper::vgethigh(vsum_row), wrapper::vgetlow(vsum_row)); + tmp = wrapper::vpadd(tmp, tmp); + + sum_row += wrapper::vgetlane(tmp, 0); +#endif // __aarch64__ + + // Multiply by scalar if necessary + if (_mul_by_scalar) + { + sum_row *= _scalar; + } + + *(reinterpret_cast<int *>(out.ptr())) = static_cast<int32_t>(sum_row); + }, + in, out); +} + +void CpuGemmLowpMatrixAReductionKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + + auto src = tensors.get_const_tensor(TensorType::ACL_SRC); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + (this->*_func)(src, dst, window); +} + +const char *CpuGemmLowpMatrixAReductionKernel::name() const +{ + return "CpuGemmLowpMatrixAReductionKernel"; +} + +void CpuGemmLowpMatrixBReductionKernel::configure(const ITensorInfo *src, + ITensorInfo *dst, + const GEMMLowpReductionKernelInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_b_reduction(src, dst, info)); + + _k = info.k; + _scalar = info.scalar; + _mul_by_scalar = info.mul_by_scalar; + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 16; + + switch (src->data_type()) + { + case DataType::QASYMM8: + _func = &CpuGemmLowpMatrixBReductionKernel::run_internal<uint8_t>; + break; + case DataType::QASYMM8_SIGNED: + case DataType::QSYMM8: + case DataType::QSYMM8_PER_CHANNEL: + _func = &CpuGemmLowpMatrixBReductionKernel::run_internal<int8_t>; + break; + default: + ARM_COMPUTE_ERROR("Unsupported data type"); + } + + // Output auto initialization if not yet initialized + auto_init_if_empty(*dst, TensorShape(src->dimension(0)), 1, DataType::S32); + + // Configure 
kernel window + Window win = calculate_max_window_horizontal(*dst, Steps(num_elems_processed_per_iteration)); + ICpuKernel::configure(win); +} + +Status CpuGemmLowpMatrixBReductionKernel::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const GEMMLowpReductionKernelInfo &info) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_b_reduction(src, dst, info)); + return Status{}; +} + +template <typename T> +void CpuGemmLowpMatrixBReductionKernel::run_internal(const ITensor *src, + ITensor *dst, + const Window &window, + const ThreadInfo &info) +{ + // Intermediate and final accumulator types + using TIAcc = wrapper::traits::promote_t<T>; + using TAcc = wrapper::traits::promote_t<TIAcc>; + + Window collapsed_window = window.collapse_if_possible(IKernel::window(), Window::DimY); + const auto vec_scalar = wrapper::vdup_n(static_cast<TAcc>(_scalar), wrapper::traits::vector_128_tag{}); + + const auto width_matrix_b = static_cast<int>(src->info()->dimension(0)); + const auto in_b_stride = static_cast<int>(src->info()->strides_in_bytes()[1]); + + // The implementation computes 16 elements per iteration + const int window_start_x = 16 * info.thread_id; + const int window_step_x = 16 * info.num_threads; + // Make sure (window_end_x - window_start_x) is a multiple of window_step_x + const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x; + + Window win_out(collapsed_window); + win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x)); + + Window win_in(win_out); + win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + win_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + Iterator inb(src, win_in); + Iterator out(dst, win_out); + + execute_window_loop( + win_out, + [&](const Coordinates &id) + { + if (id.x() > width_matrix_b) + { + return; + } + + // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation + // 4 x u/int32x4_t = 16 column accumulators + typename wrapper::traits::neon_bitvector<TAcc, wrapper::traits::BitWidth::W128>::type sum_col[4] = { + wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}), + wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}), + wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}), + wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{})}; + + const auto *matrix_b = reinterpret_cast<const T *>(inb.ptr() + id.y() * src->info()->strides_in_bytes()[2]); + +#if __arm__ + asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b)); + asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b + in_b_stride)); +#endif /* __arm__ */ + + // If we have less than 16 columns left, we can't use the main unrolled loop + if ((width_matrix_b - id.x()) >= 16) + { + // Row index + int i = 0; + // 4 x u/int32x4_t = 16 columns unrolled across 4 rows + for (; i <= (_k - 4); i += 4) + { + // Load 4 rows of 16 columns of 8bit elements + // (| | ) + // (| | ) + // (| | ) + // (| | ) + const auto b0_u8 = wrapper::vloadq(matrix_b + 0 * in_b_stride); + const auto b1_u8 = wrapper::vloadq(matrix_b + 1 * in_b_stride); + const auto b2_u8 = wrapper::vloadq(matrix_b + 2 * in_b_stride); + const auto b3_u8 = wrapper::vloadq(matrix_b + 3 * in_b_stride); + +#if __arm__ + asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 1 * in_b_stride)); + asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 2 * in_b_stride)); + asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 3 * in_b_stride)); + asm volatile("PLD [%0, #128*1]" 
::"r"(matrix_b + 4 * in_b_stride)); +#endif /* __arm__ */ + + // Partial accumulation to 16bit (4 rows => 2 rows) + // (| | | ) + // (| | | ) + typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W128>::type tmp_sum[2] = + {wrapper::vdup_n(static_cast<TIAcc>(0), wrapper::traits::vector_128_tag{}), + wrapper::vdup_n(static_cast<TIAcc>(0), wrapper::traits::vector_128_tag{})}; + + tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b1_u8)); + tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b0_u8)); + tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b2_u8)); + tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b3_u8)); + tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b0_u8)); + tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b1_u8)); + tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b2_u8)); + tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b3_u8)); + + // Accumulate to 32bit (2 rows => 1 row) + // (| | | | | ) + sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(tmp_sum[0])); + sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(tmp_sum[0])); + sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(tmp_sum[1])); + sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(tmp_sum[1])); + + matrix_b += 4 * in_b_stride; + } + + // This for loop accumulates the rows left over from the 4x unrolling above + for (; i < _k; ++i) + { + const auto b0_b8 = wrapper::vloadq(matrix_b + 0 * in_b_stride); + + // Convert 8bit => 16bit + const typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W128>::type + b0_b16[2]{wrapper::vmovl(wrapper::vgetlow(b0_b8)), wrapper::vmovl(wrapper::vgethigh(b0_b8))}; + + // Accumulate to 32bit + sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(b0_b16[0])); + sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(b0_b16[0])); + sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(b0_b16[1])); + sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(b0_b16[1])); + + matrix_b += in_b_stride; + } + } + else + { + // Accumulate left over columns to sum_cols + for (int i = 0; i < _k; ++i) // row loop + { + auto left_over_cols = width_matrix_b - id.x(); + auto l = left_over_cols; + for (auto k = 0; k < 4 && l; ++k) + { + for (auto j = 0; j < 4 && l; ++j, --l) + { + sum_col[k][j] += matrix_b[left_over_cols - l]; + } + } + matrix_b += in_b_stride; + } + } + + // Multiply by scalar if necessary + if (_mul_by_scalar) + { + sum_col[0] = wrapper::vmul(sum_col[0], vec_scalar); + sum_col[1] = wrapper::vmul(sum_col[1], vec_scalar); + sum_col[2] = wrapper::vmul(sum_col[2], vec_scalar); + sum_col[3] = wrapper::vmul(sum_col[3], vec_scalar); + } + + auto vector_sum_col = reinterpret_cast<int32_t *>(out.ptr()); + if ((width_matrix_b - id.x()) >= 16) + { + wrapper::vstore(vector_sum_col + 0, wrapper::vreinterpret(sum_col[0])); + wrapper::vstore(vector_sum_col + 4, wrapper::vreinterpret(sum_col[1])); + wrapper::vstore(vector_sum_col + 8, wrapper::vreinterpret(sum_col[2])); + wrapper::vstore(vector_sum_col + 12, wrapper::vreinterpret(sum_col[3])); + } + else + { + auto left_over = width_matrix_b - id.x(); + for (auto k = 0; k < 4 && left_over; ++k) + { + for (auto j = 0; j < 4 && left_over; ++j, --left_over) + { + *(vector_sum_col + k * 4 + j) = sum_col[k][j]; + } + } + } + }, + inb, out); +} + +void CpuGemmLowpMatrixBReductionKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + 
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + + auto src = tensors.get_const_tensor(TensorType::ACL_SRC); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + (this->*_func)(src, dst, window, info); +} + +const char *CpuGemmLowpMatrixBReductionKernel::name() const +{ + return "CpuGemmLowpMatrixBReductionKernel"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h b/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h new file mode 100644 index 0000000000..20ef17e96d --- /dev/null +++ b/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h @@ -0,0 +1,162 @@ +/* + * Copyright (c) 2017-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_GEMMLOWP_REDUCTION_KERNEL_H +#define ARM_COMPUTE_CPU_GEMMLOWP_REDUCTION_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +// Forward declarations +struct GEMMLowpReductionKernelInfo; +namespace cpu +{ +namespace kernels +{ +/** Kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A. + * + * @note This stage is needed to handle the offset of matrix product + * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md + */ +class CpuGemmLowpMatrixAReductionKernel : public ICpuKernel<CpuGemmLowpMatrixAReductionKernel> +{ +public: + /** Default constructor */ + CpuGemmLowpMatrixAReductionKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmLowpMatrixAReductionKernel); + /** Initialise the kernel's input and output. + * + * @param[in] src Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL + * @param[out] dst Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32 + * @param[in] info Kernel metadata: + * - k (num_mtx_a_cols) Number of matrix A columns + * - is_reshaped (is_interleaved4x4) True if the matrix A has been interleaved4x4 + * - scalar Scalar value to multiply each reduced row by. + * - mul_byscalar True if each reduced column must be multiplied by a scalar value. 
+ */ + void configure(const ITensorInfo *src, ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuGemmLowpMatrixAReductionKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + +private: + /** Execution of the reduction kernel specialized on the input type + * + * @param[in] src Input tensor + * @param[in] dst Output tensor + * @param[in] window Execution window + */ + template <typename T> + void run_internal(const ITensor *src, ITensor *dst, const Window &window); + + /** Common signature for all reduction functions + * + * @param[in] src Input tensor + * @param[out] dst Output tensor + * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()). + */ + using CpuGemmLowpMatrixAReductionKernelPtr = void (CpuGemmLowpMatrixAReductionKernel::*)(const ITensor *src, + ITensor *dst, + const Window &window); + + CpuGemmLowpMatrixAReductionKernelPtr _func{nullptr}; + int32_t _k{0}; + int32_t _scalar{0}; + bool _mul_by_scalar{false}; +}; + +/** Kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B. + * + * @note This stage is needed to handle the offset of matrix product + * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md + */ +class CpuGemmLowpMatrixBReductionKernel : public ICpuKernel<CpuGemmLowpMatrixBReductionKernel> +{ +public: + /** Default constructor */ + CpuGemmLowpMatrixBReductionKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmLowpMatrixBReductionKernel); + /** Initialise the kernel's input and output. + * + * @param[in] src Input tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL + * @param[out] dst Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32 + * @param[in] info Kernel metadata: + * - k (num_mtx_b_rows) Number of matrix B rows. + * - is_reshaped (is_transposed1xW) True if the input tensor is transposed 1xW. + * - scalar Scalar value to multiply each reduced row by. + * - mul_byscalar True if each reduced row must be multiplied by a scalar value. 
+ */ + void configure(const ITensorInfo *src, ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuGemmLowpMatrixBReductionKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + +private: + /** Execution of the reduction kernel specialized on the input type + * + * @param[in] src Input tensor + * @param[in] dst Output tensor + * @param[in] window Execution window + * @param[in] info Thread-related information + */ + template <typename T> + void run_internal(const ITensor *src, ITensor *dst, const Window &window, const ThreadInfo &info); + + /** Common signature for all reduction functions + * + * @param[in] src Input tensor + * @param[out] dst Output tensor + * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()). + */ + using CpuGemmLowpMatrixBReductionKernelPtr = void (CpuGemmLowpMatrixBReductionKernel::*)(const ITensor *src, + ITensor *dst, + const Window &window, + const ThreadInfo &info); + + CpuGemmLowpMatrixBReductionKernelPtr _func{nullptr}; + int32_t _k{0}; + int32_t _scalar{0}; + bool _mul_by_scalar{false}; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_GEMMLOWP_REDUCTION_KERNEL_H */ diff --git a/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.cpp b/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.cpp new file mode 100644 index 0000000000..2a76a5958d --- /dev/null +++ b/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.cpp @@ -0,0 +1,721 @@ +/* + * Copyright (c) 2017-2022,2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +#include <arm_neon.h> + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + int32_t a_offset, + int32_t b_offset) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32, DataType::F32); + + // We run if the offset is nonzero or a sum col has been provided, we need + // the second option in case the QuantizationInfo is dynamic + if (a_offset != 0 || vector_sum_col != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0)); + } + + // We run if the offset is nonzero or a sum row has been provided, we need + // the second option in case the QuantizationInfo is dynamic + if (b_offset != 0 || vector_sum_row != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); + + // Check if input is a 3D reinterpretation + const bool reinterpret_as_3d = + mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); + + // Validate input + ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != + (mm_result->dimension(1) * mm_result->dimension(2))); + ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1)); + + TensorShape output_shape = mm_result->tensor_shape(); + if (output_shape.num_dimensions() > 1) + { + const unsigned int output_batch_idx = reinterpret_as_3d ? 3 : 2; + + TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape(); + vector_sum_row_shape.collapse_from(1); + output_shape.collapse_from(output_batch_idx); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx], + "mm_result tensor must have the same number of batches of output tensor"); + + if (vector_sum_col != nullptr) + { + TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape(); + vector_sum_col_shape.collapse_from(1); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && + vector_sum_col_shape[1] != vector_sum_row_shape[1], + "vector_sum_col tensor must have the same number of batches of " + "vector_sum_row_shape or the number of batches must be set to 1"); + } + } + } + + return Status{}; +} + +void run_offset_contribution_float(const Window &window, + ITensor *mm_result, + const ITensor *vector_sum_col, + const ITensor *vector_sum_row, + int32_t a_offset, + int32_t b_offset, + int32_t k_offset, + float scale, + bool slide_vector_sum_col, + bool is_gemm3d) +{ + Window collapsed_window = window.collapse_if_possible(window, Window::DimZ); + collapsed_window.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const int height_input = is_gemm3d ? mm_result->info()->dimension(1) : 0; + const int depth_input = is_gemm3d ? 
mm_result->info()->dimension(2) : 1; + + const int window_start_x = window.x().start(); + const int window_end_x = window.x().end(); + const int window_step_x = 16; + + // if vector_sum_col is nullptr then stride_y is 0, else get stride_y + const size_t sum_col_stride_y = (vector_sum_col != nullptr) ? (vector_sum_col->info()->strides_in_bytes().y()) : 0; + Iterator mm_result_it(mm_result, collapsed_window); + + if ((a_offset != 0) && (b_offset != 0) && (vector_sum_col != nullptr) && (vector_sum_row != nullptr)) // true, true + { + // Set window for vector_sum_col + Window win_vector_sum_col(collapsed_window); + win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0)); + win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + // Set window for vector_sum_row + Window win_vector_sum_row(collapsed_window); + win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0)); + win_vector_sum_row.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + Iterator vector_sum_col_it(vector_sum_col, win_vector_sum_col); + Iterator vector_sum_row_it(vector_sum_row, win_vector_sum_row); + + const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y(); + + // Offset in case vector_sum_col is batched + const int vector_sum_col_batch_offset = + slide_vector_sum_col ? vector_sum_col->info()->strides_in_bytes().z() : 0; + + execute_window_loop( + collapsed_window, + [&](const Coordinates &id) + { + const int batch_id = id.z() / depth_input; + const size_t batch_offset_col = batch_id * (sum_col_stride_y); + auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_offset_col + + batch_id * vector_sum_col_batch_offset); + auto mm_result_ptr = reinterpret_cast<float *>(mm_result_it.ptr()); + + // Compute the leftover term due to b_offset. + int32_t b_offset_term_s32 = + *(reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + + id.y() + (id.z() % depth_input) * height_input); + b_offset_term_s32 *= b_offset; + + const int32x4_t b_offset_term_s32_vec = vdupq_n_s32(b_offset_term_s32); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + // Compute the leftover term due to a_offset. 
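+                    // The full per-element correction applied below is
+                    //   (k_offset + vector_sum_col[x] * a_offset + b_offset_term_s32) * scale
+                    // where k_offset = a_offset * b_offset * k, matching the documented offset-contribution formula.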
+ int32x4x4_t a_offset_term_s32 = { + {vld1q_s32(vector_sum_col_ptr + x + 0), vld1q_s32(vector_sum_col_ptr + x + 4), + vld1q_s32(vector_sum_col_ptr + x + 8), vld1q_s32(vector_sum_col_ptr + x + 12)}}; + + a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset); + a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset); + a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset); + a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset); + + // Add a_offset_term_s32 and b_offset_term_s32 + int32x4x4_t offset_term_s32 = { + {vdupq_n_s32(k_offset), vdupq_n_s32(k_offset), vdupq_n_s32(k_offset), vdupq_n_s32(k_offset)}}; + + offset_term_s32.val[0] = + vaddq_s32(offset_term_s32.val[0], vaddq_s32(a_offset_term_s32.val[0], b_offset_term_s32_vec)); + offset_term_s32.val[1] = + vaddq_s32(offset_term_s32.val[1], vaddq_s32(a_offset_term_s32.val[1], b_offset_term_s32_vec)); + offset_term_s32.val[2] = + vaddq_s32(offset_term_s32.val[2], vaddq_s32(a_offset_term_s32.val[2], b_offset_term_s32_vec)); + offset_term_s32.val[3] = + vaddq_s32(offset_term_s32.val[3], vaddq_s32(a_offset_term_s32.val[3], b_offset_term_s32_vec)); + + float32x4x4_t in_f32 = {{vld1q_f32(mm_result_ptr + x + 0), vld1q_f32(mm_result_ptr + x + 4), + vld1q_f32(mm_result_ptr + x + 8), vld1q_f32(mm_result_ptr + x + 12)}}; + + // Convert and scale the S32 offsets to match the already scaled GEMM results + float32x4x4_t offset_terms_scaled = {{ + vmulq_n_f32(vcvtq_f32_s32(offset_term_s32.val[0]), scale), + vmulq_n_f32(vcvtq_f32_s32(offset_term_s32.val[1]), scale), + vmulq_n_f32(vcvtq_f32_s32(offset_term_s32.val[2]), scale), + vmulq_n_f32(vcvtq_f32_s32(offset_term_s32.val[3]), scale), + }}; + + // Add the offset terms to the GEMM result + in_f32.val[0] = vaddq_f32(in_f32.val[0], offset_terms_scaled.val[0]); + in_f32.val[1] = vaddq_f32(in_f32.val[1], offset_terms_scaled.val[1]); + in_f32.val[2] = vaddq_f32(in_f32.val[2], offset_terms_scaled.val[2]); + in_f32.val[3] = vaddq_f32(in_f32.val[3], offset_terms_scaled.val[3]); + + // Store the result with the offset contribution + vst1q_f32(mm_result_ptr + x + 0, in_f32.val[0]); + vst1q_f32(mm_result_ptr + x + 4, in_f32.val[1]); + vst1q_f32(mm_result_ptr + x + 8, in_f32.val[2]); + vst1q_f32(mm_result_ptr + x + 12, in_f32.val[3]); + } + + // Left-overs loop + for (; x < window_end_x; ++x) + { + // Compute the leftover term due to a_offset. 
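+                    // Scalar tail of the vectorized loop above: the same scaled correction is applied
+                    // one element at a time for columns that do not fill a whole 16-wide step.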
+ int32_t a_offset_term_s32 = *(vector_sum_col_ptr + x); + + a_offset_term_s32 *= a_offset; + + // Add the offset terms to GEMM's result + // Store the result with the offset contribution + mm_result_ptr[x] += (k_offset + a_offset_term_s32 + b_offset_term_s32) * scale; + } + }, + vector_sum_col_it, vector_sum_row_it, mm_result_it); + } + else if ((a_offset == 0) && (b_offset != 0) && (vector_sum_row != nullptr)) // false, true + { + ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_row); + + // Set window for vector_sum_row + Window win_vector_sum_row(collapsed_window); + win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0)); + win_vector_sum_row.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + Iterator vector_sum_row_it(vector_sum_row, win_vector_sum_row); + + const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y(); + + execute_window_loop( + collapsed_window, + [&](const Coordinates &id) + { + const int batch_id = id.z() / depth_input; + auto mm_result_ptr = reinterpret_cast<float *>(mm_result_it.ptr()); + + // Compute the leftover term due to b_offset. + int32_t row_sum = + *(reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + + id.y() + (id.z() % depth_input) * height_input); + float scaled_b_offset_term_f32 = row_sum * b_offset * scale; + + const float32x4_t b_offset_term_f32_vec = vdupq_n_f32(scaled_b_offset_term_f32); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + float32x4x4_t in_f32 = {{vld1q_f32(mm_result_ptr + x + 0), vld1q_f32(mm_result_ptr + x + 4), + vld1q_f32(mm_result_ptr + x + 8), vld1q_f32(mm_result_ptr + x + 12)}}; + + // Add the offset terms to GEMM's result + in_f32.val[0] = vaddq_f32(in_f32.val[0], b_offset_term_f32_vec); + in_f32.val[1] = vaddq_f32(in_f32.val[1], b_offset_term_f32_vec); + in_f32.val[2] = vaddq_f32(in_f32.val[2], b_offset_term_f32_vec); + in_f32.val[3] = vaddq_f32(in_f32.val[3], b_offset_term_f32_vec); + + // Store the result with the offset contribution + vst1q_f32(mm_result_ptr + x + 0, in_f32.val[0]); + vst1q_f32(mm_result_ptr + x + 4, in_f32.val[1]); + vst1q_f32(mm_result_ptr + x + 8, in_f32.val[2]); + vst1q_f32(mm_result_ptr + x + 12, in_f32.val[3]); + } + + // Left-overs loop + for (; x < window_end_x; ++x) + { + // Add the offset terms to GEMM's result + // Store the result with the offset contribution + mm_result_ptr[x] += scaled_b_offset_term_f32; + } + }, + vector_sum_row_it, mm_result_it); + } + else if ((a_offset != 0) && (b_offset == 0) && (vector_sum_col != nullptr)) // true, false + { + // Set window for vector_sum_col + Window win_vector_sum_col(collapsed_window); + win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0)); + win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + Iterator vector_sum_col_it(vector_sum_col, win_vector_sum_col); + + // Offset in case vector_sum_col is batched + const int vector_sum_col_batch_offset = + slide_vector_sum_col ? 
vector_sum_col->info()->strides_in_bytes().z() : 0; + + execute_window_loop( + collapsed_window, + [&](const Coordinates &id) + { + const int batch_id = id.z() / depth_input; + const size_t batch_offset_col = + batch_id * + (sum_col_stride_y); // Value to offset vector_sum_col_ptr to allow for iteration of y values in tensor + auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_offset_col + + batch_id * vector_sum_col_batch_offset); + auto mm_result_ptr = reinterpret_cast<float *>(mm_result_it.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + // Compute the leftover term due to a_offset. + int32x4x4_t a_offset_term_s32 = { + {vld1q_s32(vector_sum_col_ptr + x + 0), vld1q_s32(vector_sum_col_ptr + x + 4), + vld1q_s32(vector_sum_col_ptr + x + 8), vld1q_s32(vector_sum_col_ptr + x + 12)}}; + + a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset); + a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset); + a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset); + a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset); + + float32x4x4_t a_offset_term_scaled = {{ + vmulq_n_f32(vcvtq_f32_s32(a_offset_term_s32.val[0]), scale), + vmulq_n_f32(vcvtq_f32_s32(a_offset_term_s32.val[1]), scale), + vmulq_n_f32(vcvtq_f32_s32(a_offset_term_s32.val[2]), scale), + vmulq_n_f32(vcvtq_f32_s32(a_offset_term_s32.val[3]), scale), + }}; + + float32x4x4_t in_f32 = {{vld1q_f32(mm_result_ptr + x + 0), vld1q_f32(mm_result_ptr + x + 4), + vld1q_f32(mm_result_ptr + x + 8), vld1q_f32(mm_result_ptr + x + 12)}}; + + // Add the offset terms to GEMM's result + in_f32.val[0] = vaddq_f32(in_f32.val[0], a_offset_term_scaled.val[0]); + in_f32.val[1] = vaddq_f32(in_f32.val[1], a_offset_term_scaled.val[1]); + in_f32.val[2] = vaddq_f32(in_f32.val[2], a_offset_term_scaled.val[2]); + in_f32.val[3] = vaddq_f32(in_f32.val[3], a_offset_term_scaled.val[3]); + + // Store the result with the offset contribution + vst1q_f32(mm_result_ptr + x + 0, in_f32.val[0]); + vst1q_f32(mm_result_ptr + x + 4, in_f32.val[1]); + vst1q_f32(mm_result_ptr + x + 8, in_f32.val[2]); + vst1q_f32(mm_result_ptr + x + 12, in_f32.val[3]); + } + + // Left-overs loop + for (; x < window_end_x; ++x) + { + // Compute the leftover term due to a_offset. + const int32_t a_offset_term_s32 = *(vector_sum_col_ptr + x); + + // Add the offset terms to GEMM's result + // Store the result with the offset contribution + mm_result_ptr[x] += a_offset_term_s32 * a_offset * scale; + } + }, + vector_sum_col_it, mm_result_it); + } + else // false, false + { + // No offset contribution from matrix A and matrix B + return; + } +} + +void run_offset_contribution(const Window &window, + ITensor *mm_result, + const ITensor *vector_sum_col, + const ITensor *vector_sum_row, + int32_t a_offset, + int32_t b_offset, + int32_t k_offset, + bool slide_vector_sum_col, + bool is_gemm3d) +{ + Window collapsed_window = window.collapse_if_possible(window, Window::DimZ); + collapsed_window.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const int height_input = is_gemm3d ? mm_result->info()->dimension(1) : 0; + const int depth_input = is_gemm3d ? 
mm_result->info()->dimension(2) : 1; + + const int window_start_x = window.x().start(); + const int window_end_x = window.x().end(); + const int window_step_x = 16; + + // if vector_sum_col is nullptr then stride_y is 0, else get stride_y + const size_t sum_col_stride_y = (vector_sum_col != nullptr) ? (vector_sum_col->info()->strides_in_bytes().y()) : 0; + Iterator mm_result_it(mm_result, collapsed_window); + + if ((a_offset != 0) && (b_offset != 0) && (vector_sum_col != nullptr) && (vector_sum_row != nullptr)) // true, true + { + // Set window for vector_sum_col + Window win_vector_sum_col(collapsed_window); + win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0)); + win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + // Set window for vector_sum_row + Window win_vector_sum_row(collapsed_window); + win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0)); + win_vector_sum_row.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + Iterator vector_sum_col_it(vector_sum_col, win_vector_sum_col); + Iterator vector_sum_row_it(vector_sum_row, win_vector_sum_row); + + const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y(); + + // Offset in case vector_sum_col is batched + const int vector_sum_col_batch_offset = + slide_vector_sum_col ? vector_sum_col->info()->strides_in_bytes().z() : 0; + + execute_window_loop( + collapsed_window, + [&](const Coordinates &id) + { + const int batch_id = id.z() / depth_input; + const size_t batch_offset_col = batch_id * (sum_col_stride_y); + auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_offset_col + + batch_id * vector_sum_col_batch_offset); + auto mm_result_ptr = reinterpret_cast<int32_t *>(mm_result_it.ptr()); + + // Compute the leftover term due to b_offset. + int32_t b_offset_term_s32 = + *(reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + + id.y() + (id.z() % depth_input) * height_input); + b_offset_term_s32 *= b_offset; + + const int32x4_t b_offset_term_s32_vec = vdupq_n_s32(b_offset_term_s32); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + // Compute the leftover term due to a_offset. 
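+                    // Integer counterpart of the F32 branch: the same
+                    //   k_offset + vector_sum_col[x] * a_offset + b_offset_term_s32
+                    // correction is added straight onto the S32 accumulators, with no rescaling.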
+ int32x4x4_t a_offset_term_s32 = { + {vld1q_s32(vector_sum_col_ptr + x + 0), vld1q_s32(vector_sum_col_ptr + x + 4), + vld1q_s32(vector_sum_col_ptr + x + 8), vld1q_s32(vector_sum_col_ptr + x + 12)}}; + + a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset); + a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset); + a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset); + a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset); + + // Add a_offset_term_s32 and b_offset_term_s32 + int32x4x4_t offset_term_s32 = { + {vdupq_n_s32(k_offset), vdupq_n_s32(k_offset), vdupq_n_s32(k_offset), vdupq_n_s32(k_offset)}}; + + offset_term_s32.val[0] = + vaddq_s32(offset_term_s32.val[0], vaddq_s32(a_offset_term_s32.val[0], b_offset_term_s32_vec)); + offset_term_s32.val[1] = + vaddq_s32(offset_term_s32.val[1], vaddq_s32(a_offset_term_s32.val[1], b_offset_term_s32_vec)); + offset_term_s32.val[2] = + vaddq_s32(offset_term_s32.val[2], vaddq_s32(a_offset_term_s32.val[2], b_offset_term_s32_vec)); + offset_term_s32.val[3] = + vaddq_s32(offset_term_s32.val[3], vaddq_s32(a_offset_term_s32.val[3], b_offset_term_s32_vec)); + + int32x4x4_t in_s32 = {{vld1q_s32(mm_result_ptr + x + 0), vld1q_s32(mm_result_ptr + x + 4), + vld1q_s32(mm_result_ptr + x + 8), vld1q_s32(mm_result_ptr + x + 12)}}; + + // Add the offset terms to GEMM's result + in_s32.val[0] = vaddq_s32(in_s32.val[0], offset_term_s32.val[0]); + in_s32.val[1] = vaddq_s32(in_s32.val[1], offset_term_s32.val[1]); + in_s32.val[2] = vaddq_s32(in_s32.val[2], offset_term_s32.val[2]); + in_s32.val[3] = vaddq_s32(in_s32.val[3], offset_term_s32.val[3]); + + // Store the result with the offset contribution + vst1q_s32(mm_result_ptr + x + 0, in_s32.val[0]); + vst1q_s32(mm_result_ptr + x + 4, in_s32.val[1]); + vst1q_s32(mm_result_ptr + x + 8, in_s32.val[2]); + vst1q_s32(mm_result_ptr + x + 12, in_s32.val[3]); + } + + // Left-overs loop + for (; x < window_end_x; ++x) + { + // Compute the leftover term due to a_offset. + int32_t a_offset_term_s32 = *(vector_sum_col_ptr + x); + + a_offset_term_s32 *= a_offset; + + // Add the offset terms to GEMM's result + // Store the result with the offset contribution + mm_result_ptr[x] += k_offset + a_offset_term_s32 + b_offset_term_s32; + } + }, + vector_sum_col_it, vector_sum_row_it, mm_result_it); + } + else if ((a_offset == 0) && (b_offset != 0) && (vector_sum_row != nullptr)) // false, true + { + ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_row); + + // Set window for vector_sum_row + Window win_vector_sum_row(collapsed_window); + win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0)); + win_vector_sum_row.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + Iterator vector_sum_row_it(vector_sum_row, win_vector_sum_row); + + const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y(); + + execute_window_loop( + collapsed_window, + [&](const Coordinates &id) + { + const int batch_id = id.z() / depth_input; + auto mm_result_ptr = reinterpret_cast<int32_t *>(mm_result_it.ptr()); + + // Compute the leftover term due to b_offset. 
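+                // With a_offset == 0 the correction collapses to a single per-row constant: the current
+                // row's entry of vector_sum_row multiplied by b_offset, broadcast across x below.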
+ int32_t b_offset_term_s32 = + *(reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + + id.y() + (id.z() % depth_input) * height_input); + b_offset_term_s32 *= b_offset; + + const int32x4_t b_offset_term_s32_vec = vdupq_n_s32(b_offset_term_s32); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + int32x4x4_t in_s32 = {{vld1q_s32(mm_result_ptr + x + 0), vld1q_s32(mm_result_ptr + x + 4), + vld1q_s32(mm_result_ptr + x + 8), vld1q_s32(mm_result_ptr + x + 12)}}; + + // Add the offset terms to GEMM's result + in_s32.val[0] = vaddq_s32(in_s32.val[0], b_offset_term_s32_vec); + in_s32.val[1] = vaddq_s32(in_s32.val[1], b_offset_term_s32_vec); + in_s32.val[2] = vaddq_s32(in_s32.val[2], b_offset_term_s32_vec); + in_s32.val[3] = vaddq_s32(in_s32.val[3], b_offset_term_s32_vec); + + // Store the result with the offset contribution + vst1q_s32(mm_result_ptr + x + 0, in_s32.val[0]); + vst1q_s32(mm_result_ptr + x + 4, in_s32.val[1]); + vst1q_s32(mm_result_ptr + x + 8, in_s32.val[2]); + vst1q_s32(mm_result_ptr + x + 12, in_s32.val[3]); + } + + // Left-overs loop + for (; x < window_end_x; ++x) + { + // Add the offset terms to GEMM's result + // Store the result with the offset contribution + mm_result_ptr[x] += b_offset_term_s32; + } + }, + vector_sum_row_it, mm_result_it); + } + else if ((a_offset != 0) && (b_offset == 0) && (vector_sum_col != nullptr)) // true, false + { + // Set window for vector_sum_col + Window win_vector_sum_col(collapsed_window); + win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0)); + win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + Iterator vector_sum_col_it(vector_sum_col, win_vector_sum_col); + + // Offset in case vector_sum_col is batched + const int vector_sum_col_batch_offset = + slide_vector_sum_col ? vector_sum_col->info()->strides_in_bytes().z() : 0; + + execute_window_loop( + collapsed_window, + [&](const Coordinates &id) + { + const int batch_id = id.z() / depth_input; + const size_t batch_offset_col = + batch_id * + (sum_col_stride_y); // Value to offset vector_sum_col_ptr to allow for iteration of y values in tensor + auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_offset_col + + batch_id * vector_sum_col_batch_offset); + auto mm_result_ptr = reinterpret_cast<int32_t *>(mm_result_it.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + // Compute the leftover term due to a_offset. 
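+                    // With b_offset == 0 both the row term and k_offset vanish, so only the per-column
+                    // vector_sum_col[x] * a_offset contribution is added here.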
+ int32x4x4_t a_offset_term_s32 = { + {vld1q_s32(vector_sum_col_ptr + x + 0), vld1q_s32(vector_sum_col_ptr + x + 4), + vld1q_s32(vector_sum_col_ptr + x + 8), vld1q_s32(vector_sum_col_ptr + x + 12)}}; + + a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset); + a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset); + a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset); + a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset); + + int32x4x4_t in_s32 = {{vld1q_s32(mm_result_ptr + x + 0), vld1q_s32(mm_result_ptr + x + 4), + vld1q_s32(mm_result_ptr + x + 8), vld1q_s32(mm_result_ptr + x + 12)}}; + + // Add the offset terms to GEMM's result + in_s32.val[0] = vaddq_s32(in_s32.val[0], a_offset_term_s32.val[0]); + in_s32.val[1] = vaddq_s32(in_s32.val[1], a_offset_term_s32.val[1]); + in_s32.val[2] = vaddq_s32(in_s32.val[2], a_offset_term_s32.val[2]); + in_s32.val[3] = vaddq_s32(in_s32.val[3], a_offset_term_s32.val[3]); + + // Store the result with the offset contribution + vst1q_s32(mm_result_ptr + x + 0, in_s32.val[0]); + vst1q_s32(mm_result_ptr + x + 4, in_s32.val[1]); + vst1q_s32(mm_result_ptr + x + 8, in_s32.val[2]); + vst1q_s32(mm_result_ptr + x + 12, in_s32.val[3]); + } + + // Left-overs loop + for (; x < window_end_x; ++x) + { + // Compute the leftover term due to a_offset. + const int32_t a_offset_term_s32 = *(vector_sum_col_ptr + x); + + // Add the offset terms to GEMM's result + // Store the result with the offset contribution + mm_result_ptr[x] += a_offset_term_s32 * a_offset; + } + }, + vector_sum_col_it, mm_result_it); + } + else // false, false + { + // No offset contribution from matrix A and matrix B + return; + } +} +} // namespace + +void CpuGemmLowpOffsetContributionKernel::configure(ITensorInfo *mm_result, + ITensorInfo *vector_sum_col, + ITensorInfo *vector_sum_row, + int32_t k, + int32_t a_offset, + int32_t b_offset, + float scale) +{ + // Perform validate step + ARM_COMPUTE_UNUSED(vector_sum_row); + ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result, vector_sum_col, vector_sum_row, a_offset, b_offset)); + + _a_offset = a_offset; + _b_offset = b_offset; + _k = k; + + _scale = scale; + + if (vector_sum_col != nullptr) + { + // Check if vector_sum_col_shape should be slidden or not + // Don't slide vector_sum_col_shape along the y dimension if vector_sum_col_shape has just 1 dimension and vector_sum_row_shape more than 1 + // This scenario can happen when the the matrix multiplication is used to perform a convolution operation + _slide_vector_sum_col = vector_sum_col->tensor_shape().num_dimensions() > 1; + } + + // Configure kernel window + Window win = calculate_max_window(*mm_result, Steps()); + ICpuKernel::configure(win); +} + +void CpuGemmLowpOffsetContributionKernel::set_a_offset(int32_t a_offset) +{ + _a_offset = a_offset; +} + +void CpuGemmLowpOffsetContributionKernel::set_b_offset(int32_t b_offset) +{ + _b_offset = b_offset; +} + +void CpuGemmLowpOffsetContributionKernel::set_scale(float scale) +{ + _scale = scale; +} + +Status CpuGemmLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + int32_t a_offset, + int32_t b_offset) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, a_offset, b_offset)); + return Status{}; +} + +void CpuGemmLowpOffsetContributionKernel::run_op(ITensorPack &tensors, const Window 
&window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + + auto vector_sum_col = tensors.get_const_tensor(TensorType::ACL_SRC_0); + auto vector_sum_row = tensors.get_const_tensor(TensorType::ACL_SRC_1); + auto mm_result = tensors.get_tensor(TensorType::ACL_DST); + + // Check if input is a 3D reinterpretation + const bool reinterpret_as_3d = vector_sum_row != nullptr && mm_result->info()->num_dimensions() > 1 && + mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x(); + + // check to see what is the output type of result + auto k_offset = _a_offset * _b_offset * _k; + if (mm_result->info()->data_type() == DataType::F32) + { + run_offset_contribution_float(window, mm_result, vector_sum_col, vector_sum_row, _a_offset, _b_offset, k_offset, + _scale, _slide_vector_sum_col, reinterpret_as_3d); + } + else + { + run_offset_contribution(window, mm_result, vector_sum_col, vector_sum_row, _a_offset, _b_offset, k_offset, + _slide_vector_sum_col, reinterpret_as_3d); + } +} + +const char *CpuGemmLowpOffsetContributionKernel::name() const +{ + return "CpuGemmLowpOffsetContributionKernel"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h b/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h new file mode 100644 index 0000000000..ecbfb0c282 --- /dev/null +++ b/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2017-2022,2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_CPU_KERNELS_CPUGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H +#define ACL_SRC_CPU_KERNELS_CPUGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +#include <cstdint> + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Kernel used to add the offset contribution after @ref CpuGemmLowpMatrixMultiplyKernel. The computation is performed in-place + * + * This kernel takes a final int32 accumulator value (the output of @ref CpuGemmLowpMatrixMultiplyKernel), + * and adds to it the offset contribution of matrix A and matrix B in-place. 
+ * + * The final result is: + * + * mm_result[i][k] = mm_result[i][k] + + * (vector_sum_col[k] * a_offset) + + * (vector_sum_row[i] * b_offset) + + * (a_offset * b_offset * k) + * + */ +class CpuGemmLowpOffsetContributionKernel : public ICpuKernel<CpuGemmLowpOffsetContributionKernel> +{ +public: + /** Default constructor */ + CpuGemmLowpOffsetContributionKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmLowpOffsetContributionKernel); + /** Initialise the kernel's input and output. + * + * @param[in, out] mm_result Input tensor containing the result of @ref CpuGemmLowpMatrixMultiplyKernel. Data type supported: S32 + * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B. + * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result + * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A. + * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result + * @param[in] k Number of matrix A columns or Matrix B rows + * @param[in] a_offset Offset to be added to each element of the matrix A. + * @param[in] b_offset Offset to be added to each element of the matrix B. + * @param[in] scale (Optional) multiplies the contribution to make it the same scale as the dst in the case where mm_result is float + * (and so has already been scaled). Default is 1.0 + */ + void configure(ITensorInfo *mm_result, + ITensorInfo *vector_sum_col, + ITensorInfo *vector_sum_row, + int32_t k, + int32_t a_offset, + int32_t b_offset, + float scale = 1.0f); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuGemmLowpOffsetContributionKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + int32_t a_offset, + int32_t b_offset); + + /** Set the a offset + * Warning: if a_offset is non-zero then vector_sum_col must be set in run_op. + * Run configure or validate again if you aren't sure + * + * @param[in] a_offset Offset to be added to each element of the matrix A. + */ + void set_a_offset(int32_t a_offset); + + /** Set the b offset + * Warning: if b_offset is non-zero then vector_sum_row must be set in run_op. + * Run configure or validate again if you aren't sure + * + * @param[in] b_offset Offset to be added to each element of the matrix B. + */ + void set_b_offset(int32_t b_offset); + + /** Set the dequantize scale + * + * @param[in] scale Multiplies the contribution to make it the same scale as the dst in the case where + * mm_result is float (and so has already been scaled). 
+ */ + void set_scale(float scale); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + +private: + int32_t _a_offset{0}; + int32_t _b_offset{0}; + int32_t _k{0}; // Number of columns of A or rows of B, used in last offset term + float _scale{1.0}; + bool _slide_vector_sum_col{true}; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif // ACL_SRC_CPU_KERNELS_CPUGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H diff --git a/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.cpp b/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.cpp new file mode 100644 index 0000000000..3c113f2828 --- /dev/null +++ b/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.cpp @@ -0,0 +1,1036 @@ +/* + * Copyright (c) 2019-2021, 2023-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/wrapper/wrapper.h" + +#include <arm_neon.h> + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +inline int32x4x4_t load_results_input(const Iterator &mm_result_it, int32_t x) +{ + return {{vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x + 0), + vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x + 4), + vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x + 8), + vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x + 12)}}; +} + +inline int32x4x4_t load(const int32_t *ptr, int32_t x) +{ + return {{vld1q_s32(ptr + x + 0), vld1q_s32(ptr + x + 4), vld1q_s32(ptr + x + 8), vld1q_s32(ptr + x + 12)}}; +} + +inline int32x4x4_t add_s32(int32x4x4_t a, int32x4_t b) +{ + return {{vaddq_s32(a.val[0], b), vaddq_s32(a.val[1], b), vaddq_s32(a.val[2], b), vaddq_s32(a.val[3], b)}}; +} + +inline int32x4x4_t add_s32(int32x4x4_t a, int32x4x4_t b) +{ + return {{vaddq_s32(a.val[0], b.val[0]), vaddq_s32(a.val[1], b.val[1]), vaddq_s32(a.val[2], b.val[2]), + vaddq_s32(a.val[3], b.val[3])}}; +} + +inline int32x4x4_t mul_s32(int32x4x4_t &a, int32_t mul_scalar) +{ + return {{vmulq_n_s32(a.val[0], mul_scalar), vmulq_n_s32(a.val[1], mul_scalar), vmulq_n_s32(a.val[2], mul_scalar), + vmulq_n_s32(a.val[3], mul_scalar)}}; +} + +inline int32x4x4_t mul_s32(int32x4x4_t &a, const int32_t *multilpier) +{ + return {{vmulq_s32(a.val[0], vld1q_s32(multilpier)), vmulq_s32(a.val[1], vld1q_s32(multilpier + 4)), + vmulq_s32(a.val[2], vld1q_s32(multilpier + 8)), vmulq_s32(a.val[3], vld1q_s32(multilpier + 12))}}; +} + +inline int32x4x4_t get_a_offset(const int32_t *vector_sum_col_ptr, int32_t a_offset, int32_t x) +{ + int32x4x4_t a_offset_term_s32 = load(vector_sum_col_ptr, x); + + a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset); + a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset); + a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset); + a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset); + return a_offset_term_s32; +} + +inline int32x4_t get_b_offset(const int32_t *vector_sum_row_ptr, int32_t b_offset) +{ + int32x4_t b_offset_term_s32 = vld1q_dup_s32(vector_sum_row_ptr); + b_offset_term_s32 = vmulq_n_s32(b_offset_term_s32, b_offset); + return b_offset_term_s32; +} + +inline int32x4x4_t get_k_offset(int32_t k_offset) +{ + return {{vdupq_n_s32(k_offset), vdupq_n_s32(k_offset), vdupq_n_s32(k_offset), vdupq_n_s32(k_offset)}}; +} + +inline uint8x16_t finalize_quantization_floating_point( + int32x4x4_t &in_s32, int32x4_t result_shift_s32, uint8x16_t min_u8, uint8x16_t max_u8, bool is_bounded_relu) +{ + const static int32x4_t zero_s32 = vdupq_n_s32(0); + + // Shift final result (negative value shift right) + in_s32.val[0] = vshlq_s32(in_s32.val[0], result_shift_s32); + in_s32.val[1] = vshlq_s32(in_s32.val[1], result_shift_s32); + in_s32.val[2] = vshlq_s32(in_s32.val[2], result_shift_s32); + in_s32.val[3] = 
vshlq_s32(in_s32.val[3], result_shift_s32); + + // Saturate negative values + in_s32.val[0] = vmaxq_s32(in_s32.val[0], zero_s32); + in_s32.val[1] = vmaxq_s32(in_s32.val[1], zero_s32); + in_s32.val[2] = vmaxq_s32(in_s32.val[2], zero_s32); + in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32); + + // Convert S32 to S16 + const int16x8x2_t in_s16 = {{vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), + vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))}}; + + // Convert S16 to U8 + uint8x16_t out_u8 = vcombine_u8(vqmovun_s16(in_s16.val[0]), vqmovun_s16(in_s16.val[1])); + + if (is_bounded_relu) + { + out_u8 = vmaxq_u8(out_u8, min_u8); + out_u8 = vminq_u8(out_u8, max_u8); + } + + return out_u8; +} + +inline int8x16_t finalize_quantization_floating_point( + int32x4x4_t &in_s32, int32x4_t result_shift_s32, int8x16_t min_s8, int8x16_t max_s8, bool is_bounded_relu) +{ + const static int32x4_t zero_s32 = vdupq_n_s32(0); + + // Shift final result (negative value shift right) + in_s32.val[0] = vshlq_s32(in_s32.val[0], result_shift_s32); + in_s32.val[1] = vshlq_s32(in_s32.val[1], result_shift_s32); + in_s32.val[2] = vshlq_s32(in_s32.val[2], result_shift_s32); + in_s32.val[3] = vshlq_s32(in_s32.val[3], result_shift_s32); + + // Saturate negative values + in_s32.val[0] = vmaxq_s32(in_s32.val[0], zero_s32); + in_s32.val[1] = vmaxq_s32(in_s32.val[1], zero_s32); + in_s32.val[2] = vmaxq_s32(in_s32.val[2], zero_s32); + in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32); + + // Convert S32 to S16 + const int16x8x2_t in_s16 = {{vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), + vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))}}; + + // Convert S16 to S8 + int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1])); + + if (is_bounded_relu) + { + out_s8 = vmaxq_s8(out_s8, min_s8); + out_s8 = vminq_s8(out_s8, max_s8); + } + + return out_s8; +} + +inline int8x16_t finalize_quantization_floating_point( + int32x4x4_t &in_s32, int32x4x4_t result_shift_s32, int8x16_t min_s8, int8x16_t max_s8, bool is_bounded_relu) +{ + const static int32x4_t zero_s32 = vdupq_n_s32(0); + + // Shift final result (negative value shift right) + in_s32.val[0] = vshlq_s32(in_s32.val[0], vnegq_s32(result_shift_s32.val[0])); + in_s32.val[1] = vshlq_s32(in_s32.val[1], vnegq_s32(result_shift_s32.val[1])); + in_s32.val[2] = vshlq_s32(in_s32.val[2], vnegq_s32(result_shift_s32.val[2])); + in_s32.val[3] = vshlq_s32(in_s32.val[3], vnegq_s32(result_shift_s32.val[3])); + + // Saturate negative values + in_s32.val[0] = vmaxq_s32(in_s32.val[0], zero_s32); + in_s32.val[1] = vmaxq_s32(in_s32.val[1], zero_s32); + in_s32.val[2] = vmaxq_s32(in_s32.val[2], zero_s32); + in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32); + + // Convert S32 to S16 + const int16x8x2_t in_s16 = {{vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), + vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))}}; + + // Convert S16 to S8 + int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1])); + + if (is_bounded_relu) + { + out_s8 = vmaxq_s8(out_s8, min_s8); + out_s8 = vminq_s8(out_s8, max_s8); + } + + return out_s8; +} + +template <typename T> +struct VectorTyper +{ + using stype = T; + using vtype = typename wrapper::traits::neon_bitvector_t<T, wrapper::traits::BitWidth::W128>; +}; + +inline Window get_win_vector_sum(const Window &window) +{ + Window win_vector_sum(window); + win_vector_sum.set(Window::DimY, Window::Dimension(0, 0, 
0)); + win_vector_sum.set(Window::DimZ, Window::Dimension(0, 0, 0)); + return win_vector_sum; +} + +inline Iterator get_vector_sum_col_it(const Window &window, const ITensor *vector_sum_col) +{ + Iterator vector_sum_col_it(vector_sum_col, get_win_vector_sum(window)); + return vector_sum_col_it; +} + +inline Iterator get_vector_sum_row_it(const Window &window, const ITensor *vector_sum_row) +{ + Window win_vector_sum_row = get_win_vector_sum(window); + win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0)); + Iterator vector_sum_row_it(vector_sum_row, win_vector_sum_row); + return vector_sum_row_it; +} + +inline Iterator get_bias_it(const Window &window, const ITensor *bias) +{ + Window win_bias(window); + win_bias.set(Window::DimY, Window::Dimension(0, 1, 1)); + win_bias.set(Window::DimZ, Window::Dimension(0, 1, 1)); + Iterator bias_it(bias, win_bias); + return bias_it; +} + +template <typename VT> +inline void run_offset_contribution_output_stage_window(const int32_t *vector_sum_col_ptr, + const int32_t *vector_sum_row_ptr, + const int32_t *bias_ptr, + Iterator mm_result_it, + Iterator out_it, + const int32x4_t result_offset_s32, + const int32x4_t result_shift_s32, + typename VT::vtype min_vec, + typename VT::vtype max_vec, + int32_t a_offset, + int32_t b_offset, + int32_t k_offset, + int32_t multiplier, + int32_t shift, + int32_t offset, + int32_t min_bound, + int32_t max_bound, + int window_step_x, + int window_start_x, + int window_end_x, + bool has_a_offset, + bool has_b_offset, + bool has_bias, + bool is_bounded_relu, + bool is_fixed_point) +{ + int32x4x4_t offset_term_s32 = {0, 0, 0, 0}; + if (!is_fixed_point) + { + // Combine quantization offset with other offsets. + offset_term_s32 = add_s32(offset_term_s32, result_offset_s32); + } + if (has_a_offset && has_b_offset) + { + offset_term_s32 = add_s32(offset_term_s32, get_k_offset(k_offset)); + } + if (has_b_offset) + { + offset_term_s32 = add_s32(offset_term_s32, get_b_offset(vector_sum_row_ptr, b_offset)); + } + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + int32x4x4_t in_s32 = load_results_input(mm_result_it, x); + + if (has_a_offset) + { + in_s32 = add_s32(in_s32, get_a_offset(vector_sum_col_ptr, a_offset, x)); + } + if (has_bias) + { + in_s32 = add_s32(in_s32, load(bias_ptr, x)); + } + if (!is_fixed_point || has_b_offset) + { + in_s32 = add_s32(in_s32, offset_term_s32); + } + if (!is_fixed_point) + { + in_s32 = mul_s32(in_s32, multiplier); + } + + if (is_fixed_point) + { + wrapper::vstore( + reinterpret_cast<typename VT::stype *>(out_it.ptr() + x), + finalize_quantization(in_s32, multiplier, shift, result_offset_s32, min_vec, max_vec, is_bounded_relu)); + } + else + { + wrapper::vstore( + reinterpret_cast<typename VT::stype *>(out_it.ptr() + x), + finalize_quantization_floating_point(in_s32, result_shift_s32, min_vec, max_vec, is_bounded_relu)); + } + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + int32_t in_value = + *(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x) + wrapper::vgetlane(offset_term_s32.val[0], 0); + + if (has_a_offset) + { + in_value += (*(vector_sum_col_ptr + x) * a_offset); + } + if (has_bias) + { + in_value += *(bias_ptr + x); + } + + if (is_fixed_point) + { + // Finalize and store the result + *reinterpret_cast<typename VT::stype *>(out_it.ptr() + x) = + finalize_quantization(in_value, multiplier, shift, offset, static_cast<typename VT::stype>(min_bound), + static_cast<typename VT::stype>(max_bound), 
is_bounded_relu); + } + else + { + // Finalize quantization + in_value = (in_value * multiplier) >> shift; + + // Bound and store the result + if (is_bounded_relu) + { + in_value = static_cast<typename VT::stype>( + std::max<int32_t>(min_bound, std::min<int32_t>(max_bound, in_value))); + } + *reinterpret_cast<typename VT::stype *>(out_it.ptr() + x) = + static_cast<typename VT::stype>(std::max<int32_t>( + static_cast<int32_t>(std::numeric_limits<typename VT::stype>::lowest()), + std::min<int32_t>(static_cast<int32_t>(std::numeric_limits<typename VT::stype>::max()), in_value))); + } + } +} + +inline void run_offset_contribution_output_stage_window_symm(const int32_t *vector_sum_col_ptr, + const int32_t *bias_ptr, + Iterator mm_result_it, + Iterator out_it, + const int32_t *result_multipliers, + const int32_t *result_shifts, + const int32x4_t result_offset, + int8x16_t min_s8, + int8x16_t max_s8, + int32_t a_offset, + int32_t offset, + int32_t min_bound, + int32_t max_bound, + int window_step_x, + int window_start_x, + int window_end_x, + bool has_a_offset, + bool has_bias, + bool is_bounded_relu, + bool is_fixed_point) +{ + int32x4x4_t offset_term_s32 = {0, 0, 0, 0}; + if (!is_fixed_point) + { + // Combine quantization offset with other offsets. + offset_term_s32 = add_s32(offset_term_s32, result_offset); + } + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + int32x4x4_t in_s32 = load_results_input(mm_result_it, x); + + if (has_a_offset) + { + in_s32 = add_s32(in_s32, get_a_offset(vector_sum_col_ptr, a_offset, x)); + } + if (has_bias) + { + in_s32 = add_s32(in_s32, load(bias_ptr, x)); + } + if (!is_fixed_point) + { + in_s32 = add_s32(in_s32, offset_term_s32); + in_s32 = mul_s32(in_s32, result_multipliers + x); + } + + if (is_fixed_point) + { + vst1q_s8(reinterpret_cast<int8_t *>(out_it.ptr() + x), + finalize_quantization_symm(in_s32, load(result_multipliers, x), load(result_shifts, x), + result_offset, min_s8, max_s8, is_bounded_relu)); + } + else + { + vst1q_s8( + reinterpret_cast<int8_t *>(out_it.ptr() + x), + finalize_quantization_floating_point(in_s32, load(result_shifts, x), min_s8, max_s8, is_bounded_relu)); + } + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + int32_t in_value = + *(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x) + wrapper::vgetlane(offset_term_s32.val[0], 0); + + if (has_a_offset) + { + in_value += (*(vector_sum_col_ptr + x) * a_offset); + } + if (has_bias) + { + in_value += *(bias_ptr + x); + } + + if (is_fixed_point) + { + // Finalize and store the result + *(out_it.ptr() + x) = + finalize_quantization(in_value, result_multipliers[x], result_shifts[x], offset, + static_cast<int8_t>(min_bound), static_cast<int8_t>(max_bound), is_bounded_relu); + } + else + { + // Finalize quantization + in_value = (in_value * result_multipliers[x]) >> (-result_shifts[x]); + + // Bound and store the result + if (is_bounded_relu) + { + in_value = static_cast<int8_t>(std::max<int32_t>(min_bound, std::min<int32_t>(max_bound, in_value))); + } + *(out_it.ptr() + x) = static_cast<int8_t>(std::max<int32_t>(-128, std::min<int32_t>(127, in_value))); + } + } +} + +template <typename T> +void run_offset_contribution_output_stage(const Window &window, + const ITensor *mm_result, + const ITensor *vector_sum_col, + const ITensor *vector_sum_row, + const ITensor *bias, + ITensor *output, + int32_t a_offset, + int32_t b_offset, + int32_t k_offset, + bool is_vector_sum_col_batched, + GEMMLowpOutputStageInfo 
output_stage, + bool is_gemm3d, + bool is_bounded_relu, + bool is_fixed_point) +{ + // Semantics of XYZW Explained for each tensor + // + // | Tensor | XYZW when is_gemm3d == false | XYZW when is_gemm3d == true | + // ------------------------------------------------------------------------------------------------------------------- + // | mm_result | x -> width, y -> height, z -> batch | x -> width, y -> height, z -> depth, w -> batch | + // | collapsed window | x -> width, y -> height, z -> batch | x -> width, y -> height, z -> depth * batch | + // | vector_sum_row | x -> height, y -> batch | x -> height * depth, y -> batch | + // | Vector_sum_col | x -> width, y -> batch | x -> width, y -> batch | + + using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>; + using Typer = VectorTyper<T>; + + const int height_input = is_gemm3d ? mm_result->info()->dimension(1) : 0; + const int depth_input = is_gemm3d ? mm_result->info()->dimension(2) : 1; + + const int32_t multiplier = output_stage.gemmlowp_multiplier; + const int32_t shift = output_stage.gemmlowp_shift; + const int32_t offset = output_stage.gemmlowp_offset; + const int32_t min_bound = output_stage.gemmlowp_min_bound; + const int32_t max_bound = output_stage.gemmlowp_max_bound; + + const int32x4_t result_offset_s32 = vdupq_n_s32(offset); + const int32x4_t result_shift_s32 = vdupq_n_s32(is_fixed_point ? shift : -shift); + const auto min_vec = wrapper::vdup_n(static_cast<T>(min_bound), ExactTagType{}); + const auto max_vec = wrapper::vdup_n(static_cast<T>(max_bound), ExactTagType{}); + + const int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + Window win(window); + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Window collapsed_window = win.collapse_if_possible(win, Window::DimZ); + + Iterator mm_result_it(mm_result, win); + Iterator out_it(output, win); + + if ((a_offset != 0) && (b_offset != 0)) + { + ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_col); + ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_row); + + Iterator vector_sum_col_it = get_vector_sum_col_it(collapsed_window, vector_sum_col); + Iterator vector_sum_row_it = get_vector_sum_row_it(collapsed_window, vector_sum_row); + + const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y(); + + // Offset in case vector_sum_col is batched in y dimension + const int vector_sum_col_stride_batch = + is_vector_sum_col_batched ? 
vector_sum_col->info()->strides_in_bytes().y() : 0; + + if (bias != nullptr) + { + Iterator bias_it = get_bias_it(collapsed_window, bias); + execute_window_loop( + collapsed_window, + [&](const Coordinates &id) + { + const int batch_id = id.z() / depth_input; + const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>( + vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch); + const auto vector_sum_row_ptr = + reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + + id.y() + (id.z() % depth_input) * height_input; + run_offset_contribution_output_stage_window<Typer>( + vector_sum_col_ptr, vector_sum_row_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), + mm_result_it, out_it, result_offset_s32, result_shift_s32, min_vec, max_vec, a_offset, b_offset, + k_offset, multiplier, shift, offset, min_bound, max_bound, window_step_x, window_start_x, + window_end_x, true, true, true, is_bounded_relu, is_fixed_point); + }, + vector_sum_col_it, vector_sum_row_it, bias_it, mm_result_it, out_it); + } + else + { + execute_window_loop( + collapsed_window, + [&](const Coordinates &id) + { + const int batch_id = id.z() / depth_input; + const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>( + vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch); + const auto vector_sum_row_ptr = + reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + + id.y() + (id.z() % depth_input) * height_input; + run_offset_contribution_output_stage_window<Typer>( + vector_sum_col_ptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it, result_offset_s32, + result_shift_s32, min_vec, max_vec, a_offset, b_offset, k_offset, multiplier, shift, offset, + min_bound, max_bound, window_step_x, window_start_x, window_end_x, true, true, false, + is_bounded_relu, is_fixed_point); + }, + vector_sum_col_it, vector_sum_row_it, mm_result_it, out_it); + } + } + else if ((a_offset == 0) && (b_offset != 0)) + { + ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_row); + + Iterator vector_sum_row_it = get_vector_sum_row_it(collapsed_window, vector_sum_row); + + const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y(); + + if (bias != nullptr) + { + Iterator bias_it = get_bias_it(collapsed_window, bias); + execute_window_loop( + collapsed_window, + [&](const Coordinates &id) + { + const int batch_id = id.z() / depth_input; + const auto vector_sum_row_ptr = + reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + + id.y() + (id.z() % depth_input) * height_input; + run_offset_contribution_output_stage_window<Typer>( + nullptr, vector_sum_row_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, + out_it, result_offset_s32, result_shift_s32, min_vec, max_vec, a_offset, b_offset, k_offset, + multiplier, shift, offset, min_bound, max_bound, window_step_x, window_start_x, window_end_x, + false, true, true, is_bounded_relu, is_fixed_point); + }, + vector_sum_row_it, bias_it, mm_result_it, out_it); + } + else + { + execute_window_loop( + collapsed_window, + [&](const Coordinates &id) + { + const int batch_id = id.z() / depth_input; + const auto vector_sum_row_ptr = + reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + + id.y() + (id.z() % depth_input) * height_input; + run_offset_contribution_output_stage_window<Typer>( + nullptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it, result_offset_s32, result_shift_s32, + min_vec, max_vec, 
a_offset, b_offset, k_offset, multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x, false, true, false, is_bounded_relu, + is_fixed_point); + }, + vector_sum_row_it, mm_result_it, out_it); + } + } + else if ((a_offset != 0) && (b_offset == 0)) + { + ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_col); + + Iterator vector_sum_col_it = get_vector_sum_col_it(collapsed_window, vector_sum_col); + + // Offset in case vector_sum_col is batched in y dimension + const int vector_sum_col_stride_batch = + is_vector_sum_col_batched ? vector_sum_col->info()->strides_in_bytes().y() : 0; + + if (bias != nullptr) + { + Iterator bias_it = get_bias_it(collapsed_window, bias); + execute_window_loop( + collapsed_window, + [&](const Coordinates &id) + { + const int batch_id = id.z() / depth_input; + const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>( + vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch); + run_offset_contribution_output_stage_window<Typer>( + vector_sum_col_ptr, nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, + out_it, result_offset_s32, result_shift_s32, min_vec, max_vec, a_offset, b_offset, k_offset, + multiplier, shift, offset, min_bound, max_bound, window_step_x, window_start_x, window_end_x, + true, false, true, is_bounded_relu, is_fixed_point); + }, + vector_sum_col_it, bias_it, mm_result_it, out_it); + } + else + { + execute_window_loop( + collapsed_window, + [&](const Coordinates &id) + { + const int batch_id = id.z() / depth_input; + const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>( + vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch); + run_offset_contribution_output_stage_window<Typer>( + vector_sum_col_ptr, nullptr, nullptr, mm_result_it, out_it, result_offset_s32, result_shift_s32, + min_vec, max_vec, a_offset, b_offset, k_offset, multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x, true, false, false, is_bounded_relu, + is_fixed_point); + }, + vector_sum_col_it, mm_result_it, out_it); + } + } + else + { + if (bias != nullptr) + { + Iterator bias_it = get_bias_it(collapsed_window, bias); + execute_window_loop( + collapsed_window, + [&](const Coordinates &) + { + run_offset_contribution_output_stage_window<Typer>( + nullptr, nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it, + result_offset_s32, result_shift_s32, min_vec, max_vec, a_offset, b_offset, k_offset, multiplier, + shift, offset, min_bound, max_bound, window_step_x, window_start_x, window_end_x, false, false, + true, is_bounded_relu, is_fixed_point); + }, + bias_it, mm_result_it, out_it); + } + else + { + execute_window_loop( + collapsed_window, + [&](const Coordinates &) + { + run_offset_contribution_output_stage_window<Typer>( + nullptr, nullptr, nullptr, mm_result_it, out_it, result_offset_s32, result_shift_s32, min_vec, + max_vec, a_offset, b_offset, k_offset, multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x, false, false, false, is_bounded_relu, + is_fixed_point); + }, + mm_result_it, out_it); + } + return; + } +} + +void run_offset_contribution_output_stage_symm(const Window &window, + const ITensor *mm_result, + const ITensor *vector_sum_col, + const ITensor *vector_sum_row, + const ITensor *bias, + ITensor *output, + int32_t a_offset, + int32_t b_offset, + int32_t k_offset, + bool is_vector_sum_col_batched, + GEMMLowpOutputStageInfo output_stage, + bool is_gemm3d, + bool 
is_bounded_relu, + bool is_fixed_point) +{ + ARM_COMPUTE_UNUSED(vector_sum_row, b_offset, k_offset); + + const int depth_input = is_gemm3d ? mm_result->info()->dimension(2) : 1; + + const int32_t offset = output_stage.gemmlowp_offset; + const int32_t min_bound = output_stage.gemmlowp_min_bound; + const int32_t max_bound = output_stage.gemmlowp_max_bound; + + const int32_t *result_multipliers = output_stage.gemmlowp_multipliers.data(); + const int32_t *result_shifts = output_stage.gemmlowp_shifts.data(); + const int32x4_t result_offset_s32 = vdupq_n_s32(offset); + const int8x16_t min_s8 = vdupq_n_s8(static_cast<int8_t>(min_bound)); + const int8x16_t max_s8 = vdupq_n_s8(static_cast<int8_t>(max_bound)); + + const int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + Window win(window); + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Window collapsed_window = win.collapse_if_possible(win, Window::DimZ); + + Iterator mm_result_it(mm_result, win); + Iterator out_it(output, win); + + if (a_offset != 0) + { + ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_col); + + Iterator vector_sum_col_it = get_vector_sum_col_it(collapsed_window, vector_sum_col); + + // Offset in case vector_sum_col is batched in y dimension + const int vector_sum_col_stride_batch = + is_vector_sum_col_batched ? vector_sum_col->info()->strides_in_bytes().y() : 0; + + if (bias != nullptr) + { + Iterator bias_it = get_bias_it(collapsed_window, bias); + execute_window_loop( + collapsed_window, + [&](const Coordinates &id) + { + const int batch_id = id.z() / depth_input; + const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>( + vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch); + run_offset_contribution_output_stage_window_symm( + vector_sum_col_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it, + result_multipliers, result_shifts, result_offset_s32, min_s8, max_s8, a_offset, offset, + min_bound, max_bound, window_step_x, window_start_x, window_end_x, true, true, is_bounded_relu, + is_fixed_point); + }, + vector_sum_col_it, bias_it, mm_result_it, out_it); + } + else + { + execute_window_loop( + collapsed_window, + [&](const Coordinates &id) + { + const int batch_id = id.z() / depth_input; + const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>( + vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch); + run_offset_contribution_output_stage_window_symm( + vector_sum_col_ptr, nullptr, mm_result_it, out_it, result_multipliers, result_shifts, + result_offset_s32, min_s8, max_s8, a_offset, offset, min_bound, max_bound, window_step_x, + window_start_x, window_end_x, true, false, is_bounded_relu, is_fixed_point); + }, + vector_sum_col_it, mm_result_it, out_it); + } + } + else + { + if (bias != nullptr) + { + Iterator bias_it = get_bias_it(collapsed_window, bias); + execute_window_loop( + collapsed_window, + [&](const Coordinates &) + { + run_offset_contribution_output_stage_window_symm( + nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it, + result_multipliers, result_shifts, result_offset_s32, min_s8, max_s8, a_offset, offset, + min_bound, max_bound, window_step_x, window_start_x, window_end_x, false, true, is_bounded_relu, + is_fixed_point); + }, + bias_it, mm_result_it, out_it); + } + else + { + execute_window_loop( + collapsed_window, + [&](const Coordinates &) + { + run_offset_contribution_output_stage_window_symm( + 
nullptr, nullptr, mm_result_it, out_it, result_multipliers, result_shifts, result_offset_s32, + min_s8, max_s8, a_offset, offset, min_bound, max_bound, window_step_x, window_start_x, + window_end_x, false, false, is_bounded_relu, is_fixed_point); + }, + mm_result_it, out_it); + } + return; + } +} + +Status validate_arguments(const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + const ITensorInfo *output, + int32_t a_offset, + int32_t b_offset, + GEMMLowpOutputStageInfo output_stage) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32); + if (output->data_type() != DataType::QASYMM8) + { + ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) > 1 && output_stage.gemmlowp_multipliers.size() > 1 && + b_offset != 0); + } + ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound); + ARM_COMPUTE_RETURN_ERROR_ON(output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN && + output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); + + if (bias != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != bias->dimension(0)); + } + + // If a_offset == 0, vector_sum_col can be a nullptr + if (a_offset != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0)); + ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->num_dimensions() > 2); + } + + // If b_offset == 0, vector_sum_row can be a nullptr + if (b_offset != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); + + // Check if input is a 3D reinterpretation + const bool reinterpret_as_3d = + mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); + + // Validate input + ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != + (mm_result->dimension(1) * mm_result->dimension(2))); + ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1)); + + TensorShape output_shape = output->tensor_shape(); + if (output_shape.num_dimensions() > 1) + { + const unsigned int output_batch_idx = reinterpret_as_3d ? 
3 : 2; + + TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape(); + vector_sum_row_shape.collapse_from(1); + output_shape.collapse_from(output_batch_idx); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx], + "mm_result tensor must have the same number of batches of output tensor"); + + if (a_offset != 0) + { + TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape(); + vector_sum_col_shape.collapse_from(1); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && + vector_sum_col_shape[1] != vector_sum_row_shape[1], + "vector_sum_col tensor must have the same number of batches of " + "vector_sum_row_shape or the number of batches must be set to 1"); + } + } + + // Check Tensor Rank of vector_sum_row + ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_row->num_dimensions() > 3); + } + + if (output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mm_result, output); + } + + return Status{}; +} +} // namespace + +void CpuGemmLowpOffsetContributionOutputStageKernel::configure(const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + ITensorInfo *dst, + int32_t k, + int32_t a_offset, + int32_t b_offset, + GEMMLowpOutputStageInfo output_stage) +{ + ARM_COMPUTE_UNUSED(vector_sum_row, bias); + // Perform validate step + ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, dst); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, dst, a_offset, b_offset, output_stage)); + + _a_offset = a_offset; + _b_offset = b_offset; + _k = k; + _output_stage = output_stage; + + // If a_offset == 0, vector_sum_col can be a nullptr + if (a_offset != 0) + { + // Check if vector_sum_col_shape should be slidden or not + // Don't slide vector_sum_col_shape along the y dimension if vector_sum_col_shape has just 1 dimension and vector_sum_row_shape more than 1 + // This scenario can happen when the the matrix multiplication is used to perform a convolution operation + _is_vector_sum_col_batched = vector_sum_col->tensor_shape().num_dimensions() > 1; + } + + // Output auto inizialitation if not yet initialized + auto_init_if_empty(*dst, mm_result->clone()->set_data_type(DataType::QASYMM8)); + + // Configure kernel window + Window win = calculate_max_window(*mm_result, Steps()); + + // Note: This kernel performs 16 elements per iteration. 
+ // However, since we use a left-over for loop, we cannot have any read or write out of memory + // For this reason num_elems_processed_per_iteration is 1 and so update_window_and_padding() can be skipped + ICpuKernel::configure(win); +} + +Status CpuGemmLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + const ITensorInfo *output, + int32_t a_offset, + int32_t b_offset, + GEMMLowpOutputStageInfo output_stage) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, output); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, output, a_offset, b_offset, output_stage)); + return Status{}; +} + +void CpuGemmLowpOffsetContributionOutputStageKernel::set_a_offset(int32_t a_offset) +{ + _a_offset = a_offset; +} + +void CpuGemmLowpOffsetContributionOutputStageKernel::set_b_offset(int32_t b_offset) +{ + _b_offset = b_offset; +} + +void CpuGemmLowpOffsetContributionOutputStageKernel::run_op(ITensorPack &tensors, + const Window &window, + const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + + auto mm_result = tensors.get_const_tensor(TensorType::ACL_SRC_0); + auto vector_sum_col = tensors.get_const_tensor(TensorType::ACL_SRC_1); + auto vector_sum_row = tensors.get_const_tensor(TensorType::ACL_SRC_2); + auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_3); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + PixelValue type_min{}; + PixelValue type_max{}; + std::tie(type_min, type_max) = get_min_max(dst->info()->data_type()); + int32_t type_min_int = type_min.get<int32_t>(); + int32_t type_max_int = type_max.get<int32_t>(); + + const bool reinterpret_as_3d = vector_sum_row != nullptr && mm_result->info()->num_dimensions() > 1 && + mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x(); + + const bool is_bounded_relu = + !(_output_stage.gemmlowp_min_bound <= type_min_int && _output_stage.gemmlowp_max_bound >= type_max_int); + + // Check if we need to perform fixed point requantization + const bool is_fixed_point = _output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN; + + // Check if symmetric per-channel execution + const bool is_signed = dst->info()->data_type() == DataType::QASYMM8_SIGNED; + + // Check if symmetric per-channel execution + const bool is_symm = _output_stage.is_quantized_per_channel; + + auto k_offset = _a_offset * _b_offset * _k; + if (is_symm) + { + run_offset_contribution_output_stage_symm(window, mm_result, vector_sum_col, vector_sum_row, bias, dst, + _a_offset, _b_offset, k_offset, _is_vector_sum_col_batched, + _output_stage, reinterpret_as_3d, is_bounded_relu, is_fixed_point); + } + else + { + if (is_signed) + { + run_offset_contribution_output_stage<int8_t>( + window, mm_result, vector_sum_col, vector_sum_row, bias, dst, _a_offset, _b_offset, k_offset, + _is_vector_sum_col_batched, _output_stage, reinterpret_as_3d, is_bounded_relu, is_fixed_point); + } + else + { + run_offset_contribution_output_stage<uint8_t>( + window, mm_result, vector_sum_col, vector_sum_row, bias, dst, _a_offset, _b_offset, k_offset, + _is_vector_sum_col_batched, _output_stage, reinterpret_as_3d, is_bounded_relu, is_fixed_point); + } + } +} + +const char *CpuGemmLowpOffsetContributionOutputStageKernel::name() const +{ + return 
"CpuGemmLowpOffsetContributionOutputStageKernel"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h b/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h new file mode 100644 index 0000000000..ff706ff3dc --- /dev/null +++ b/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2019-2022, 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_CPU_KERNELS_CPUGEMMLOWPOFFSETCONTRIBUTIONOUTPUTSTAGEKERNEL_H +#define ACL_SRC_CPU_KERNELS_CPUGEMMLOWPOFFSETCONTRIBUTIONOUTPUTSTAGEKERNEL_H + +#include "arm_compute/core/KernelDescriptors.h" + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Kernel used to add the offset contribution and perform the output stage after @ref CpuGemmLowpMatrixMultiplyKernel. + * + * The computation is performed in-place + * + * This kernel takes a final int32 accumulator value (the output of @ref CpuGemmLowpMatrixMultiplyKernel), + * and adds to it the offset contribution of matrix A and matrix B in-place. + * + * The output stage can perform either QuantizeDownInt32ToUint8Scale or QuantizeDownInt32ToUint8ScaleByFixedPoint for Uint8. + * The output stage can perform either QuantizeDownInt32ToInt8Scale or QuantizeDownInt32ToInt8ScaleByFixedPoint for Int8. 
+ * + * For QuantizeDownInt32ToUint8Scale/QuantizeDownInt32ToInt8Scale the final result is: + * + * ((mm_result'[i][k] + result_offset) * result_mult_int) >> result_shift + * + * For QuantizeDownInt32ToUint8ScaleByFixedPoint/QuantizeDownInt32ToInt8ScaleByFixedPoint the final result is: + * + * (FixedPointMul(mm_result'[i][k], result_fixedpoint_multiplier) >> result_shift) + result_offset_after_shift + * + * where FixedPointMul(x, y) is the nearest integer to the following + * mathematical expression, evaluated without overflow or intermediate rounding: + * + * (x * y) / 2^31 + * + * and mm_result'[i][k] = mm_result[i][k] + + * (vector_sum_col[k] * a_offset) + + * (vector_sum_row[i] * b_offset) + + * (a_offset * b_offset * k) + */ + +class CpuGemmLowpOffsetContributionOutputStageKernel : public ICpuKernel<CpuGemmLowpOffsetContributionOutputStageKernel> +{ +public: + /** Default constructor */ + CpuGemmLowpOffsetContributionOutputStageKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmLowpOffsetContributionOutputStageKernel); + /** Initialise the kernel inputs and output. + * + * @param[in] mm_result Input tensor info containing the result of @ref CpuGemmLowpMatrixMultiplyKernel. Data type supported: S32 + * @param[in] vector_sum_col Input row-vector tensor info of sums of all the entries in each column of matrix B. + * Can be a 1D or 2D tensor, in case of 2D, y dim is the batch dimension + * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result + * @param[in] vector_sum_row Input row-vector tensor info of sums of all the entries in each row of matrix A. + * Can be a 1D or 2D tensor, in case of 2D, y dim is the batch dimension + * @param[in] bias Biases tensor info. Only shared biases supported and it can be a nullptr if the addition of biases is not required. + * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p mm_result. + * @param[out] dst Output tensor info containing the final quantized result. Data type supported: QASYMM8/QASYMM8_SIGNED + * @param[in] k Number of matrix A columns or Matrix B rows + * @param[in] a_offset Offset to be added to each element of the matrix A. + * @param[in] b_offset Offset to be added to each element of the matrix B. + * @param[in] output_stage GEMMLowp output stage info, providing the type of quantization and the necessary parameters. + */ + void configure(const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + ITensorInfo *dst, + int32_t k, + int32_t a_offset, + int32_t b_offset, + GEMMLowpOutputStageInfo output_stage); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuGemmLowpOffsetContributionOutputStageKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + const ITensorInfo *dst, + int32_t a_offset, + int32_t b_offset, + GEMMLowpOutputStageInfo output_stage); + + /** Set the a offset + * Warning: if a_offset is non-zero then vector_sum_col must be set in run_op. + * Run configure or validate again if you aren't sure + * + * @param[in] a_offset Offset to be added to each element of the matrix A. + */ + void set_a_offset(int32_t a_offset); + + /** Set the b offset + * Warning: if b_offset is non-zero then vector_sum_col must be set in run_op. 
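As a point of reference, a scalar sketch of the two output stages described by the formulas above. It is illustrative only: the function names are assumptions, and the exact rounding and saturation of the real kernel follow the NEON intrinsics in the .cpp file rather than this simplified arithmetic.

#include <cstdint>

// QuantizeDownInt32ToUint8Scale / QuantizeDownInt32ToInt8Scale
int32_t quantize_down_scale_ref(int32_t mm_result_prime, int32_t result_offset, int32_t result_mult_int, int32_t result_shift)
{
    return ((mm_result_prime + result_offset) * result_mult_int) >> result_shift;
}

// QuantizeDownInt32ToUint8ScaleByFixedPoint / QuantizeDownInt32ToInt8ScaleByFixedPoint
int32_t quantize_down_fixedpoint_ref(int32_t mm_result_prime, int32_t result_fixedpoint_multiplier,
                                     int32_t result_shift, int32_t result_offset_after_shift)
{
    // FixedPointMul(x, y): nearest integer to (x * y) / 2^31 (half-up rounding used here for brevity)
    const int64_t product = static_cast<int64_t>(mm_result_prime) * result_fixedpoint_multiplier;
    const int32_t mul_hi  = static_cast<int32_t>((product + (static_cast<int64_t>(1) << 30)) >> 31);
    return (mul_hi >> result_shift) + result_offset_after_shift;
}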
+ * Run configure or validate again if you aren't sure + * + * @param[in] b_offset Offset to be added to each element of the matrix B. + */ + void set_b_offset(int32_t b_offset); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + +private: + /** Function to use for the particular tensors passed to configure() */ + int32_t _a_offset{0}; + int32_t _b_offset{0}; + int32_t _k{0}; // Number of columns of A or rows of B, used in last offset term + bool _is_vector_sum_col_batched{true}; + GEMMLowpOutputStageInfo _output_stage{GEMMLowpOutputStageInfo()}; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif // ACL_SRC_CPU_KERNELS_CPUGEMMLOWPOFFSETCONTRIBUTIONOUTPUTSTAGEKERNEL_H diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.cpp b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.cpp new file mode 100644 index 0000000000..eefc294700 --- /dev/null +++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.cpp @@ -0,0 +1,337 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include "src/core/AccessWindowStatic.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/wrapper.h" + +#include <arm_neon.h> + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo *output_stage) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32); + + ARM_COMPUTE_RETURN_ERROR_ON( + output_stage->gemmlowp_max_bound > + std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))); + ARM_COMPUTE_RETURN_ERROR_ON( + output_stage->gemmlowp_min_bound < + std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)) || + output_stage->gemmlowp_min_bound > output_stage->gemmlowp_max_bound); + + // Check biases if exist + if (bias != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); + ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0)); + } + + if (dst->total_size() != 0) + { + if (dst->data_type() != output_stage->output_data_type && + (output_stage->output_data_type == DataType::QASYMM8 || + output_stage->output_data_type == DataType::QASYMM8_SIGNED)) + { + ARM_COMPUTE_RETURN_ERROR_MSG("Mismatching data types"); + } + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); + } + + return Status{}; +} + +inline void scale_input(int32x4x4_t &in_s32, int32x4_t result_offset_s32, int32_t result_mult_int) +{ + // Add the offset terms to GEMM's result + in_s32.val[0] = vaddq_s32(in_s32.val[0], result_offset_s32); + in_s32.val[1] = vaddq_s32(in_s32.val[1], result_offset_s32); + in_s32.val[2] = vaddq_s32(in_s32.val[2], result_offset_s32); + in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_s32); + + // Multiply by result_mult_int + in_s32.val[0] = vmulq_n_s32(in_s32.val[0], result_mult_int); + in_s32.val[1] = vmulq_n_s32(in_s32.val[1], result_mult_int); + in_s32.val[2] = vmulq_n_s32(in_s32.val[2], result_mult_int); + in_s32.val[3] = vmulq_n_s32(in_s32.val[3], result_mult_int); +} + +template <typename T> +inline + typename std::enable_if<std::is_same<T, uint8_t>::value, typename wrapper::traits::neon_vector<T, 16>::type>::type + convert_to_8bit(const int16x8x2_t in_s16) +{ + return wrapper::vcombine(wrapper::vqmovun(in_s16.val[0]), wrapper::vqmovun(in_s16.val[1])); +} + +template <typename T> +inline typename std::enable_if<std::is_same<T, int8_t>::value, typename wrapper::traits::neon_vector<T, 16>::type>::type +convert_to_8bit(const int16x8x2_t in_s16) +{ + return wrapper::vcombine(wrapper::vqmovn(in_s16.val[0]), wrapper::vqmovn(in_s16.val[1])); +} + +template <typename T> +inline typename wrapper::traits::neon_vector<T, 16>::type +finalize_quantization(int32x4x4_t &in_s32, + int32x4_t result_shift_s32, + typename wrapper::traits::neon_vector<T, 16>::type min, + typename 
wrapper::traits::neon_vector<T, 16>::type max) +{ + // Shift final result (negative value shift right) + in_s32.val[0] = vshlq_s32(in_s32.val[0], result_shift_s32); + in_s32.val[1] = vshlq_s32(in_s32.val[1], result_shift_s32); + in_s32.val[2] = vshlq_s32(in_s32.val[2], result_shift_s32); + in_s32.val[3] = vshlq_s32(in_s32.val[3], result_shift_s32); + + // Convert S32 to S16 + const int16x8x2_t in_s16 = {{vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), + vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))}}; + + // Convert S16 to S8 or U8 + typename wrapper::traits::neon_vector<T, 16>::type out = convert_to_8bit<T>(in_s16); + + out = wrapper::vmax(out, min); + out = wrapper::vmin(out, max); + + return out; +} +} // namespace + +template <typename T> +void CpuGemmLowpQuantizeDownInt32ScaleKernel::run_internal(const ITensor *src, + const ITensor *bias, + ITensor *dst, + const Window &window) +{ + using VectorType = typename wrapper::traits::neon_vector<T, 16>::type; + + const int32x4_t result_offset_s32 = vdupq_n_s32(_output_stage->gemmlowp_offset); + const int32x4_t result_shift_s32 = vdupq_n_s32(-_output_stage->gemmlowp_shift); + const int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + const int clamp_min = (_is_bounded_relu) ? _output_stage->gemmlowp_min_bound : std::numeric_limits<T>::lowest(); + const int clamp_max = (_is_bounded_relu) ? _output_stage->gemmlowp_max_bound : std::numeric_limits<T>::max(); + + VectorType min = wrapper::vdup_n(static_cast<T>(clamp_min), wrapper::traits::vector_128_tag{}); + VectorType max = wrapper::vdup_n(static_cast<T>(clamp_max), wrapper::traits::vector_128_tag{}); + + Window win(window); + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator in(src, win); + Iterator out(dst, win); + + if (bias != nullptr) + { + Window win_biases; + win_biases.set(Window::DimX, Window::Dimension(0, 1, 1)); + win_biases.set(Window::DimY, Window::Dimension(0, 1, 1)); + + Iterator bias_i(bias, win_biases); + execute_window_loop( + win, + [&](const Coordinates &) + { + // Compute 16 elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + int32x4x4_t in_s32 = {{vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0), + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4), + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8), + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)}}; + + const int32x4x4_t bias_s32 = { + {vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 0), + vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 4), + vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 8), + vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 12)}}; + + // Add the bias to GEMM's result + in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]); + in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]); + in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]); + in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]); + + // Add the offset terms to GEMM's result and multiply by result_mult_int + scale_input(in_s32, result_offset_s32, _output_stage->gemmlowp_multiplier); + + wrapper::vstore(reinterpret_cast<T *>(out.ptr() + x), + finalize_quantization<T>(in_s32, result_shift_s32, min, max)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + 
const int bias_value = *(reinterpret_cast<const int *>(bias_i.ptr()) + x); + int in_value = *(reinterpret_cast<const int *>(in.ptr()) + x); + + // Quantize + in_value = ((in_value + bias_value + _output_stage->gemmlowp_offset) * + _output_stage->gemmlowp_multiplier) >> + _output_stage->gemmlowp_shift; + + // Store the result + *(out.ptr() + x) = static_cast<T>(utility::clamp<int>(in_value, clamp_min, clamp_max)); + } + }, + in, bias_i, out); + } + else + { + execute_window_loop( + win, + [&](const Coordinates &) + { + // Compute 16 elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + int32x4x4_t in_s32 = {{vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0), + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4), + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8), + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)}}; + + // Add the offset terms to GEMM's result and multiply by result_mult_int + scale_input(in_s32, result_offset_s32, _output_stage->gemmlowp_multiplier); + + wrapper::vstore(reinterpret_cast<T *>(out.ptr() + x), + finalize_quantization<T>(in_s32, result_shift_s32, min, max)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + int in_value = *(reinterpret_cast<const int *>(in.ptr()) + x); + + // Quantize + in_value = ((in_value + _output_stage->gemmlowp_offset) * _output_stage->gemmlowp_multiplier) >> + _output_stage->gemmlowp_shift; + + // Store the result + *(out.ptr() + x) = static_cast<T>(utility::clamp<int>(in_value, clamp_min, clamp_max)); + } + }, + in, out); + } +} + +void CpuGemmLowpQuantizeDownInt32ScaleKernel::configure(ITensorInfo *src, + ITensorInfo *bias, + ITensorInfo *dst, + const GEMMLowpOutputStageInfo *output_stage) +{ + ARM_COMPUTE_UNUSED(bias); + // Perform validate step + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst, output_stage); + + // Output auto inizialitation if not yet initialized + auto_init_if_empty(*dst, src->clone()->set_data_type(output_stage->output_data_type)); + + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, output_stage)); + + _output_stage = output_stage; + + // Configure kernel window + Window win = calculate_max_window(*src, Steps()); + + ICpuKernel::configure(win); + + // Check if we need to clamp the result using min and max + _is_bounded_relu = + ((_output_stage->gemmlowp_min_bound != _output_stage->gemmlowp_max_bound) && + !(_output_stage->gemmlowp_min_bound == + std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)) && + _output_stage->gemmlowp_max_bound == + std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)))); + if (_output_stage->output_data_type == DataType::QASYMM8) + { + _func = &CpuGemmLowpQuantizeDownInt32ScaleKernel::run_internal<uint8_t>; + } + else if (_output_stage->output_data_type == DataType::QASYMM8_SIGNED) + { + _func = &CpuGemmLowpQuantizeDownInt32ScaleKernel::run_internal<int8_t>; + } + else + { + ARM_COMPUTE_ERROR("Data type not supported"); + } +} + +Status CpuGemmLowpQuantizeDownInt32ScaleKernel::validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo *output_stage) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, output_stage)); + return Status{}; +} + +void CpuGemmLowpQuantizeDownInt32ScaleKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + 
ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); + + auto src = tensors.get_const_tensor(TensorType::ACL_SRC); + auto bias = tensors.get_const_tensor(TensorType::ACL_BIAS); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + (this->*_func)(src, bias, dst, window); +} + +const char *CpuGemmLowpQuantizeDownInt32ScaleKernel::name() const +{ + return "CpuGemmLowpQuantizeDownInt32ScaleKernel"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h new file mode 100644 index 0000000000..33e296b251 --- /dev/null +++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2020-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_KERNEL_H +#define ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_KERNEL_H + +#include "arm_compute/core/KernelDescriptors.h" + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +// Forward declarations +class ITensor; +namespace cpu +{ +namespace kernels +{ +/** Kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED + * + * This kernel takes a final int32 accumulator value (the output of @ref CpuGemmLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value. + * The following computations will be performed by the kernel: + * + * -# Add offset terms to final result + * -# Multiply each entry of result by result_mult_int + * -# Add bias to final result if bias tensor is not a nullptr + * -# Shift the int32 accumulator by result_shift + * -# Clamp the value between the specified min and max bounds + * -# Clamp the resulting int32 values: + * -# -to the [0..255] range and cast to QASYMM8. + * -# -to the [-128..127] range and cast to QASYMM8_SIGNED. + * + */ +class CpuGemmLowpQuantizeDownInt32ScaleKernel : public ICpuKernel<CpuGemmLowpQuantizeDownInt32ScaleKernel> +{ +public: + CpuGemmLowpQuantizeDownInt32ScaleKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmLowpQuantizeDownInt32ScaleKernel); + /** Initialise the kernel's input and output. 
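 *
 * A minimal configuration sketch (illustrative only, not part of this patch; the tensor
 * infos and the output stage values below are hypothetical):
 * @code
 * GEMMLowpOutputStageInfo stage{};
 * stage.type                = GEMMLowpOutputStageType::QUANTIZE_DOWN;
 * stage.gemmlowp_offset     = 2;   // added to every accumulator before scaling
 * stage.gemmlowp_multiplier = 3;   // integer scale factor
 * stage.gemmlowp_shift      = 4;   // final arithmetic right shift
 * stage.output_data_type    = DataType::QASYMM8;
 *
 * CpuGemmLowpQuantizeDownInt32ScaleKernel kernel;
 * kernel.configure(&src_s32_info, nullptr, &dst_q8_info, &stage);
 * @endcode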
+ *
+ * @param[in]  src          Input tensor info. Data type supported: S32
+ * @param[in]  bias         Biases tensor info. Only shared biases supported and it can be a nullptr if the biases addition is not required.
+ *                          Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p src.
+ * @param[out] dst          Output tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED
+ * @param[in]  output_stage GEMMLowp output stage metadata.
+ */
+ void configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuGemmLowpQuantizeDownInt32ScaleKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo *output_stage);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ const char *name() const override;
+
+private:
+ /** Template function to run the CpuGemmLowpQuantizeDownInt32ScaleKernel
+ *
+ * @param[in] src Input tensor info
+ * @param[in] bias Biases tensor info
+ * @param[out] dst Output tensor info
+ * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window())
+ */
+ template <typename T>
+ void run_internal(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window);
+
+ /** Common signature for all the specialised CpuGemmLowpQuantizeDownInt32ScaleKernel functions
+ *
+ * @param[in] src Input tensor info
+ * @param[in] bias Biases tensor info
+ * @param[out] dst Output tensor info
+ * @param[in] window Region on which to execute the kernel.
+ */
+ using QuantizeDownFunctionPtr = void (CpuGemmLowpQuantizeDownInt32ScaleKernel::*)(const ITensor *src,
+ const ITensor *bias,
+ ITensor *dst,
+ const Window &window);
+
+ QuantizeDownFunctionPtr _func{nullptr};
+ const GEMMLowpOutputStageInfo *_output_stage{nullptr};
+ bool _is_bounded_relu{false};
+};
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_KERNEL_H */
diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp
new file mode 100644
index 0000000000..a5c09c9977
--- /dev/null
+++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp
@@ -0,0 +1,234 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NESymm.h" + +#include <arm_neon.h> + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min, int max) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON(min > max); + + // Check biases if exist + if (bias != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); + ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0)); + } + + if (dst->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QSYMM16); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, src); + } + + return Status{}; +} +} // namespace + +template <bool is_bounded_relu> +void CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_internal(const ITensor *src, + const ITensor *bias, + ITensor *dst, + const Window &window) +{ + const int16x8_t min_s16 = vdupq_n_s16(static_cast<int16_t>(_min)); + const int16x8_t max_s16 = vdupq_n_s16(static_cast<int16_t>(_max)); + + ARM_COMPUTE_UNUSED(min_s16); + ARM_COMPUTE_UNUSED(max_s16); + + const int window_step_x = 8; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator in(src, win_collapsed); + Iterator out(dst, win_collapsed); + if (bias != nullptr) + { + Window win_biases; + win_biases.set(Window::DimX, Window::Dimension(0, 1, 1)); + win_biases.set(Window::DimY, Window::Dimension(0, 1, 1)); + + Iterator bias_i(bias, win_biases); + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + // Compute 16 elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + int32x4x2_t in_s32 = {{vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0), + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4)}}; + + const int32x4x2_t bias_s32 = {{vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 0), + vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 4)}}; + + // Add the bias to GEMM's result + in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]); + in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]); + + vst1q_s16(reinterpret_cast<int16_t *>(out.ptr()) + x, + finalize_quantization_int16<is_bounded_relu>(in_s32, 
_result_fixedpoint_multiplier, + _result_shift, min_s16, max_s16)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const int32_t bias_value = *(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x); + int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x); + + // Add bias + in_value += bias_value; + // Finalize and store the result + *(reinterpret_cast<int16_t *>(out.ptr()) + x) = finalize_quantization_int16<is_bounded_relu>( + in_value, _result_fixedpoint_multiplier, _result_shift, static_cast<int16_t>(_min), + static_cast<int16_t>(_max)); + } + }, + in, out, bias_i); + } + else + { + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + // Compute 16 elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + int32x4x2_t in_s32 = {{vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0), + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4)}}; + + vst1q_s16(reinterpret_cast<int16_t *>(out.ptr()) + x, + finalize_quantization_int16<is_bounded_relu>(in_s32, _result_fixedpoint_multiplier, + _result_shift, min_s16, max_s16)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x); + ARM_COMPUTE_UNUSED(in_value); + // Finalize and store the result + *(reinterpret_cast<int16_t *>(out.ptr()) + x) = finalize_quantization_int16<is_bounded_relu>( + in_value, _result_fixedpoint_multiplier, _result_shift, static_cast<int16_t>(_min), + static_cast<int16_t>(_max)); + } + }, + in, out); + } +} + +void CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::configure(ITensorInfo *src, + ITensorInfo *bias, + ITensorInfo *dst, + int result_fixedpoint_multiplier, + int result_shift, + int min, + int max) +{ + // Perform validate step + ARM_COMPUTE_UNUSED(bias, dst); + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, min, max)); + + _result_fixedpoint_multiplier = result_fixedpoint_multiplier; + _result_shift = result_shift; + _min = min; + _max = max; + + // Output auto inizialitation if not yet initialized + auto_init_if_empty(*src, src->clone()->set_data_type(DataType::QSYMM16)); + // Configure kernel window + Window win_config = calculate_max_window(*src, Steps()); + ICpuKernel::configure(win_config); + + // Check if we need to clamp the result using min and max + const bool is_bounded_relu = !(min <= -32768 && max >= 32767); + _func = is_bounded_relu ? 
&CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_internal<true> + : &CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_internal<false>; +} + +Status CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate( + const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, min, max)); + return Status{}; +} + +void CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_op(ITensorPack &tensors, + const Window &window, + const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); + + auto src = tensors.get_const_tensor(TensorType::ACL_SRC); + auto bias = tensors.get_const_tensor(TensorType::ACL_BIAS); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + (this->*_func)(src, bias, dst, window); +} + +const char *CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::name() const +{ + return "CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h new file mode 100644 index 0000000000..925788b680 --- /dev/null +++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2019-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32TOINT16_SCALEBYFIXEDPOINT_KERNEL_H +#define ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32TOINT16_SCALEBYFIXEDPOINT_KERNEL_H + +#include "arm_compute/core/KernelDescriptors.h" + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +// Forward declaration +class ITensor; +namespace cpu +{ +namespace kernels +{ +/** Kernel used to quantize down the int32 accumulator values of GEMMLowp to QSYMM16 + * + * This kernel takes a final int32 accumulator value (the output of @ref CpuGemmLowpMatrixMultiplyKernel), and processes it to obtain the final QSYMM16 value. 
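 * In scalar terms the per-element computation is roughly the sketch below (illustrative
 * only; the helper names are descriptive placeholders, the kernel itself relies on the
 * vectorized finalize_quantization_int16() helper):
 * @code
 * int32_t acc = accumulator;                               // int32 GEMMLowp output
 * acc += bias_value;                                       // only if a bias tensor is given
 * acc  = rounding_doubling_high_mul(acc, result_fixedpoint_multiplier);
 * acc  = rounding_divide_by_pow2(acc, result_shift);       // round-to-nearest shift
 * int16_t out = saturate_to_int16(acc);                    // narrow to QSYMM16
 * out = clamp(out, min, max);                              // only when bounded ReLU is active
 * @endcode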
+ * The following computations will be performed by the kernel: + * + * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier + * -# Add bias to final result if bias tensor is not a nullptr + * -# Round to nearest division by a power-of-two using result_shift + * -# Clamp the value between the specified min and max bounds + * -# Clamp the resulting int32 values to the [-32768, 32767] range and cast to QSYMM16. + * + */ +class CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel + : public ICpuKernel<CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel> +{ +public: + CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel); + /** Initialise the kernel's input and output. + * + * @param[in] src Input tensor info. Data type supported: S32 + * @param[in] bias Biases tensor info. Only shared biases supported and it can be a nullptr if the biases addition is not required. + * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. + * @param[out] dst Output tensor info. Data type supported: Data type supported: QSYMM16 + * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add + * @param[in] result_shift Integer value used to round to nearest division by a power-of-two the result after the fixed point multiplication + * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QSYMM16. Defaults to 0. + * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QSYMM16. + * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to 0. + */ + void configure(ITensorInfo *src, + ITensorInfo *bias, + ITensorInfo *dst, + int result_fixedpoint_multiplier, + int result_shift, + int min = 0, + int max = 0); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::configure() + * + * @return a status + */ + static Status + validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min = 0, int max = 0); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + +private: + /** Template function to run the CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel + * + * @param[in] src Input tensor info + * @param[in] bias Bias tensor info + * @param[out] dst Output tensor info + * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()). + */ + template <bool is_bounded_relu> + void run_internal(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window); + + /** Common signature for all the specialised CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel functions + * + * @param[in] src Input tensor info + * @param[in] bias Bias tensor info + * @param[out] dst Output tensor info + * @param[in] window Region on which to execute the kernel. 
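 *
 * @note configure() binds this pointer to the run_internal<true> or run_internal<false>
 *       specialisation, depending on whether the given min/max actually restrict the
 *       representable QSYMM16 range, so run_op() does not re-check the bounds at run time.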
+ */ + using QuantizeDownFunctionPtr = void (CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::*)( + const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window); + + QuantizeDownFunctionPtr _func{nullptr}; + int _result_fixedpoint_multiplier{0}; + int _result_shift{0}; + int _min{0}; + int _max{0}; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32TOINT16_SCALEBYFIXEDPOINT_KERNEL_H */ diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp new file mode 100644 index 0000000000..0e58097073 --- /dev/null +++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp @@ -0,0 +1,246 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NEAsymm.h" + +#include <arm_neon.h> + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min, int max) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON(min > max); + + // Check biases if exist + if (bias != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); + ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0)); + } + + if (dst->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, src); + } + + return Status{}; +} +} // namespace + +template <bool is_bounded_relu> +void CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_internal(const ITensor *src, + const ITensor *bias, + ITensor *dst, + const Window &window) +{ + const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(_result_offset_after_shift); + const int8x16_t min_s8 = vdupq_n_s8(static_cast<int8_t>(_min)); + const int8x16_t max_s8 = vdupq_n_s8(static_cast<int8_t>(_max)); + + ARM_COMPUTE_UNUSED(min_s8, max_s8); + + const int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator in(src, win_collapsed); + Iterator out(dst, win_collapsed); + if (bias != nullptr) + { + Window win_biases; + win_biases.set(Window::DimX, Window::Dimension(0, 1, 1)); + win_biases.set(Window::DimY, Window::Dimension(0, 1, 1)); + + Iterator bias_i(bias, win_biases); + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + // Compute 16 elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + int32x4x4_t in_s32 = {{vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0), + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4), + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8), + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)}}; + + const int32x4x4_t bias_s32 = { + {vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 0), + vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 4), + vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 8), + vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 12)}}; + + // Add the bias to GEMM's result + in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]); + in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]); + in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]); 
+ in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]); + + vst1q_s8(reinterpret_cast<int8_t *>(out.ptr() + x), + finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, + result_offset_after_shift_s32, min_s8, max_s8, is_bounded_relu)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const int32_t bias_value = *(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x); + int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x); + + // Add bias + in_value += bias_value; + // Finalize and store the result + *reinterpret_cast<int8_t *>(out.ptr() + x) = finalize_quantization( + in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, + static_cast<int8_t>(_min), static_cast<int8_t>(_max), is_bounded_relu); + } + }, + in, out, bias_i); + } + else + { + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + // Compute 16 elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + int32x4x4_t in_s32 = {{vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0), + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4), + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8), + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)}}; + + vst1q_s8(reinterpret_cast<int8_t *>(out.ptr() + x), + finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, + result_offset_after_shift_s32, min_s8, max_s8, is_bounded_relu)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x); + + // Finalize and store the result + *reinterpret_cast<int8_t *>(out.ptr() + x) = finalize_quantization( + in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, + static_cast<int8_t>(_min), static_cast<int8_t>(_max), is_bounded_relu); + } + }, + in, out); + } +} + +void CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::configure(ITensorInfo *src, + ITensorInfo *bias, + ITensorInfo *dst, + int result_fixedpoint_multiplier, + int result_shift, + int result_offset_after_shift, + int min, + int max) +{ + ARM_COMPUTE_UNUSED(bias); + // Perform validate step + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, min, max)); + + _result_fixedpoint_multiplier = result_fixedpoint_multiplier; + _result_shift = result_shift; + _result_offset_after_shift = result_offset_after_shift; + _min = min; + _max = max; + + // Output auto initialization if not yet initialized + auto_init_if_empty(*dst, src->clone()->set_data_type(DataType::QASYMM8_SIGNED)); + + // Configure kernel window + Window win_config = calculate_max_window(*src, Steps()); + ICpuKernel::configure(win_config); + + // Check if we need to clamp the result using min and max + const bool is_bounded_relu = !(min <= -128 && max >= 127); + _func = is_bounded_relu ? 
&CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_internal<true> + : &CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_internal<false>; +} + +Status CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate( + const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min, int max) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, min, max)); + return Status{}; +} + +void CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_op(ITensorPack &tensors, + const Window &window, + const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); + + auto src = tensors.get_const_tensor(TensorType::ACL_SRC); + auto bias = tensors.get_const_tensor(TensorType::ACL_BIAS); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + (this->*_func)(src, bias, dst, window); +} + +const char *CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::name() const +{ + return "CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h new file mode 100644 index 0000000000..6a67ba4f19 --- /dev/null +++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2019-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32TOINT8_SCALEBYFIXEDPOINT_KERNEL_H +#define ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32TOINT8_SCALEBYFIXEDPOINT_KERNEL_H + +#include "arm_compute/core/KernelDescriptors.h" + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +// Forward declaration +class ITensor; +namespace cpu +{ +namespace kernels +{ +/** Kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8_SIGNED + * + * This kernel takes a final int32 accumulator value (the output of @ref CpuGemmLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8_SIGNED value. 
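 * As an example of how this stage is typically parameterised, a hypothetical
 * configuration sketch is shown below (illustrative only, not part of this patch; the
 * requantization scale 0.0123f and the tensor infos are made-up values, and the
 * multiplier/shift pair is derived with the existing quantization helper):
 * @code
 * int32_t mult  = 0;
 * int32_t shift = 0;
 * quantization::calculate_quantized_multiplier(0.0123f, &mult, &shift);
 *
 * CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel kernel;
 * kernel.configure(&src_s32_info, nullptr, &dst_q8_signed_info,
 *                  mult, shift, 10, -128, 127); // 10 = result_offset_after_shift
 * @endcode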
+ * The following computations will be performed by the kernel: + * + * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier + * -# Add bias to final result if bias tensor is not a nullptr + * -# Round to nearest division by a power-of-two using result_shift + * -# Add offset to each result + * -# Clamp the value between the specified min and max bounds + * -# Clamp the resulting int32 values to the [-128..127] range and cast to QASYMM8_SIGNED. + * + */ +class CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel + : public ICpuKernel<CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel> +{ +public: + CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel); + /** Initialise the kernel's input and output. + * + * @param[in] src Input tensor info. Data type supported: S32 + * @param[in] bias Biases tensor info. Only shared biases supported and it can be a nullptr if the biases addition is not required. + * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. + * @param[out] dst Output tensor info. Data type supported: Data type supported: QASYMM8_SIGNED + * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add + * @param[in] result_shift Integer value used to round to nearest division by a power-of-two the result after the fixed point multiplication + * @param[in] result_offset_after_shift Offset to be applied to result before converting it back to QASYMM8_SIGNED + * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8_SIGNED + * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED, + * Along with @p min, this value can be used to implement "rectified linear unit" activation functions + */ + void configure(ITensorInfo *src, + ITensorInfo *bias, + ITensorInfo *dst, + int result_fixedpoint_multiplier, + int result_shift, + int result_offset_after_shift, + int min = 0, + int max = 0); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::configure() + * + * @return a status + */ + static Status + validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min = 0, int max = 0); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + +private: + /** Template function to run the CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel + * + * @param[in] src Input tensor info + * @param[in] bias Bias tensor info + * @param[out] dst Output tensor info + * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()). + */ + template <bool is_bounded_relu> + void run_internal(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window); + + /** Common signature for all the specialised CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel functions + * + * @param[in] src Input tensor info + * @param[in] bias Bias tensor info + * @param[out] dst Output tensor info + * @param[in] window Region on which to execute the kernel. 
+ */ + using QuantizeDownFunctionPtr = void (CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::*)( + const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window); + + QuantizeDownFunctionPtr _func{nullptr}; + int _result_fixedpoint_multiplier{0}; + int _result_shift{0}; + int _result_offset_after_shift{0}; + int _min{0}; + int _max{0}; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32TOINT8_SCALEBYFIXEDPOINT_KERNEL_H */ diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp new file mode 100644 index 0000000000..e3dd2240ca --- /dev/null +++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp @@ -0,0 +1,247 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NEAsymm.h" + +#include <arm_neon.h> + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min, int max) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON(min > max); + + // Check biases if exist + if (bias != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); + ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0)); + } + + if (dst->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, src); + } + + return Status{}; +} +} // namespace + +template <bool is_bounded_relu> +void CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_internal(const ITensor *src, + const ITensor *bias, + ITensor *dst, + const Window &window) +{ + const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(_result_offset_after_shift); + const uint8x16_t min_u8 = vdupq_n_u8(static_cast<uint8_t>(_min)); + const uint8x16_t max_u8 = vdupq_n_u8(static_cast<uint8_t>(_max)); + + ARM_COMPUTE_UNUSED(min_u8); + ARM_COMPUTE_UNUSED(max_u8); + + const int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator in(src, win_collapsed); + Iterator out(dst, win_collapsed); + if (bias != nullptr) + { + Window win_biases; + win_biases.set(Window::DimX, Window::Dimension(0, 1, 1)); + win_biases.set(Window::DimY, Window::Dimension(0, 1, 1)); + + Iterator bias_i(bias, win_biases); + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + // Compute 16 elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + int32x4x4_t in_s32 = {{vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0), + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4), + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8), + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)}}; + + const int32x4x4_t bias_s32 = { + {vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 0), + vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 4), + vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 8), + vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 12)}}; + + // Add the bias to GEMM's result + in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]); + in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]); + in_s32.val[2] = 
vaddq_s32(in_s32.val[2], bias_s32.val[2]); + in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]); + + vst1q_u8(out.ptr() + x, + finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, + result_offset_after_shift_s32, min_u8, max_u8, is_bounded_relu)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const int32_t bias_value = *(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x); + int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x); + + // Add bias + in_value += bias_value; + // Finalize and store the result + *(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, + _result_offset_after_shift, static_cast<uint8_t>(_min), + static_cast<uint8_t>(_max), is_bounded_relu); + } + }, + in, out, bias_i); + } + else + { + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + // Compute 16 elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + int32x4x4_t in_s32 = {{vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0), + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4), + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8), + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)}}; + + vst1q_u8(out.ptr() + x, + finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, + result_offset_after_shift_s32, min_u8, max_u8, is_bounded_relu)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x); + + // Finalize and store the result + *(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, + _result_offset_after_shift, static_cast<uint8_t>(_min), + static_cast<uint8_t>(_max), is_bounded_relu); + } + }, + in, out); + } +} + +void CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::configure(ITensorInfo *src, + ITensorInfo *bias, + ITensorInfo *dst, + int result_fixedpoint_multiplier, + int result_shift, + int result_offset_after_shift, + int min, + int max) +{ + ARM_COMPUTE_UNUSED(bias); + // Perform validate step + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, min, max)); + + _result_fixedpoint_multiplier = result_fixedpoint_multiplier; + _result_shift = result_shift; + _result_offset_after_shift = result_offset_after_shift; + _min = min; + _max = max; + + // Output auto inizialitation if not yet initialized + auto_init_if_empty(*dst, src->clone()->set_data_type(DataType::QASYMM8)); + + // Configure kernel window + auto win_config = calculate_max_window(*src, Steps()); + ICpuKernel::configure(win_config); + + // Check if we need to clamp the result using min and max + const bool is_bounded_relu = !(min <= 0 && max >= 255); + _func = is_bounded_relu ? 
&CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_internal<true> + : &CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_internal<false>; +} + +Status CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate( + const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min, int max) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, min, max)); + return Status{}; +} + +void CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_op(ITensorPack &tensors, + const Window &window, + const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); + + auto src = tensors.get_const_tensor(TensorType::ACL_SRC); + auto bias = tensors.get_const_tensor(TensorType::ACL_BIAS); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + (this->*_func)(src, bias, dst, window); +} + +const char *CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::name() const +{ + return "CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h new file mode 100644 index 0000000000..45bd742a70 --- /dev/null +++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2017-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32TOUINT8_SCALEBYFIXEDPOINT_KERNEL_H +#define ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32TOUINT8_SCALEBYFIXEDPOINT_KERNEL_H + +#include "arm_compute/core/KernelDescriptors.h" + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +// Forward declaration +class ITensor; +namespace cpu +{ +namespace kernels +{ +/** Kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8 + * + * This kernel takes a final int32 accumulator value (the output of @ref CpuGemmLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8 value. 
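 * Per element this amounts to the following scalar sketch (illustrative only; the helper
 * names are descriptive placeholders for the vectorized finalize_quantization() helper
 * the kernel actually calls):
 * @code
 * int32_t acc = accumulator;                               // int32 GEMMLowp output
 * acc += bias_value;                                       // only if a bias tensor is given
 * acc  = rounding_doubling_high_mul(acc, result_fixedpoint_multiplier);
 * acc  = rounding_divide_by_pow2(acc, result_shift);       // round-to-nearest shift
 * acc += result_offset_after_shift;                        // requantization offset
 * uint8_t out = saturate_to_uint8(acc);                    // narrow to QASYMM8
 * out = clamp(out, min, max);                              // only when bounded ReLU is active
 * @endcode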
+ * The following computations will be performed by the kernel: + * + * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier + * -# Add bias to final result if bias tensor is not a nullptr + * -# Round to nearest division by a power-of-two using result_shift + * -# Add offset to each result + * -# Clamp the value between the specified min and max bounds + * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8. + * + */ +class CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel + : public ICpuKernel<CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel> +{ +public: + CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel); + /** Initialise the kernel's input and output. + * + * @param[in] src Input tensor info. Data type supported: S32 + * @param[in] bias Biases tensor info. Only shared biases supported and it can be a nullptr if the biases addition is not required. + * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. + * @param[out] dst Output tensor info. Data type supported: Data type supported: QASYMM8 + * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add + * @param[in] result_shift Integer value used to round to nearest division by a power-of-two the result after the fixed point multiplication + * @param[in] result_offset_after_shift Offset to be applied to result before converting it back to QASYMM8 + * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8 + * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8, + * Along with @p min, this value can be used to implement "rectified linear unit" activation functions + */ + void configure(ITensorInfo *src, + ITensorInfo *bias, + ITensorInfo *dst, + int result_fixedpoint_multiplier, + int result_shift, + int result_offset_after_shift, + int min = 0, + int max = 0); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::configure() + * + * @return a status + */ + static Status + validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min = 0, int max = 0); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + +private: + /** Template function to run the CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel + * + * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()). + */ + template <bool is_bounded_relu> + void run_internal(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window); + + /** Common signature for all the specialised CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel functions + * + * @param[in] window Region on which to execute the kernel. 
+ */ + using QuantizeDownFunctionPtr = void (CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::*)( + const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window); + + QuantizeDownFunctionPtr _func{nullptr}; + int _result_fixedpoint_multiplier{0}; + int _result_shift{0}; + int _result_offset_after_shift{0}; + int _min{0}; + int _max{0}; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32TOUINT8_SCALEBYFIXEDPOINT_KERNEL_H */ diff --git a/src/cpu/kernels/CpuGemmMatrixAdditionKernel.cpp b/src/cpu/kernels/CpuGemmMatrixAdditionKernel.cpp new file mode 100644 index 0000000000..fb1b70b91f --- /dev/null +++ b/src/cpu/kernels/CpuGemmMatrixAdditionKernel.cpp @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2016-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/cpu/kernels/CpuGemmMatrixAdditionKernel.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" + +#include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NEFixedPoint.h" +#include "src/cpu/kernels/gemm_matrix_add/list.h" +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +static const std::vector<CpuGemmMatrixAdditionKernel::GemmMatrixAddKernel> available_kernels = { + {"neon_fp32_gemm_matrix_add", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32); }, + REGISTER_FP32_NEON(neon_fp32_gemm_matrix_add)}, + {"neon_fp16_gemm_matrix_add", + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.fp16; }, + REGISTER_FP16_NEON(neon_fp16_gemm_matrix_add)}, + +}; +} // namespace + +void CpuGemmMatrixAdditionKernel::configure(const ITensorInfo *src, ITensorInfo *dst, float beta) +{ + ARM_COMPUTE_UNUSED(dst); + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + + // Perform validation step + ARM_COMPUTE_ERROR_THROW_ON(CpuGemmMatrixAdditionKernel::validate(src, dst, beta)); + + _beta = beta; + const auto uk = CpuGemmMatrixAdditionKernel::get_implementation( + DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()}); + ARM_COMPUTE_ERROR_ON_NULLPTR(uk); + _func = uk->ukernel; + // Configure kernel window + Window win = calculate_max_window(*src, Steps()); + ICPPKernel::configure(win); +} + +Status CpuGemmMatrixAdditionKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, float beta) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_UNUSED(beta); + + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); + + if (dst->total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); + } + return Status{}; +} + +void CpuGemmMatrixAdditionKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + ARM_COMPUTE_ERROR_ON(tensors.empty()); + + const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC); + ITensor *dst = tensors.get_tensor(TensorType::ACL_DST); + + if (_beta != 0.0f) + { + (*_func)(src, dst, window, _beta); + } +} + +const char *CpuGemmMatrixAdditionKernel::name() const +{ + return "CpuGemmMatrixAdditionKernel"; +} + +const std::vector<CpuGemmMatrixAdditionKernel::GemmMatrixAddKernel> & +CpuGemmMatrixAdditionKernel::get_available_kernels() +{ + return available_kernels; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuGemmMatrixAdditionKernel.h b/src/cpu/kernels/CpuGemmMatrixAdditionKernel.h new file mode 100644 index 0000000000..5e12f1dcbd --- /dev/null +++ b/src/cpu/kernels/CpuGemmMatrixAdditionKernel.h @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2016-2022 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_GEMM_MATRIX_ADDITION_KERNEL_H +#define ARM_COMPUTE_CPU_GEMM_MATRIX_ADDITION_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Kernel to perform the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta: + * + * @note [ MTX_OUT = MTX_0 + beta * MTX_1 ] with MTX_0 and MTX_1 of the same size + * + * @note This stage is used to finalize the GEMM result and it is computed if and only if beta != 0.0. In case this kernel is used for finalizing GEMM result, we have: + * - MTX_0 = A * B * alpha, where MTX_0 is the output of @ref CpuGemmMatrixMultiplyKernel + * - MTX_1 = C + */ +class CpuGemmMatrixAdditionKernel : public ICpuKernel<CpuGemmMatrixAdditionKernel> +{ +private: + using GemmMatrixAddKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, const Window &, float)>::type; + +public: + struct GemmMatrixAddKernel + { + const char *name; + const DataTypeISASelectorPtr is_selected; + GemmMatrixAddKernelPtr ukernel; + }; + CpuGemmMatrixAdditionKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmMatrixAdditionKernel); + /** Initialise the kernel's input and output. + * + * @note The input and output tensor must have the same dimensions + * + * @param[in] src Input tensor info (Matrix C). Data types supported: F16/F32 + * @param[in, out] dst Output tensor info. If this kernel is used to finalize the GEMM result, output contains the result obtained by the kernel @ref CpuGemmMatrixMultiplyKernel. Data type supported: the same as @p src. + * @param[in] beta Weight of matrix C + */ + void configure(const ITensorInfo *src, ITensorInfo *dst, float beta); + /** Static function to check if given info will lead to a valid configuration of @ref CpuGemmMatrixAdditionKernel. 
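 * (In-place semantics: for every element, dst = dst + beta * src; run_op() skips the
 * addition entirely when beta == 0, since the GEMM result needs no finalisation then.)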
+ * + * @note The input and output tensor must have the same dimensions + * + * Similar to @ref CpuGemmMatrixAdditionKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst, float beta); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + + static const std::vector<GemmMatrixAddKernel> &get_available_kernels(); + +private: + /** Common signature for all the matrix addition functions + * + * @param[in] src An input tensor. Data types supported: F16/F32 + * @param[out] dst The output tensor. Data type supported: same as @p src + * @param[in] window Region on which to execute the kernel. + * @param[in] beta Weight of matrix C + */ + /** Matrix addition function to use for the particular tensor types passed to configure() */ + GemmMatrixAddKernelPtr _func{nullptr}; + float _beta{0.f}; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_GEMM_MATRIX_ADDITION_KERNEL_H */ diff --git a/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.cpp b/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.cpp new file mode 100644 index 0000000000..beccd94844 --- /dev/null +++ b/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.cpp @@ -0,0 +1,213 @@ +/* + * Copyright (c) 2017-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
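Like the other kernels added in this patch, the matrix-addition kernel separates validate()/configure(), which only see ITensorInfo descriptors, from run_op(), which receives the real tensors through an ITensorPack at execution time. The sketch below is a hypothetical direct caller, not part of this patch: the example_gemm_matrix_add wrapper, the 64x32 shapes and beta = 0.5 are made up for illustration, and in practice the kernel is driven by a higher-level operator rather than called like this.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/Tensor.h"

    #include "src/cpu/kernels/CpuGemmMatrixAdditionKernel.h" // internal header, shown for illustration only

    void example_gemm_matrix_add()
    {
        using namespace arm_compute;

        TensorInfo c_info(TensorShape(64U, 32U), 1, DataType::F32); // matrix C
        TensorInfo d_info(TensorShape(64U, 32U), 1, DataType::F32); // holds the A * B * alpha result

        cpu::kernels::CpuGemmMatrixAdditionKernel k;
        ARM_COMPUTE_ERROR_THROW_ON(cpu::kernels::CpuGemmMatrixAdditionKernel::validate(&c_info, &d_info, 0.5f));
        k.configure(&c_info, &d_info, 0.5f); // beta = 0.5; with beta == 0, run_op() leaves dst untouched

        Tensor c, d;
        c.allocator()->init(c_info);
        d.allocator()->init(d_info);
        c.allocator()->allocate();
        d.allocator()->allocate();
        // ... fill c with matrix C and d with the matrix-multiply result ...

        ITensorPack pack;
        pack.add_const_tensor(TensorType::ACL_SRC, &c);
        pack.add_tensor(TensorType::ACL_DST, &d);
        k.run_op(pack, k.window(), ThreadInfo{}); // in place: d += 0.5f * c
    }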
+ */ +#include "src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" + +#include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/cpu/kernels/gemm_matrix_mul/list.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +static const std::vector<CpuGemmMatrixMultiplyKernel::GemmMatrixMulKernel> available_kernels = { + {"neon_fp32_gemm_matrix_mul", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32); }, + REGISTER_FP32_NEON(neon_fp32_gemm_matrix_mul)}, + {"neon_fp16_gemm_matrix_mul", + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.fp16; }, + REGISTER_FP16_NEON(neon_fp16_gemm_matrix_mul)}, +}; + +inline Status validate_arguments(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *dst, + float alpha, + bool is_interleaved, + const GEMMReshapeInfo &reshape_info) +{ + ARM_COMPUTE_UNUSED(alpha); + + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(lhs); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs, dst); + + if (!is_interleaved) + { + ARM_COMPUTE_RETURN_ERROR_ON(lhs->dimension(0) != rhs->dimension(1)); + + if (dst->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON(rhs->dimension(0) != dst->dimension(0)); + ARM_COMPUTE_RETURN_ERROR_ON(lhs->dimension(1) != dst->dimension(1)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst); + } + } + else + { + const int m = reshape_info.m(); + const int n = reshape_info.n(); + const int k = reshape_info.k(); + const int mult_transpose1xW_width = reshape_info.mult_transpose1xW_width(); + const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height(); + + /* Interleave */ + TensorShape tensor_shape0{lhs->tensor_shape()}; + tensor_shape0.set(0, k); + tensor_shape0.set(1, m); + + const TensorInfo tensor_info0 = lhs->clone()->set_tensor_shape(tensor_shape0); + const TensorInfo tensor_info_reshaped0 = lhs->clone()->set_tensor_shape( + misc::shape_calculator::compute_interleaved_shape(tensor_info0, mult_interleave4x4_height)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lhs, &tensor_info_reshaped0); + + if (n != 0) /* Transpose */ + { + TensorShape tensor_shape1{rhs->tensor_shape()}; + tensor_shape1.set(0, n); + tensor_shape1.set(1, k); + + const TensorInfo tensor_info1 = rhs->clone()->set_tensor_shape(tensor_shape1); + const TensorInfo tensor_info_reshaped1 = + rhs->clone()->set_tensor_shape(misc::shape_calculator::compute_transpose1xW_with_element_size_shape( + tensor_info1, mult_transpose1xW_width)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(rhs, &tensor_info_reshaped1); + } + + if (dst->total_size() != 0) + { + if (n != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON(dst->dimension(0) != static_cast<size_t>(n)); + } + ARM_COMPUTE_RETURN_ERROR_ON(dst->dimension(1) != static_cast<size_t>(m)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst); + } + } + + return Status{}; +} + +} // namespace + +void CpuGemmMatrixMultiplyKernel::configure(const ITensorInfo *lhs, + const ITensorInfo *rhs, + ITensorInfo *dst, + float alpha, + bool is_interleaved, + const GEMMReshapeInfo 
&reshape_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst); + + // dst tensor auto initialization if not yet initialized + TensorShape tensor_shape{lhs->tensor_shape()}; + tensor_shape.set(0, is_interleaved ? reshape_info.n() : rhs->dimension(0)); + tensor_shape.set(1, is_interleaved ? reshape_info.m() : lhs->dimension(1)); + + auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(tensor_shape)); + + // Perform validate step + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(lhs, rhs, dst, alpha, is_interleaved, reshape_info)); + + _alpha = alpha; + + // Configure kernel window + Window win{}; + + // Check if the dst tensor is a vector. If so, the kernel runs the vector-matrix multiplication + const bool is_dst_vector = (dst->dimension(1) == 1); + if (is_dst_vector) + { + const unsigned int num_elems_processed_per_iteration_x = (lhs->data_type() == DataType::F32) ? 16 : 32; + + win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x)); + } + else + { + constexpr unsigned int num_elems_processed_per_iteration_x = 8; + constexpr unsigned int num_elems_processed_per_iteration_y = 4; + + win = + calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + } + + const auto uk = CpuGemmMatrixMultiplyKernel::get_implementation( + DataTypeISASelectorData{lhs->data_type(), CPUInfo::get().get_isa()}); + ARM_COMPUTE_ERROR_ON_NULLPTR(uk); + _func = uk->ukernel; + + ICPPKernel::configure(win); +} + +Status CpuGemmMatrixMultiplyKernel::validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *dst, + float alpha, + bool is_interleaved, + const GEMMReshapeInfo &reshape_info) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(lhs, rhs, dst, alpha, is_interleaved, reshape_info)); + + return Status{}; +} + +void CpuGemmMatrixMultiplyKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + ARM_COMPUTE_ERROR_ON(tensors.empty()); + ARM_COMPUTE_ERROR_ON(_func == nullptr); + + const ITensor *lhs = tensors.get_const_tensor(TensorType::ACL_SRC_0); + const ITensor *rhs = tensors.get_const_tensor(TensorType::ACL_SRC_1); + ITensor *dst = tensors.get_tensor(TensorType::ACL_DST); + + const bool is_dst_vector = (dst->info()->dimension(1) == 1); + (*_func)(lhs, rhs, dst, window, info, _alpha, is_dst_vector); +} + +const char *CpuGemmMatrixMultiplyKernel::name() const +{ + return "CpuGemmMatrixMultiplyKernel"; +} + +const std::vector<CpuGemmMatrixMultiplyKernel::GemmMatrixMulKernel> & +CpuGemmMatrixMultiplyKernel::get_available_kernels() +{ + return available_kernels; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h b/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h new file mode 100644 index 0000000000..765fcb8275 --- /dev/null +++ b/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2017-2022 Arm Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_GEMM_MATRIX_MULTIPLY_KERNEL_H +#define ARM_COMPUTE_CPU_GEMM_MATRIX_MULTIPLY_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Kernel to multiply two input matrices "A" and "B". All elements of the output matrix/vector will be multiplied by alpha after the matrix multiplication + * + * @note If the output tensor is a matrix, the implementation assumes that the input tensors @p lhs and @p rhs are both matrices and reshaped respectively with @ref CpuGemmInterleave4x4Kernel" and @ref CpuGemmTranspose1xWKernel + * @note If the output tensor is a vector and the data type is F32, the implementation assumes that the first input tensor @p lhs is a vector and the second input tensor @p rhs a matrix. The implementation also assumes that both tensors have not been reshaped + * + */ +class CpuGemmMatrixMultiplyKernel : public ICpuKernel<CpuGemmMatrixMultiplyKernel> +{ +private: + using GemmMatrixMulKernelPtr = std::add_pointer<void( + const ITensor *, const ITensor *, ITensor *, const Window &, const ThreadInfo &, float, const bool)>::type; + +public: + struct GemmMatrixMulKernel + { + const char *name; + const DataTypeISASelectorPtr is_selected; + GemmMatrixMulKernelPtr ukernel; + }; + + CpuGemmMatrixMultiplyKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmMatrixMultiplyKernel); + /** Initialise the kernel's input and output. + * + * @note If the output tensor is a matrix, the input matrices @p lhs and @p rhs should be the output of the kernels: @ref CpuGemmInterleave4x4Kernel and @ref CpuGemmTranspose1xWKernel + * These two kernels change the layout of the original matrices to be more cache-friendly. + * + * @param[in] lhs Left-handside tensor info containing the interleaved Matrix A or the vector A. Data types supported: F16/F32 + * @param[in] rhs Right-handside tensor info containing the transposed Matrix B if the first input tensor A is not a vector. + * If the output tensor is a vector, rhs must contain the matrix B not reshaped. Data type supported: same as @p lhs + * @param[out] dst Output tensor to store the result of matrix multiplication. Data type supported: same as @p lhs. 
+ * @param[in] alpha Weight of the matrix product + * @param[in] is_interleaved (Optional) True if lhs and rhs have been reshaped respectively using @ref CpuGemmInterleave4x4Kernel and @ref CpuGemmTranspose1xWKernel + * @param[in] reshape_info (Optional) GEMM reshape info. If is_interleaved_transposed = true, this object must contain the information to understand how @p lhs and @p rhs have been reshaped + */ + void configure(const ITensorInfo *lhs, + const ITensorInfo *rhs, + ITensorInfo *dst, + float alpha, + bool is_interleaved, + const GEMMReshapeInfo &reshape_info = GEMMReshapeInfo()); + /** Static function to check if given info will lead to a valid configuration of @ref CpuGemmMatrixMultiplyKernel + * + * Similar to @ref CpuGemmMatrixMultiplyKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *dst, + float alpha, + bool is_interleaved, + const GEMMReshapeInfo &reshape_info); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + + static const std::vector<GemmMatrixMulKernel> &get_available_kernels(); + +private: + /** Common signature for all the matrix multiply functions + * + * @param[in] lhs Left-handside input tensor. Data types supported: F16/F32 + * @param[in] rhs Right-handside input tensor. Data types supported: same as @p lhs + * @param[out] dst The output tensor. Data type supported: same as @p rhs + * @param[in] window Region on which to execute the kernel. + * @param[in] info Thread info metadata. + * @param[in] alpha Weight of the matrix product. + */ + + /** Matrix multiply function to use for the particular tensor types passed to configure() */ + GemmMatrixMulKernelPtr _func{nullptr}; + float _alpha{1.f}; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_GEMM_MATRIX_MULTIPLY_KERNEL_H */ diff --git a/src/cpu/kernels/CpuGemmTranspose1xWKernel.cpp b/src/cpu/kernels/CpuGemmTranspose1xWKernel.cpp new file mode 100644 index 0000000000..c47746bc4b --- /dev/null +++ b/src/cpu/kernels/CpuGemmTranspose1xWKernel.cpp @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2016-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
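For the vector-times-matrix path documented above (the destination has height 1, so lhs is treated as a plain vector and rhs as a non-reshaped matrix), the following hedged sketch shows a shape setup that satisfies the checks in validate_arguments(); the example_* wrapper and the K = 128 / N = 64 sizes are illustrative assumptions, not values from this patch.

    #include "arm_compute/core/TensorInfo.h"

    #include "src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h" // internal header

    void example_gemm_matrix_mul_vector_path()
    {
        using namespace arm_compute;

        const unsigned int K = 128; // accumulation depth
        const unsigned int N = 64;  // output width

        // ACL tensor shapes are listed as [width, height, ...]
        TensorInfo lhs(TensorShape(K, 1U), 1, DataType::F32); // 1 x K row vector
        TensorInfo rhs(TensorShape(N, K), 1, DataType::F32);  // K x N matrix, not reshaped
        TensorInfo dst(TensorShape(N, 1U), 1, DataType::F32); // 1 x N result vector

        const Status s = cpu::kernels::CpuGemmMatrixMultiplyKernel::validate(
            &lhs, &rhs, &dst, /*alpha=*/1.0f, /*is_interleaved=*/false, GEMMReshapeInfo());
        ARM_COMPUTE_ERROR_ON(s.error_code() != ErrorCode::OK);
    }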
+ */ +#include "src/cpu/kernels/CpuGemmTranspose1xWKernel.h" + +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +#include <arm_neon.h> + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +using namespace arm_compute::misc::shape_calculator; + +void CpuGemmTranspose1xWKernel::configure(const ITensorInfo *src, ITensorInfo *dst) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + + // Output tensor auto inizialitation if not yet initialized + auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(*src))); + + // Perform validate step + ARM_COMPUTE_ERROR_THROW_ON(CpuGemmTranspose1xWKernel::validate(src, dst)); + + const size_t vector_size = 16 / src->element_size(); + + // Configure kernel window + Window win = calculate_max_window(*src, Steps(vector_size)); + ICPPKernel::configure(win); +} + +Status CpuGemmTranspose1xWKernel::validate(const ITensorInfo *src, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); + ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); + //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions. + + if (dst->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), + compute_transpose1xW_with_element_size_shape(*src)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); + } + + return Status{}; +} + +void CpuGemmTranspose1xWKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + ARM_COMPUTE_ERROR_ON(tensors.empty()); + + /* + * Following an example of how the transposition1xW works when the src data type is F32 + * + * |a00 a01 a02 a03| + * |a10 a11 a12 a13| + * |a20 a21 a22 a23| = | a00 a01 a02 a03 || a10 a11 a12 a13 || a20 a21 a22 a23 || a30 a31 a32 a33 | + * |a30 a31 a32 a33| + * + * The dst matrix will have the following shape: [ height * W, ceil(width / W) ], where W = (16 / element size of the tensor) + */ + + // Set window for dst tensor. 
Set to 0 the X and Y dimensions in order to allow multi-threading implementation and future batched matrix multiplications + Window win_out(window); + win_out.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_out.set(Window::DimY, Window::Dimension(0, 0, 0)); + + const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC); + ITensor *dst = tensors.get_tensor(TensorType::ACL_DST); + + Iterator in(src, window); + Iterator out(dst, win_out); + + const size_t in_width = src->info()->dimension(0); + const size_t element_size = src->info()->element_size(); + const size_t out_stride = dst->info()->strides_in_bytes()[1]; + const size_t vector_size = 16 / element_size; + + execute_window_loop( + window, + [&](const Coordinates &id) + { + const uint8_t *in_ptr = in.ptr(); + uint8_t *const out_ptr = + out.ptr() + (id.y() * vector_size) * element_size + (id.x() / vector_size) * out_stride; + + for (size_t k = 0; k < vector_size; ++k) + { + // If the src width is not multiple of W, we fill the reference with 0s + if ((id.x() + k) >= in_width) + { + std::memset(out_ptr + k * element_size, 0, element_size); + } + else + { + std::memcpy(out_ptr + k * element_size, in_ptr + k * element_size, element_size); + } + } + }, + in, out); +} + +const char *CpuGemmTranspose1xWKernel::name() const +{ + return "CpuGemmTranspose1xWKernel"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuGemmTranspose1xWKernel.h b/src/cpu/kernels/CpuGemmTranspose1xWKernel.h new file mode 100644 index 0000000000..4b834b2cc6 --- /dev/null +++ b/src/cpu/kernels/CpuGemmTranspose1xWKernel.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2016-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ARM_COMPUTE_CPU_GEMM_TRANSPOSE1xW_KERNEL_H +#define ARM_COMPUTE_CPU_GEMM_TRANSPOSE1xW_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Kernel which transposes the elements of a matrix in chunks of 1xW, where W is equal to (16 / element size of the tensor) + * + * Following an example of how the transposition1xW works when the input data is F32 + * + * @f[ + * \left( \begin{array}{cccc} + * a00 & a01 & a02 & a03 \\ + * a10 & a11 & a12 & a13 \\ + * a20 & a21 & a22 & a23 \\ + * a30 & a31 & a32 & a33 \\ + * \end{array} \right) + * \rightarrow + * \left( \begin{array}{ccccccccccccccccc} + * a00 & a01 & a02 & a03 & a10 & a11 & a12 & a13 & a20 & a21 & a22 & a23 & a30 & a31 & a32 & a33 \\ + * \end{array} \right) + * @f] + * + * Following an example of how the transposition1xW works when the input data type is F16 + * + * @f[ + * \left( \begin{array}{cccccccc} + * a00 & a01 & a02 & a03 & a04 & a05 & a06 & a07 \\ + * a10 & a11 & a12 & a13 & a14 & a15 & a16 & a17 \\ + * a20 & a21 & a22 & a23 & a24 & a25 & a26 & a27 \\ + * a30 & a31 & a32 & a33 & a34 & a35 & a36 & a37 \\ + * \end{array} \right) + * \rightarrow + * \left( \begin{array}{cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc} + * a00 & a01 & a02 & a03 & a04 & a05 & a06 & a07 & a10 & a11 & a12 & a13 & a14 & a15 & a16 & a17 & a20 & a21 & a22 & a23 & a24 & a25 & a26 & a27 & a30 & a31 & a32 & a33 & a34 & a35 & a36 & a37\\ + * \end{array} \right) + * @f] + * + * @note The output matrix will have the following shape: [ height * W, ceil(width / W) ], where W = (16 / element size of the tensor) + * + */ +class CpuGemmTranspose1xWKernel : public ICpuKernel<CpuGemmTranspose1xWKernel> +{ +public: + CpuGemmTranspose1xWKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmTranspose1xWKernel); + /** Configure kernel for a given list of arguments + * + * @param[in] src Input tensor info. Data types supported: All + * @param[out] dst Output tensor info. Data type supported: same as @p src. + */ + void configure(const ITensorInfo *src, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration of @ref CpuGemmTranspose1xWKernel + * + * Similar to @ref CpuGemmTranspose1xWKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_GEMM_TRANSPOSE1xW_KERNEL_H */ diff --git a/src/cpu/kernels/CpuIm2ColKernel.cpp b/src/cpu/kernels/CpuIm2ColKernel.cpp new file mode 100644 index 0000000000..39ba764c78 --- /dev/null +++ b/src/cpu/kernels/CpuIm2ColKernel.cpp @@ -0,0 +1,423 @@ +/* + * Copyright (c) 2017-2023 Arm Limited. 
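The shape rule stated in the note above ([ height * W, ceil(width / W) ] with W = 16 / element size) is the same one configure() obtains from the shape-calculator helper, so it can be sanity-checked directly. The 4x4 F32 matrix below is only an illustrative case, and the example_* wrapper is hypothetical:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/utils/misc/ShapeCalculator.h"

    void example_transpose1xw_shape()
    {
        using namespace arm_compute;
        using namespace arm_compute::misc::shape_calculator;

        // F32 has a 4-byte element size, so W = 16 / 4 = 4 elements per 1xW chunk
        const TensorInfo  src(TensorShape(4U, 4U), 1, DataType::F32);
        const TensorShape dst_shape = compute_transpose1xW_with_element_size_shape(src);
        // dst_shape comes out as [ 4 * 4, ceil(4 / 4) ] = [16, 1]
    }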
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuIm2ColKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Size2D.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" + +#include "src/core/CPP/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/cpu/kernels/directconv2d/impl.h" +#include "src/cpu/kernels/directconv2d/list.h" + +#include <arm_neon.h> +#include <cstddef> +#include <cstdint> +#include <cstring> +#include <tuple> + +namespace arm_compute +{ +using namespace misc::shape_calculator; +namespace cpu +{ +namespace kernels +{ +void run_im2col_fp32_pad(const ITensor *src, + ITensor *dst, + const Window &window, + DataLayout data_layout, + const PadStrideInfo &conv_info, + std::pair<unsigned int, unsigned int> convolved_dims, + const Size2D &kernel_dims, + const Size2D &dilation, + uint32_t input_pad_right, + bool has_bias) +{ + arm_compute::cpu::kernels::run_im2col<float, true, false>(src, dst, window, data_layout, conv_info, convolved_dims, + kernel_dims, dilation, input_pad_right, has_bias); +} + +void run_im2col_fp32_nopad(const ITensor *src, + ITensor *dst, + const Window &window, + DataLayout data_layout, + const PadStrideInfo &conv_info, + std::pair<unsigned int, unsigned int> convolved_dims, + const Size2D &kernel_dims, + const Size2D &dilation, + uint32_t input_pad_right, + bool has_bias) +{ + arm_compute::cpu::kernels::run_im2col<float, false, false>(src, dst, window, data_layout, conv_info, convolved_dims, + kernel_dims, dilation, input_pad_right, has_bias); +} + +#if defined(ARM_COMPUTE_ENABLE_BF16) +void run_im2col_bf16_pad(const ITensor *src, + ITensor *dst, + const Window &window, + DataLayout data_layout, + const PadStrideInfo &conv_info, + std::pair<unsigned int, unsigned int> convolved_dims, + const Size2D &kernel_dims, + const Size2D &dilation, + uint32_t input_pad_right, + bool has_bias) +{ + arm_compute::cpu::kernels::run_im2col<bfloat16, true, false>( + src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, has_bias); +} + +void run_im2col_bf16_nopad(const ITensor *src, + ITensor *dst, + const Window &window, + DataLayout data_layout, 
+ const PadStrideInfo &conv_info, + std::pair<unsigned int, unsigned int> convolved_dims, + const Size2D &kernel_dims, + const Size2D &dilation, + uint32_t input_pad_right, + bool has_bias) +{ + arm_compute::cpu::kernels::run_im2col<bfloat16, false, false>( + src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, has_bias); +} +#endif /* defined(ARM_COMPUTE_ENABLE_BF16) */ + +void run_im2col_int8_nopad_nhwc(const ITensor *src, + ITensor *dst, + const Window &window, + DataLayout data_layout, + const PadStrideInfo &conv_info, + std::pair<unsigned int, unsigned int> convolved_dims, + const Size2D &kernel_dims, + const Size2D &dilation, + uint32_t input_pad_right, + bool has_bias) +{ + arm_compute::cpu::kernels::run_im2col<int8_t, false, false>( + src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, has_bias); +} + +void run_im2col_uint8_nopad_nhwc(const ITensor *src, + ITensor *dst, + const Window &window, + DataLayout data_layout, + const PadStrideInfo &conv_info, + std::pair<unsigned int, unsigned int> convolved_dims, + const Size2D &kernel_dims, + const Size2D &dilation, + uint32_t input_pad_right, + bool has_bias) +{ + arm_compute::cpu::kernels::run_im2col<uint8_t, false, false>( + src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, has_bias); +} + +void run_im2col_qasymm8_pad_nhwc(const ITensor *src, + ITensor *dst, + const Window &window, + DataLayout data_layout, + const PadStrideInfo &conv_info, + std::pair<unsigned int, unsigned int> convolved_dims, + const Size2D &kernel_dims, + const Size2D &dilation, + uint32_t input_pad_right, + bool has_bias) +{ + arm_compute::cpu::kernels::run_im2col<qasymm8_t, true, false>( + src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, has_bias); +} + +void internal_run_im2col_fp16_pad(const ITensor *src, + ITensor *dst, + const Window &window, + DataLayout data_layout, + const PadStrideInfo &conv_info, + std::pair<unsigned int, unsigned int> convolved_dims, + const Size2D &kernel_dims, + const Size2D &dilation, + uint32_t input_pad_right, + bool has_bias) +{ +/* + Note that when building with the option data_type_support=fp32 the fp16.cpp files won't be compiled and the linker + would fail with the error undefined arm_compute::cpu::kernels::run_im2col_fp16_pad. + To avoid this problem we only call to the actual fp16 kernel if ENABLE_FP16_KERNELS is defined. 
+*/ +#if defined(ENABLE_FP16_KERNELS) + arm_compute::cpu::kernels::run_im2col_fp16_pad(src, dst, window, data_layout, conv_info, convolved_dims, + kernel_dims, dilation, input_pad_right, has_bias); +#else // defined(ENABLE_FP16_KERNELS) + ARM_COMPUTE_UNUSED(src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, + has_bias); +#endif // defined(ENABLE_FP16_KERNELS) +} + +void internal_run_im2col_fp16_nopad(const ITensor *src, + ITensor *dst, + const Window &window, + DataLayout data_layout, + const PadStrideInfo &conv_info, + std::pair<unsigned int, unsigned int> convolved_dims, + const Size2D &kernel_dims, + const Size2D &dilation, + uint32_t input_pad_right, + bool has_bias) +{ +#if defined(ENABLE_FP16_KERNELS) + arm_compute::cpu::kernels::run_im2col_fp16_nopad(src, dst, window, data_layout, conv_info, convolved_dims, + kernel_dims, dilation, input_pad_right, has_bias); +#else // defined(ENABLE_FP16_KERNELS) + ARM_COMPUTE_UNUSED(src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, + has_bias); +#endif // defined(ENABLE_FP16_KERNELS) +} + +void internal_run_im2col_fp16_nchw_pad(const ITensor *src, + ITensor *dst, + const Window &window, + DataLayout data_layout, + const PadStrideInfo &conv_info, + std::pair<unsigned int, unsigned int> convolved_dims, + const Size2D &kernel_dims, + const Size2D &dilation, + uint32_t input_pad_right, + bool has_bias) +{ +#if defined(ENABLE_FP16_KERNELS) + arm_compute::cpu::kernels::run_im2col_fp16_nchw_pad(src, dst, window, data_layout, conv_info, convolved_dims, + kernel_dims, dilation, input_pad_right, has_bias); +#else // defined(ENABLE_FP16_KERNELS) + ARM_COMPUTE_UNUSED(src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, + has_bias); +#endif // defined(ENABLE_FP16_KERNELS) +} + +void internal_run_im2col_fp16_nchw_nopad(const ITensor *src, + ITensor *dst, + const Window &window, + DataLayout data_layout, + const PadStrideInfo &conv_info, + std::pair<unsigned int, unsigned int> convolved_dims, + const Size2D &kernel_dims, + const Size2D &dilation, + uint32_t input_pad_right, + bool has_bias) +{ +#if defined(ENABLE_FP16_KERNELS) + arm_compute::cpu::kernels::run_im2col_fp16_nchw_nopad(src, dst, window, data_layout, conv_info, convolved_dims, + kernel_dims, dilation, input_pad_right, has_bias); +#else // defined(ENABLE_FP16_KERNELS) + ARM_COMPUTE_UNUSED(src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, + has_bias); +#endif // defined(ENABLE_FP16_KERNELS) +} + +namespace +{ +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *output, + const Size2D &kernel_dims, + const PadStrideInfo &conv_info, + bool has_bias, + const Size2D &dilation, + unsigned int num_groups, + unsigned int input_pad_right) +{ + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::BFLOAT16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized(input->data_type()) && has_bias); + ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1)); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups > 1, "Number of groups greater than one are not supported on Neon"); + + // Since there's no implicit padding added, check the total input spatial dimensions (with conv paddings) are big enough for the 
kernel dimensions + const unsigned int width_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH); + const unsigned int height_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); + const unsigned total_width = input->dimension(width_idx) + conv_info.pad_left() + conv_info.pad_right(); + const unsigned total_height = input->dimension(height_idx) + conv_info.pad_top() + conv_info.pad_bottom(); + ARM_COMPUTE_RETURN_ERROR_ON((total_width < kernel_dims.width) || (total_height < kernel_dims.height)); + + if (output->total_size() > 0) + { + TensorInfo expected_output = output->clone()->set_tensor_shape(compute_im2col_conv_shape( + input, kernel_dims, conv_info, has_bias, dilation, false, num_groups, input_pad_right)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); + } + + return Status{}; +} +} // namespace + +void CpuIm2ColKernel::configure(const ITensorInfo *src, + ITensorInfo *dst, + const Size2D &kernel_dims, + const PadStrideInfo &conv_info, + bool has_bias, + const Size2D &dilation, + unsigned int num_groups, + unsigned int input_pad_right) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups, input_pad_right)); + ARM_COMPUTE_UNUSED(num_groups); + + _data_layout = src->data_layout(); + const unsigned int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); + const unsigned int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); + const unsigned int channel_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL); + + _conv_info = conv_info; + _kernel_width = kernel_dims.width; + _kernel_height = kernel_dims.height; + _input_pad_right = input_pad_right; + _dilation = dilation; + _convolved_dims = scaled_dimensions(src->dimension(width_idx), dst->dimension(height_idx), _kernel_width, + _kernel_height, _conv_info, _dilation); + _has_bias = has_bias; + + if (_data_layout == DataLayout::NCHW) + { + switch (src->data_type()) + { + case DataType::F32: + _func = (!conv_info.has_padding()) ? &run_im2col_fp32_nchw_nopad : &run_im2col_fp32_nchw_pad; + break; + case DataType::F16: + _func = (!conv_info.has_padding()) ? &internal_run_im2col_fp16_nchw_nopad + : &internal_run_im2col_fp16_nchw_pad; + break; +#if defined(ARM_COMPUTE_ENABLE_BF16) + case DataType::BFLOAT16: + _func = (!conv_info.has_padding()) ? &run_im2col_bf16_nchw_nopad : &run_im2col_bf16_nchw_pad; + break; +#endif /* defined(ARM_COMPUTE_ENABLE_BF16) */ + case DataType::QASYMM8_SIGNED: + case DataType::QASYMM8: + _func = (!conv_info.has_padding()) ? &run_im2col_qasymm8_nchw_nopad : &run_im2col_qasymm8_nchw_pad; + break; + default: + ARM_COMPUTE_ERROR("Data type not supported"); + break; + } + } + else + { + switch (src->data_type()) + { + case DataType::F32: + _func = (!conv_info.has_padding()) ? &run_im2col_fp32_nopad : &run_im2col_fp32_pad; + break; + case DataType::F16: + _func = (!conv_info.has_padding()) ? &internal_run_im2col_fp16_nopad : &internal_run_im2col_fp16_pad; + break; +#if defined(ARM_COMPUTE_ENABLE_BF16) + case DataType::BFLOAT16: + _func = (!conv_info.has_padding()) ? 
&run_im2col_bf16_nopad : &run_im2col_bf16_pad; + break; +#endif /* defined(ARM_COMPUTE_ENABLE_BF16) */ + case DataType::QASYMM8: + _func = (!conv_info.has_padding()) ? &run_im2col_uint8_nopad_nhwc : &run_im2col_qasymm8_pad_nhwc; + break; + case DataType::QASYMM8_SIGNED: + _func = (!conv_info.has_padding()) ? &run_im2col_int8_nopad_nhwc : &run_im2col_qasymm8_pad_nhwc; + break; + default: + ARM_COMPUTE_ERROR("Data type not supported"); + break; + } + } + + // Output tensor auto initialization if not yet initialized + auto_init_if_empty( + *dst, src->clone()->set_tensor_shape(compute_im2col_conv_shape(src, kernel_dims, conv_info, has_bias, dilation, + false, num_groups, _input_pad_right))); + + std::pair<unsigned int, unsigned int> convolved_dims = + scaled_dimensions(src->dimension(width_idx), src->dimension(height_idx), kernel_dims.width, kernel_dims.height, + conv_info, dilation); + + Window win = calculate_max_window(*src, Steps()); + win.set(width_idx, Window::Dimension(0, convolved_dims.first, 1)); + win.set(height_idx, Window::Dimension(0, convolved_dims.second, 1)); + win.set(channel_idx, Window::Dimension(0, 1, 1)); + // Configure kernel window + ICpuKernel::configure(win); +} + +Status CpuIm2ColKernel::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const Size2D &kernel_dims, + const PadStrideInfo &conv_info, + bool has_bias, + const Size2D &dilation, + unsigned int num_groups, + unsigned int input_pad_right) +{ + ARM_COMPUTE_RETURN_ON_ERROR( + validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups, input_pad_right)); + return Status{}; +} + +void CpuIm2ColKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + + auto src = tensors.get_const_tensor(TensorType::ACL_SRC); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + _func(src, dst, window, _data_layout, _conv_info, _convolved_dims, Size2D(_kernel_width, _kernel_height), _dilation, + _input_pad_right, _has_bias); +} + +const char *CpuIm2ColKernel::name() const +{ + return "CpuIm2ColKernel"; +} + +size_t CpuIm2ColKernel::get_mws(const CPUInfo &platform, size_t thread_count) const +{ + ARM_COMPUTE_UNUSED(thread_count); + ARM_COMPUTE_UNUSED(platform); + + return ICPPKernel::default_mws; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuIm2ColKernel.h b/src/cpu/kernels/CpuIm2ColKernel.h new file mode 100644 index 0000000000..ae7162cccf --- /dev/null +++ b/src/cpu/kernels/CpuIm2ColKernel.h @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2017-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_CPU_KERNELS_CPUIM2COLKERNEL_H +#define ACL_SRC_CPU_KERNELS_CPUIM2COLKERNEL_H + +#include "arm_compute/core/Size2D.h" + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +class ITensor; +namespace cpu +{ +namespace kernels +{ +/** Interface for the im2col reshape kernel. + * + * Rearranges image blocks into columns. It is used to strip out each convolution block to a single column. + * It is used to transform a convolution to a plain matrix multiplication. + * + * For example taking into account the image below and assuming 3x3 image blocks with stride of 1 we have: + * + * @f[ + * \left( \begin{array}{cccc} + * a00 & a01 & a02 & a03 \\ + * a10 & a11 & a12 & a13 \\ + * a20 & a21 & a22 & a23 \\ + * a30 & a31 & a32 & a33 \\ + * \end{array} \right) + * \rightarrow + * \left( \begin{array}{ccccccccc} + * a00 & a01 & a02 & a10 & a11 & a12 & a20 & a21 & a22 \\ + * a01 & a02 & a03 & a11 & a12 & a13 & a21 & a22 & a23 \\ + * a10 & a11 & a12 & a20 & a21 & a22 & a30 & a31 & a32 \\ + * a11 & a12 & a13 & a21 & a22 & a23 & a31 & a32 & a33 \\ + * \end{array} \right) + * @f] + */ +class CpuIm2ColKernel : public ICpuKernel<CpuIm2ColKernel> +{ +public: + /** Default constructor */ + CpuIm2ColKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuIm2ColKernel); + /** Set the input and output of the kernel. + * + * @param[in] src The input tensor info to convert. 3 lower dimensions represent a single input [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of inputs. + * Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32 + * Note: QASYMM8/QASYMM8_SIGNED works only for has_bias = false + * @param[out] dst The output tensor info. Data types supported: Same as @p input + * @param[in] kernel_dims The kernel dimensions (width and height). + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. + * @param[in] has_bias In case biases are provided expands the matrix with 1. + * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. 
num_groups != 1 is not supported + * @param[in] input_pad_right (Optional) When fast-math is selected, per element padding for the im2col matrix may be necessary + */ + void configure(const ITensorInfo *src, + ITensorInfo *dst, + const Size2D &kernel_dims, + const PadStrideInfo &conv_info, + bool has_bias, + const Size2D &dilation = Size2D(1U, 1U), + unsigned int num_groups = 1, + unsigned int input_pad_right = 0); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuIm2ColKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, + const ITensorInfo *dst, + const Size2D &kernel_dims, + const PadStrideInfo &conv_info, + bool has_bias, + const Size2D &dilation = Size2D(1U, 1U), + unsigned int num_groups = 1, + unsigned int input_pad_right = 0); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + /** Return minimum workload size of the relevant kernel + * + * @param[in] platform The CPU platform used to create the context. + * @param[in] thread_count Number of threads in the execution. + * + * @return[out] small_network_mws Minimum workload size for requsted configuration. + */ + size_t get_mws(const CPUInfo &platform, size_t thread_count) const override; + +private: + /** Common signature for all the specialised im2col functions + * + * @param[in] window Region on which to execute the kernel. + */ + using Im2ColFunctionPtr = void (*)(const ITensor *src, + ITensor *dst, + const Window &window, + DataLayout data_layout, + const PadStrideInfo &conv_info, + std::pair<unsigned int, unsigned int> convolved_dims, + const Size2D &kernel_dims, + const Size2D &dilation, + uint32_t input_pad_right, + bool has_bias); + + Im2ColFunctionPtr _func{nullptr}; + std::pair<unsigned int, unsigned int> _convolved_dims{}; + PadStrideInfo _conv_info{}; + unsigned int _kernel_width{0}; + unsigned int _kernel_height{0}; + unsigned int _input_pad_right{0}; + bool _has_bias{false}; + Size2D _dilation{1U, 1U}; + DataLayout _data_layout{DataLayout::UNKNOWN}; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif // ACL_SRC_CPU_KERNELS_CPUIM2COLKERNEL_H diff --git a/src/cpu/kernels/CpuKernelSelectionTypes.h b/src/cpu/kernels/CpuKernelSelectionTypes.h new file mode 100644 index 0000000000..7c1e4772a6 --- /dev/null +++ b/src/cpu/kernels/CpuKernelSelectionTypes.h @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2021-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
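To relate the configure() parameters above to concrete shapes, the hedged sketch below sets up a hypothetical NHWC/F32 case (a 32x32 image with 3 channels, a 3x3 kernel, stride 1, no padding) and relies on the auto-initialization path, which derives the destination from compute_im2col_conv_shape(). The example_* wrapper and all dimensions are illustrative assumptions.

    #include "arm_compute/core/TensorInfo.h"

    #include "src/cpu/kernels/CpuIm2ColKernel.h" // internal header

    void example_im2col_nhwc()
    {
        using namespace arm_compute;

        // NHWC shapes are listed as [C, W, H, N]
        TensorInfo src(TensorShape(3U, 32U, 32U, 1U), 1, DataType::F32);
        src.set_data_layout(DataLayout::NHWC);
        TensorInfo dst; // left empty on purpose; configure() auto-initializes it

        cpu::kernels::CpuIm2ColKernel im2col;
        im2col.configure(&src, &dst, Size2D(3U, 3U), PadStrideInfo(1, 1, 0, 0), /*has_bias=*/false);
        // dst's first two dimensions should now be [3*3*3, 30*30] = [27, 900]:
        // one 27-element column per position of the 30x30 convolved output
    }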
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_CPU_KERNELS_CPUKERNELSELECTIONTYPES_H +#define ACL_SRC_CPU_KERNELS_CPUKERNELSELECTIONTYPES_H + +#include "arm_compute/core/Types.h" + +#include "src/common/cpuinfo/CpuIsaInfo.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +// Selector data types +struct DataTypeISASelectorData +{ + DataType dt; + cpuinfo::CpuIsaInfo isa; +}; + +struct DataTypeDataLayoutISASelectorData +{ + DataType dt; + DataLayout dl; + const cpuinfo::CpuIsaInfo &isa; +}; + +struct CastDataTypeISASelectorData +{ + DataType src_dt; + DataType dst_dt; + const cpuinfo::CpuIsaInfo &isa; +}; + +struct PoolDataTypeISASelectorData +{ + DataType dt; + DataLayout dl; + int pool_stride_x; + Size2D pool_size; + cpuinfo::CpuIsaInfo isa; +}; + +struct ElementwiseDataTypeISASelectorData +{ + DataType dt; + cpuinfo::CpuIsaInfo isa; + int op; +}; +struct DepthwiseConv2dNativeDataTypeISASelectorData +{ + DataType weights_dt; + DataType source_dt; + const cpuinfo::CpuIsaInfo &isa; +}; + +struct ActivationDataTypeISASelectorData +{ + DataType dt; + const CPUModel &cpumodel; + const cpuinfo::CpuIsaInfo &isa; + const ActivationFunction f; +}; + +struct CpuAddKernelDataTypeISASelectorData +{ + DataType dt; + cpuinfo::CpuIsaInfo isa; + bool can_use_fixedpoint; +}; + +struct ScaleKernelDataTypeISASelectorData +{ + DataType dt; + cpuinfo::CpuIsaInfo isa; + InterpolationPolicy interpolation_policy; +}; + +struct SoftmaxKernelDataTypeISASelectorData +{ + DataType dt; + cpuinfo::CpuIsaInfo isa; + bool is_log; + int axis; + unsigned long sme2_vector_length; +}; + +// Selector pointer types +using DataTypeISASelectorPtr = std::add_pointer<bool(const DataTypeISASelectorData &data)>::type; +using DataTypeDataLayoutSelectorPtr = std::add_pointer<bool(const DataTypeDataLayoutISASelectorData &data)>::type; +using PoolDataTypeISASelectorPtr = std::add_pointer<bool(const PoolDataTypeISASelectorData &data)>::type; +using ElementwiseDataTypeISASelectorPtr = std::add_pointer<bool(const ElementwiseDataTypeISASelectorData &data)>::type; +using DepthwiseConv2dNativeDataTypeISASelectorPtr = + std::add_pointer<bool(const DepthwiseConv2dNativeDataTypeISASelectorData &data)>::type; +using CastDataTypeISASelectorDataPtr = std::add_pointer<bool(const CastDataTypeISASelectorData &data)>::type; +using ActivationDataTypeISASelectorDataPtr = + std::add_pointer<bool(const ActivationDataTypeISASelectorData &data)>::type; +using CpuAddKernelDataTypeISASelectorDataPtr = + std::add_pointer<bool(const CpuAddKernelDataTypeISASelectorData &data)>::type; +using ScaleKernelDataTypeISASelectorDataPtr = + std::add_pointer<bool(const ScaleKernelDataTypeISASelectorData &data)>::type; +using SoftmaxKernelDataTypeISASelectorDataPtr = + std::add_pointer<bool(const SoftmaxKernelDataTypeISASelectorData &data)>::type; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute + +#endif // ACL_SRC_CPU_KERNELS_CPUKERNELSELECTIONTYPES_H diff --git a/src/cpu/kernels/CpuMaxUnpoolingLayerKernel.cpp b/src/cpu/kernels/CpuMaxUnpoolingLayerKernel.cpp new file mode 100644 index 0000000000..bcaa76b99b --- /dev/null +++ b/src/cpu/kernels/CpuMaxUnpoolingLayerKernel.cpp @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2020-2023 Arm Limited. 
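These selector structs are the link between the per-kernel available_kernels tables seen earlier in this patch and the get_implementation() lookup inherited from ICpuKernel: each table entry pairs a kernel name and a micro-kernel pointer with an is_selected predicate over one of these structs, and the lookup returns an entry whose predicate matches the queried data type and ISA. A rough sketch of querying a table directly, reusing the GEMM matrix-addition kernel from above (the example_* wrapper is hypothetical):

    #include <iostream>

    #include "arm_compute/core/CPP/CPPTypes.h" // CPUInfo

    #include "src/cpu/kernels/CpuGemmMatrixAdditionKernel.h" // internal header

    void example_kernel_selection()
    {
        using namespace arm_compute;
        using namespace arm_compute::cpu::kernels;

        // Which micro-kernel would be dispatched for F32 on the current machine?
        const auto *impl = CpuGemmMatrixAdditionKernel::get_implementation(
            DataTypeISASelectorData{DataType::F32, CPUInfo::get().get_isa()});
        if (impl != nullptr)
        {
            std::cout << impl->name << std::endl; // e.g. "neon_fp32_gemm_matrix_add"
        }
    }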
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h" + +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/cpu/kernels/maxunpool/list.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +using namespace misc::shape_calculator; + +namespace +{ +static const std::vector<CpuMaxUnpoolingLayerKernel::MaxUnpoolingKernel> available_kernels = { + {"neon_fp32_maxunpooling", [](const DataTypeISASelectorData &data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(neon_fp32_maxunpooling)}, + {"neon_fp16_maxunpooling", + [](const DataTypeISASelectorData &data) { return data.dt == DataType::F16 && data.isa.fp16; }, + REGISTER_FP16_NEON(neon_fp16_maxunpooling)}, + {"neon_qu8_maxunpooling", [](const DataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8; }, + REGISTER_QASYMM8_NEON(neon_qs8_maxunpooling)}, + {"neon_qs8_maxunpooling", [](const DataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8_SIGNED; }, + REGISTER_QASYMM8_SIGNED_NEON(neon_qu8_maxunpooling)}, +}; + +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *indices, + const ITensorInfo *dst, + const PoolingLayerInfo &pool_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, indices, dst); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, indices); + + int pool_stride_x = 0; + int pool_stride_y = 0; + PoolingType pool_type = pool_info.pool_type; + const PadStrideInfo pad_stride_info = pool_info.pad_stride_info; + std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride(); + const int pool_size_x = pool_info.pool_size.width; + const int pool_size_y = pool_info.pool_size.height; + const Size2D pool_size(pool_size_x, pool_size_y); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX, + "Pooling indices only supported for MAX pooling method"); + 
ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_size != Size2D(2, 2)), "Pooling indices only supported for pool size 2x2"); + if (dst->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst); + } + + return Status{}; +} +} // namespace + +void CpuMaxUnpoolingLayerKernel::configure(const ITensorInfo *src, + const ITensorInfo *indices, + ITensorInfo *dst, + const PoolingLayerInfo &pool_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst, indices); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, indices, dst, pool_info)); + ARM_COMPUTE_UNUSED(indices); + + const auto uk = CpuMaxUnpoolingLayerKernel::get_implementation( + DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()}); + ARM_COMPUTE_ERROR_ON_NULLPTR(uk); + _run_method = uk->ukernel; + + const TensorShape output_shape = compute_unpool_shape(*src, pool_info); + auto_init_if_empty(*dst, src->clone()->set_tensor_shape(output_shape)); + + auto window = calculate_max_window(*src, Steps()); + ICpuKernel::configure(window); +} + +Status CpuMaxUnpoolingLayerKernel::validate(const ITensorInfo *src, + const ITensorInfo *indices, + const ITensorInfo *dst, + const PoolingLayerInfo &pool_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, indices, dst); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, indices, dst, pool_info)); + return Status{}; +} + +void CpuMaxUnpoolingLayerKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + + const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0); + const auto indices = tensors.get_const_tensor(TensorType::ACL_SRC_1); + const auto dst = tensors.get_tensor(TensorType::ACL_DST); + + _run_method(src, indices, dst, window); +} + +const char *CpuMaxUnpoolingLayerKernel::name() const +{ + return "CpuMaxUnpoolingLayerKernel"; +} + +const std::vector<CpuMaxUnpoolingLayerKernel::MaxUnpoolingKernel> &CpuMaxUnpoolingLayerKernel::get_available_kernels() +{ + return available_kernels; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h b/src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h new file mode 100644 index 0000000000..5a641a2bea --- /dev/null +++ b/src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2020-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPUMAXUNPOOLINGLAYERKERNEL_H +#define ARM_COMPUTE_CPUMAXUNPOOLINGLAYERKERNEL_H + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Interface for the pooling layer kernel */ +class CpuMaxUnpoolingLayerKernel : public ICpuKernel<CpuMaxUnpoolingLayerKernel> +{ +private: + using MaxUnpoolingUKernelPtr = std::add_pointer<void( + const ITensor *input, const ITensor *indices, ITensor *output, const Window &window)>::type; + +public: + /** Default constructor */ + CpuMaxUnpoolingLayerKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuMaxUnpoolingLayerKernel); + + /** Configure kernel for a given list of arguments + * + * @note Dst shape must be equal to the shape of the original src to pool. + * + * @param[in] src Source tensor to permute. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[in] indices Tensor containing the offset to store the src elements in the dst tensor. + * @ref CpuMaxUnpooling with indices should precede this function in order to + * properly reconstruct the output tensor. + * The tensor shape of this tensor has to be equal to the src tensor shape. Data type supported: U32. + * @param[out] dst Destination tensor. Data types supported: Same as @p src + * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. + */ + void + configure(const ITensorInfo *src, const ITensorInfo *indices, ITensorInfo *dst, const PoolingLayerInfo &pool_info); + /** Static function to check if given info will lead to a valid configuration of @ref CpuMaxUnpoolingLayerKernel + * + * @param[in] src Source tensor to permute. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[in] indices Tensor info of the indices of the maximal values. Data type supported: U32. + * @param[out] dst Destination tensor. Data types supported: Same as @p src + * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. + * + * @return a status + */ + static Status validate(const ITensorInfo *src, + const ITensorInfo *indices, + const ITensorInfo *dst, + const PoolingLayerInfo &pool_info); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + + struct MaxUnpoolingKernel + { + const char *name; + const DataTypeISASelectorPtr is_selected; + MaxUnpoolingUKernelPtr ukernel; + }; + + static const std::vector<MaxUnpoolingKernel> &get_available_kernels(); + + const char *name() const override; + +private: + MaxUnpoolingUKernelPtr _run_method{nullptr}; +}; + +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /*ARM_COMPUTE_CPUMAXUNPOOLINGLAYERKERNEL_H */ diff --git a/src/cpu/kernels/CpuMulKernel.cpp b/src/cpu/kernels/CpuMulKernel.cpp new file mode 100644 index 0000000000..8001482154 --- /dev/null +++ b/src/cpu/kernels/CpuMulKernel.cpp @@ -0,0 +1,1831 @@ +/* + * Copyright (c) 2016-2023 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuMulKernel.h" + +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" + +#include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/NESymm.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/kernels/mul/generic/neon/list.h" + +#include <arm_neon.h> + +namespace +{ +#if defined(ENABLE_FP32_KERNELS) +static constexpr size_t default_mws_N1_fp32_neon = 22447; +static constexpr size_t default_mws_V1_fp32_neon = 38982; +#endif /* ENABLE_FP32_KERNELS */ +static constexpr size_t default_mws_other_platforms_1d_tensor = 10240; +} // namespace +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +const float scale255_constant = 1.f / 255.f; +const float32x4_t scale255_constant_f32q = vdupq_n_f32(scale255_constant); +const float32x4_t positive_round_f32q = vdupq_n_f32(0.5f); + +inline Status validate_arguments(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy) +{ + ARM_COMPUTE_UNUSED(overflow_policy); + ARM_COMPUTE_UNUSED(rounding_policy); + + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src1); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::U8, DataType::QASYMM8, + DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32, + DataType::QSYMM16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 1, DataType::U8, DataType::QASYMM8, + DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32, + DataType::QSYMM16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::U8, DataType::QASYMM8, + DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16, + DataType::S32, DataType::F16, DataType::F32); + if (is_data_type_quantized(src1->data_type()) || is_data_type_quantized(src2->data_type())) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(overflow_policy == ConvertPolicy::WRAP, + "ConvertPolicy cannot be WRAP if datatype is quantized"); + } + + if (dst->total_size() > 0) + { + const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), 
src2->tensor_shape());
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0),
+                                        "Wrong shape for dst");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
+        // clang-format off
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+            !(src1->data_type() == src2->data_type() && src2->data_type() == dst->data_type()) &&
+            !(src1->data_type() == DataType::U8 && src2->data_type() == DataType::U8 && dst->data_type() == DataType::S16) &&
+            !(src1->data_type() == DataType::U8 && src2->data_type() == DataType::S16 && dst->data_type() == DataType::S16) &&
+            !(src1->data_type() == DataType::S16 && src2->data_type() == DataType::U8 && dst->data_type() == DataType::S16) &&
+            !(src1->data_type() == DataType::QSYMM16 && src2->data_type() == DataType::QSYMM16 && dst->data_type() == DataType::S32)
+            , "Invalid data type combination");
+        // clang-format on
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->data_type() == DataType::QSYMM16 && dst->data_type() == DataType::S32 &&
+                                            scale != 1.f,
+                                        "Unsupported scale for QSYMM16 inputs and S32 dst");
+    }
+
+    if (std::abs(scale - scale255_constant) < 0.00001f)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(rounding_policy != RoundingPolicy::TO_NEAREST_UP &&
+                                    rounding_policy != RoundingPolicy::TO_NEAREST_EVEN);
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->data_type() == DataType::S32 && src2->data_type() == DataType::S32 &&
+                                            dst->data_type() == DataType::S32,
+                                        "Scale == 1/255 is not supported if input and dst are of data type S32");
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(rounding_policy != RoundingPolicy::TO_ZERO);
+
+        int         exponent            = 0;
+        const float normalized_mantissa = std::frexp(scale, &exponent);
+
+        // Use int scaling if factor is equal to 1/2^n for 0 <= n <= 15
+        // frexp returns a mantissa of 0.5, so for scale = 1/2^n the exponent lies in the range -14 <= e <= 1
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(!((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1)),
+                                        "Scale value not supported (Should be 1/(2^n) or 1/255)");
+    }
+
+    return Status{};
+}
+
+/* Scales a given vector by 1/255.
+ *
+ * @note This does not work for all cases. e.g. for float of 0.49999999999999994 and large floats.
+ *
+ * @param in Input vector to scale.
+ * @return Scaled dst rounded to nearest (round half up).
+ */ +inline int32x4_t scale255_S32_S32(int32x4_t in) +{ + // Scale + const float32x4_t tmp = vmulq_f32(vcvtq_f32_s32(in), scale255_constant_f32q); + // Round to nearest (round half up) + // Add +0.5 for all values + // Afterwards vcvt rounds toward zero + return vcvtq_s32_f32(vaddq_f32(tmp, positive_round_f32q)); +} + +inline uint16x8_t scale255_U16_U16(uint16x8_t in) +{ + const int32x4_t tmp_s1 = scale255_S32_S32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(in)))); + const int32x4_t tmp_s2 = scale255_S32_S32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(in)))); + return vreinterpretq_u16_s16(vcombine_s16(vmovn_s32(tmp_s2), vmovn_s32(tmp_s1))); +} + +template <typename T> +inline typename std::enable_if<std::is_same<T, int8_t>::value, int8x16_t>::type +vquantize(float32x4x4_t val, const UniformQuantizationInfo &info) +{ + return vquantize_signed(val, info); +} + +template <typename T> +inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint8x16_t>::type +vquantize(float32x4x4_t val, const UniformQuantizationInfo &info) +{ + return vquantize(val, info); +} + +template <typename T> +void mul_saturate_quantized_8(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale) +{ + // Create input windows + Window win = window; + Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const int window_step_x = 16 / sizeof(T); + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x(); + + const UniformQuantizationInfo output_qua_info = out->info()->quantization_info().uniform(); + const UniformQuantizationInfo tmp_qua_info = {output_qua_info.scale / scale, output_qua_info.offset}; + + if (is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? 
src2 : src1; + const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform(); + const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator dst(out, win); + + using ExactTagType = typename wrapper::traits::neon_vector<T, window_step_x>::tag_type; + + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast<const T *>(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast<T *>(dst.ptr()); + + const auto broadcast_value = *reinterpret_cast<const T *>(broadcast_input.ptr()); + const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{}); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x); + + // Dequantize inputs + const float32x4x4_t in1_f32x4x4 = vdequantize(non_broadcast_v, non_broadcast_qinfo); + const float32x4x4_t in2_f32x4x4 = vdequantize(broadcast_value_vec, broadcast_qinfo); + + const float32x4x4_t out_f32x4x4 = { + vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]), + vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]), + vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]), + vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]), + }; + + // Quantize dst + const auto result = vquantize<T>(out_f32x4x4, tmp_qua_info); + wrapper::vstore(output_ptr + x, result); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + // Dequantize inputs + const T src1 = *(non_broadcast_input_ptr + x); + const float tmp_in1 = Qasymm8QuantizationHelper<T>::dequantize(src1, non_broadcast_qinfo); + const float tmp_in2 = Qasymm8QuantizationHelper<T>::dequantize(broadcast_value, broadcast_qinfo); + const float tmp_f = tmp_in1 * tmp_in2; + + // Quantize dst + const auto tmp_qua = Qasymm8QuantizationHelper<T>::quantize(tmp_f, tmp_qua_info); + *(output_ptr + x) = tmp_qua; + } + }, + broadcast_input, non_broadcast_input, dst); + } + else + { + const UniformQuantizationInfo input1_qua_info = src1->info()->quantization_info().uniform(); + const UniformQuantizationInfo input2_qua_info = src2->info()->quantization_info().uniform(); + + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(src1, input1_win); + Iterator input2(src2, input2_win); + Iterator dst(out, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast<const T *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const T *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<T *>(dst.ptr()); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto input1_q = wrapper::vloadq(input1_ptr + x); + const auto input2_q = wrapper::vloadq(input2_ptr + x); + + // Dequantize inputs + const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info); + const float32x4x4_t in2_f32x4x4 = 
vdequantize(input2_q, input2_qua_info); + + const float32x4x4_t out_f32x4x4 = { + vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]), + vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]), + vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]), + vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]), + }; + + // Quantize dst + const auto result = vquantize<T>(out_f32x4x4, tmp_qua_info); + wrapper::vstore(output_ptr + x, result); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + // Dequantize inputs + const T src1 = *(input1_ptr + x); + const T src2 = *(input2_ptr + x); + const float tmp_in1 = Qasymm8QuantizationHelper<T>::dequantize(src1, input1_qua_info); + const float tmp_in2 = Qasymm8QuantizationHelper<T>::dequantize(src2, input2_qua_info); + const float tmp_f = tmp_in1 * tmp_in2; + + // Quantize dst + const auto tmp_qua = Qasymm8QuantizationHelper<T>::quantize(tmp_f, tmp_qua_info); + *(output_ptr + x) = tmp_qua; + } + }, + input1, input2, dst); + } +} + +bool mul_q8_neon_fixedpoint_possible(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + float scale) +{ + const auto iq0 = src0->quantization_info().uniform(); + const auto iq1 = src1->quantization_info().uniform(); + const auto oq = dst->quantization_info().uniform(); + + const auto multiplier = ((iq0.scale * iq1.scale) / oq.scale) * scale; + + if (multiplier < -8191.f || multiplier > 8191.f) + { + //The multiplier cannot be stored as a 14.18 signed fixed-point number + return false; + } + + const auto offset_out = float(oq.offset); + + const auto max_result = multiplier * (256) * (256) + offset_out; + + if (max_result > 8191.f) + { + //It might not be possible to store the result as a 14.18 signed fixed-point number. + return false; + } + + return true; +} + +template <typename ScalarType> +void mul_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *dst, const Window &window, float scale) +{ + const auto in0_info = src0->info(); + const auto in1_info = src1->info(); + + const auto &in0_shape = in0_info->tensor_shape(); + const auto &in1_shape = in1_info->tensor_shape(); + + // Create input windows. + Window in0_win = window.broadcast_if_dimension_le_one(in0_shape); + Window in1_win = window.broadcast_if_dimension_le_one(in1_shape); + + // Clear the x dimension on the execution window as we process the whole row each iteration. + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + constexpr int window_step_x = 16; + const auto window_start_x = window.x().start(); + const auto window_end_x = window.x().end(); + const auto is_broadcast_across_x = in0_shape.x() != in1_shape.x(); + + const auto iq0_info = in0_info->quantization_info().uniform(); + const auto iq1_info = in1_info->quantization_info().uniform(); + const auto oq_info = dst->info()->quantization_info().uniform(); + + const auto in0_offset = iq0_info.offset; + const auto in1_offset = iq1_info.offset; + const auto out_offset = oq_info.offset; + const auto multiplier = ((iq0_info.scale * iq1_info.scale) / oq_info.scale) * scale; + + constexpr int32_t two_pwr18i = 262144; + constexpr float two_pwr18f = 262144.f; + + const auto in0_offset_16p0 = static_cast<int16_t>(in0_offset); + const auto in1_offset_16p0 = static_cast<int16_t>(in1_offset); + const auto out_offset_14p18 = static_cast<int32_t>(out_offset * two_pwr18i); + const auto multiplier_14p18 = static_cast<int32_t>(multiplier * two_pwr18f); + + if (is_broadcast_across_x) + { + // Prefix: a = non-broadcast, b = broadcast. 
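+        // The broadcast operand has an x-step of 0, so it contributes a single scalar per row: that scalar is read
+        // once per row and duplicated across all vector lanes, while the non-broadcast operand is loaded normally.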
+ + const auto is_broadcast_input_1 = in1_win.x().step() == 0; + auto a_win = is_broadcast_input_1 ? in0_win : in1_win; + auto b_win = is_broadcast_input_1 ? in1_win : in0_win; + const auto a_tensor = is_broadcast_input_1 ? src0 : src1; + const auto b_tensor = is_broadcast_input_1 ? src1 : src0; + + const auto a_offset_16p0 = is_broadcast_input_1 ? in0_offset_16p0 : in1_offset_16p0; + const auto b_offset_16p0 = is_broadcast_input_1 ? in1_offset : in0_offset; +#ifndef __aarch64__ + const auto a_offset = is_broadcast_input_1 ? in0_offset : in1_offset; + const auto b_offset = is_broadcast_input_1 ? in1_offset : in0_offset; +#endif //__aarch64__ + const auto a_voffset_16p0 = wrapper::vdup_n(a_offset_16p0, wrapper::traits::vector_64_tag()); + + // Clear the x dimension on the execution window as we process the whole row each iteration. + a_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator a_input_it(a_tensor, a_win); + Iterator b_input_it(b_tensor, b_win); + Iterator out_it(dst, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto a_ptr = reinterpret_cast<const ScalarType *>(a_input_it.ptr()); + const auto b_ptr = reinterpret_cast<const ScalarType *>(b_input_it.ptr()); + const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr()); + + const auto b_val = *b_ptr; + const auto b_offseted_32p0 = static_cast<int32_t>(b_val - b_offset_16p0); + const auto b_voffseted_32p0 = wrapper::vdup_n(b_offseted_32p0, wrapper::traits::vector_128_tag()); + + const auto vmultiplier_14p18 = wrapper::vdup_n(multiplier_14p18, wrapper::traits::vector_128_tag()); + const auto voffsetout_14p18 = wrapper::vdup_n(out_offset_14p18, wrapper::traits::vector_128_tag()); + + int x = window_start_x; + + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + // Load the inputs. + const auto a_vin_8p0 = wrapper::vloadq(a_ptr + x); + + // Widen the non-broadcast elements to signed 16-bit regardless of the input signedness. + const auto a_vin_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(a_vin_8p0))); + const auto a_vin_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(a_vin_8p0))); + + const auto voffseted_32p0_00 = wrapper::vsubl(wrapper::vgetlow(a_vin_16p0_0), a_voffset_16p0); + const auto voffseted_32p0_01 = wrapper::vsubl(wrapper::vgethigh(a_vin_16p0_0), a_voffset_16p0); + const auto voffseted_32p0_10 = wrapper::vsubl(wrapper::vgetlow(a_vin_16p0_1), a_voffset_16p0); + const auto voffseted_32p0_11 = wrapper::vsubl(wrapper::vgethigh(a_vin_16p0_1), a_voffset_16p0); + + const auto vinnermul_32p0_00 = wrapper::vmul(voffseted_32p0_00, b_voffseted_32p0); + const auto vinnermul_32p0_01 = wrapper::vmul(voffseted_32p0_01, b_voffseted_32p0); + const auto vinnermul_32p0_10 = wrapper::vmul(voffseted_32p0_10, b_voffseted_32p0); + const auto vinnermul_32p0_11 = wrapper::vmul(voffseted_32p0_11, b_voffseted_32p0); + + const auto vout_14p18_00 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_00, vmultiplier_14p18); + const auto vout_14p18_01 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_01, vmultiplier_14p18); + const auto vout_14p18_10 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_10, vmultiplier_14p18); + const auto vout_14p18_11 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_11, vmultiplier_14p18); + + // These shift rights are to revert the multiplication by twopwr18. Hard limit of a maximum shift by 8 requires multiple shift instructions to achieve this. 
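+                    // The multiplier and the output offset are held as Q14.18 fixed point (18 fractional bits), e.g.
+                    // a multiplier of 0.25 is stored as 0.25 * 2^18 = 65536. The 18 fractional bits are removed in
+                    // three steps (>>8, narrowing >>8, then narrowing >>2), as each narrowing shift is capped at 8.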
+ const auto vout_15p1_00 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_00)); + const auto vout_15p1_01 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_01)); + const auto vout_15p1_10 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_10)); + const auto vout_15p1_11 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_11)); + + const auto vout_15p1_0 = wrapper::vcombine(vout_15p1_00, vout_15p1_01); + + const auto vout_15p1_1 = wrapper::vcombine(vout_15p1_10, vout_15p1_11); + const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr()); + + const auto vout_8p0 = + wrapper::vcombine(wrapper::vqrshrn<2>(vout_15p1_0), wrapper::vqrshrn<2>(vout_15p1_1)); + wrapper::vstore(out_ptr + x, vout_8p0); + } + + //Process the left-over elements. + for (; x < window_end_x; ++x) + { +#ifdef __aarch64__ + out_ptr[x] = wrapper::vqrshrn<2>(wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>( + (multiplier_14p18 * (int32_t(a_ptr[x]) - a_offset_16p0) * (int32_t(b_val) - b_offset_16p0)) + + out_offset_14p18))); +#else //__aarch64__ + out_ptr[x] = utility::clamp<int32_t, ScalarType>(support::cpp11::lround( + multiplier * ((float(a_ptr[x]) - a_offset) * (float(b_val) - b_offset)) + float(out_offset))); +#endif //__aarch64__ + } + }, + a_input_it, b_input_it, out_it); + } + else + { + const auto voffset0_16p0 = wrapper::vdup_n(in0_offset_16p0, wrapper::traits::vector_64_tag()); + const auto voffset1_16p0 = wrapper::vdup_n(in1_offset_16p0, wrapper::traits::vector_64_tag()); + const auto voffsetout_14p18 = wrapper::vdup_n(out_offset_14p18, wrapper::traits::vector_128_tag()); + const auto vmultiplier_14p18 = wrapper::vdup_n(multiplier_14p18, wrapper::traits::vector_128_tag()); + + // Clear the x dimension on the execution window as we process the whole row each iteration. + in0_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + in1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator in0_it(src0, in0_win); + Iterator in1_it(src1, in1_win); + Iterator out_it(dst, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto in0_ptr = reinterpret_cast<const ScalarType *>(in0_it.ptr()); + const auto in1_ptr = reinterpret_cast<const ScalarType *>(in1_it.ptr()); + const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr()); + + int x = window_start_x; + + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + // Load the inputs. + const auto vin0_8p0 = wrapper::vloadq(in0_ptr + x); + const auto vin1_8p0 = wrapper::vloadq(in1_ptr + x); + + // Widen the input elements to signed 16-bit regardless of the input signedness. 
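+                    // Working in signed 16-bit lets the same subtract-offset / multiply sequence serve both the
+                    // QASYMM8 (uint8) and QASYMM8_SIGNED (int8) instantiations of this template, since both value
+                    // ranges fit in int16 once widened.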
+ const auto vin0_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin0_8p0))); + const auto vin0_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin0_8p0))); + const auto vin1_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin1_8p0))); + const auto vin1_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin1_8p0))); + + const auto voffseted0_32p0_00 = wrapper::vsubl(wrapper::vgetlow(vin0_16p0_0), voffset0_16p0); + const auto voffseted0_32p0_01 = wrapper::vsubl(wrapper::vgethigh(vin0_16p0_0), voffset0_16p0); + const auto voffseted0_32p0_10 = wrapper::vsubl(wrapper::vgetlow(vin0_16p0_1), voffset0_16p0); + const auto voffseted0_32p0_11 = wrapper::vsubl(wrapper::vgethigh(vin0_16p0_1), voffset0_16p0); + + const auto voffseted1_32p0_00 = wrapper::vsubl(wrapper::vgetlow(vin1_16p0_0), voffset1_16p0); + const auto voffseted1_32p0_01 = wrapper::vsubl(wrapper::vgethigh(vin1_16p0_0), voffset1_16p0); + const auto voffseted1_32p0_10 = wrapper::vsubl(wrapper::vgetlow(vin1_16p0_1), voffset1_16p0); + const auto voffseted1_32p0_11 = wrapper::vsubl(wrapper::vgethigh(vin1_16p0_1), voffset1_16p0); + + const auto vinnermul_32p0_00 = wrapper::vmul(voffseted0_32p0_00, voffseted1_32p0_00); + const auto vinnermul_32p0_01 = wrapper::vmul(voffseted0_32p0_01, voffseted1_32p0_01); + const auto vinnermul_32p0_10 = wrapper::vmul(voffseted0_32p0_10, voffseted1_32p0_10); + const auto vinnermul_32p0_11 = wrapper::vmul(voffseted0_32p0_11, voffseted1_32p0_11); + + const auto vout_14p18_00 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_00, vmultiplier_14p18); + const auto vout_14p18_01 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_01, vmultiplier_14p18); + const auto vout_14p18_10 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_10, vmultiplier_14p18); + const auto vout_14p18_11 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_11, vmultiplier_14p18); + + // These shift rights are to revert the multiplication by twopwr18. Hard limit of a maximum shift by 8 requires multiple shift instructions to achieve this. + const auto vout_14p2_00 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_00)); + const auto vout_14p2_01 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_01)); + const auto vout_14p2_10 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_10)); + const auto vout_14p2_11 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_11)); + + const auto vout_14p2_0 = wrapper::vcombine(vout_14p2_00, vout_14p2_01); + + const auto vout_14p2_1 = wrapper::vcombine(vout_14p2_10, vout_14p2_11); + + const auto vout_8p0 = + wrapper::vcombine(wrapper::vqrshrn<2>(vout_14p2_0), wrapper::vqrshrn<2>(vout_14p2_1)); + wrapper::vstore(out_ptr + x, vout_8p0); + } + + //Process the left-over elements. 
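+                // The scalar tail mirrors the vector path: on AArch64 it applies the same saturating fixed-point
+                // narrowing to a single value, while other targets fall back to float math with lround and a clamp.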
+ for (; x < window_end_x; ++x) + { +#ifdef __aarch64__ + out_ptr[x] = wrapper::vqrshrn<2>(wrapper::vqrshrn_ex<8, ScalarType>( + wrapper::vshrq_n<8>((multiplier_14p18 * (int32_t(in0_ptr[x]) - in0_offset_16p0) * + (int32_t(in1_ptr[x]) - in1_offset_16p0)) + + out_offset_14p18))); +#else //__aarch64__ + out_ptr[x] = utility::clamp<int32_t, ScalarType>(support::cpp11::lround( + multiplier * ((float(in0_ptr[x]) - in0_offset) * (float(in1_ptr[x]) - in1_offset)) + + float(out_offset))); +#endif //__aarch64__ + } + }, + in0_it, in1_it, out_it); + } +} + +void mul_saturate_QSYMM16_QSYMM16_QSYMM16( + const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale) +{ + const UniformQuantizationInfo input1_qua_info = src1->info()->quantization_info().uniform(); + const UniformQuantizationInfo input2_qua_info = src2->info()->quantization_info().uniform(); + const UniformQuantizationInfo output_qua_info = out->info()->quantization_info().uniform(); + + // Create input windows + Window win = window; + Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(src1, input1_win); + Iterator input2(src2, input2_win); + Iterator dst(out, win); + + const int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + const UniformQuantizationInfo tmp_qua_info = {output_qua_info.scale / scale, output_qua_info.offset}; + + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast<const qsymm16_t *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const qsymm16_t *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<qsymm16_t *>(dst.ptr()); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const qsymm16x8x2_t input1_q = {{ + vld1q_s16(input1_ptr + x), + vld1q_s16(input1_ptr + x + 8), + }}; + const qsymm16x8x2_t input2_q = {{ + vld1q_s16(input2_ptr + x), + vld1q_s16(input2_ptr + x + 8), + }}; + + // Dequantize inputs + const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info); + const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info); + + const float32x4x4_t out_f32x4x4 = { + vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]), + vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]), + vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]), + vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]), + }; + + const qsymm16x8x2_t result = vquantize_qsymm16(out_f32x4x4, tmp_qua_info); + vst1q_s16(output_ptr + x, result.val[0]); + vst1q_s16(output_ptr + x + 8, result.val[1]); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + // Dequantize inputs + float tmp_in1 = static_cast<float>(*(input1_ptr + x)) * input1_qua_info.scale; + float tmp_in2 = static_cast<float>(*(input2_ptr + x)) * input2_qua_info.scale; + float tmp_f = tmp_in1 * tmp_in2; + + // Quantize dst, lrintf() has same rounding mode as vcombine_s16 + int32_t tmp = lrintf(tmp_f / tmp_qua_info.scale); + qsymm16_t tmp_qua = + static_cast<qsymm16_t>(tmp > SHRT_MAX) ? 
SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp); + *(output_ptr + x) = tmp_qua; + } + }, + input1, input2, dst); +} + +void mul_QSYMM16_QSYMM16_S32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int scale) +{ + ARM_COMPUTE_UNUSED(scale); + + // Create input windows + Window win = window; + Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(src1, input1_win); + Iterator input2(src2, input2_win); + Iterator dst(out, win); + + const int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast<const qsymm16_t *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const qsymm16_t *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr()); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const qsymm16x8x2_t input1_q = {{ + vld1q_s16(input1_ptr + x), + vld1q_s16(input1_ptr + x + 8), + }}; + const qsymm16x8x2_t input2_q = {{ + vld1q_s16(input2_ptr + x), + vld1q_s16(input2_ptr + x + 8), + }}; + + const int32x4x4_t in1_s32 = {{ + vmovl_s16(vget_low_s16(input1_q.val[0])), + vmovl_s16(vget_high_s16(input1_q.val[0])), + vmovl_s16(vget_low_s16(input1_q.val[1])), + vmovl_s16(vget_high_s16(input1_q.val[1])), + }}; + const int32x4x4_t in2_s32 = {{ + vmovl_s16(vget_low_s16(input2_q.val[0])), + vmovl_s16(vget_high_s16(input2_q.val[0])), + vmovl_s16(vget_low_s16(input2_q.val[1])), + vmovl_s16(vget_high_s16(input2_q.val[1])), + }}; + + const int32x4x4_t result = {{ + vmulq_s32(in1_s32.val[0], in2_s32.val[0]), + vmulq_s32(in1_s32.val[1], in2_s32.val[1]), + vmulq_s32(in1_s32.val[2], in2_s32.val[2]), + vmulq_s32(in1_s32.val[3], in2_s32.val[3]), + }}; + + vst1q_s32(output_ptr + x, result.val[0]); + vst1q_s32(output_ptr + x + 4, result.val[1]); + vst1q_s32(output_ptr + x + 8, result.val[2]); + vst1q_s32(output_ptr + x + 12, result.val[3]); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x)); + *(output_ptr + x) = tmp; + } + }, + input1, input2, dst); +} + +template <bool is_scale255, bool is_sat> +void mul_U8_U8_U8(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n) +{ + // Create input windows + Window win = window; + Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(src1, input1_win); + Iterator input2(src2, input2_win); + Iterator dst(out, win); + + const int window_step_x = 16 / sizeof(uint8_t); + const auto window_start_x = 
static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<uint8_t *>(dst.ptr()); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t ta1 = wrapper::vloadq(input1_ptr + x); + const uint8x16_t ta2 = wrapper::vloadq(input2_ptr + x); + + uint16x8_t tmp1_high = vmovl_u8(vget_high_u8(ta1)); + const uint16x8_t tmp2_high = vmovl_u8(vget_high_u8(ta2)); + uint16x8_t tmp1_low = vmovl_u8(vget_low_u8(ta1)); + const uint16x8_t tmp2_low = vmovl_u8(vget_low_u8(ta2)); + + tmp1_high = vmulq_u16(tmp1_high, tmp2_high); + tmp1_low = vmulq_u16(tmp1_low, tmp2_low); + + if (is_scale255) + { + tmp1_high = scale255_U16_U16(tmp1_high); + tmp1_low = scale255_U16_U16(tmp1_low); + } + else + { + const int16x8_t vn = vdupq_n_s16(-n); + + if (is_sat) + { + tmp1_high = vqshlq_u16(tmp1_high, vn); + tmp1_low = vqshlq_u16(tmp1_low, vn); + } + else + { + tmp1_high = vshlq_u16(tmp1_high, vn); + tmp1_low = vshlq_u16(tmp1_low, vn); + } + } + if (is_sat) + { + vst1q_u8(output_ptr + x, vcombine_u8(vqmovn_u16(tmp1_low), vqmovn_u16(tmp1_high))); + } + else + { + vst1q_u8(output_ptr + x, vcombine_u8(vmovn_u16(tmp1_low), vmovn_u16(tmp1_high))); + } + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + uint16_t tmp = static_cast<uint16_t>(*(input1_ptr + x)) * static_cast<uint16_t>(*(input2_ptr + x)); + + if (is_scale255) + { + float tmp_f = static_cast<float>(tmp) * scale255_constant; + tmp = static_cast<uint16_t>(tmp_f + 0.5f); + } + else + { + tmp >>= n; + } + if (is_sat && tmp > 255) + { + tmp = 255; + } + *(output_ptr + x) = static_cast<uint8_t>(tmp); + } + }, + input1, input2, dst); +} + +template <bool is_scale255, bool is_sat> +inline int16x8_t mul_S16_S16_S16_n_loop(const int16x8_t &src1, const int16x8_t &src2, int n) +{ + int32x4_t tmp1_high = vmovl_s16(vget_high_s16(src1)); + const int32x4_t tmp2_high = vmovl_s16(vget_high_s16(src2)); + int32x4_t tmp1_low = vmovl_s16(vget_low_s16(src1)); + const int32x4_t tmp2_low = vmovl_s16(vget_low_s16(src2)); + + tmp1_high = vmulq_s32(tmp1_high, tmp2_high); + tmp1_low = vmulq_s32(tmp1_low, tmp2_low); + + if (is_scale255) + { + tmp1_high = scale255_S32_S32(tmp1_high); + tmp1_low = scale255_S32_S32(tmp1_low); + } + else + { + // Right shift amount + const int32x4_t vn = vdupq_n_s32(-n); + // Left shift amount + const int32x4_t vnl = vdupq_n_s32(n); + // Calculate conversion bit + const uint32x4_t tmp1_high_u = vreinterpretq_u32_s32(tmp1_high); + const uint32x4_t tmp1_low_u = vreinterpretq_u32_s32(tmp1_low); + const uint32x4_t sign_high = vshrq_n_u32(tmp1_high_u, 31); + const uint32x4_t sign_low = vshrq_n_u32(tmp1_low_u, 31); + const int32x4_t sign_high_s = vreinterpretq_s32_u32(sign_high); + const int32x4_t sign_low_s = vreinterpretq_s32_u32(sign_low); + const int32x4_t convert_high = vsubq_s32(vshlq_s32(sign_high_s, vnl), sign_high_s); + const int32x4_t convert_low = vsubq_s32(vshlq_s32(sign_low_s, vnl), sign_low_s); + if (is_sat) + { + tmp1_high = vqshlq_s32(vaddq_s32(tmp1_high, convert_high), vn); + tmp1_low = vqshlq_s32(vaddq_s32(tmp1_low, convert_low), vn); + } + else + { + tmp1_high = vshlq_s32(vaddq_s32(tmp1_high, convert_high), vn); + tmp1_low = 
vshlq_s32(vaddq_s32(tmp1_low, convert_low), vn); + } + } + + if (is_sat) + { + return vcombine_s16(vqmovn_s32(tmp1_low), vqmovn_s32(tmp1_high)); + } + else + { + return vcombine_s16(vmovn_s32(tmp1_low), vmovn_s32(tmp1_high)); + } +} + +template <bool is_scale255, bool is_sat> +inline int16x8x2_t mul_S16_S16_S16_n_k(const int16x8x2_t &src1, const int16x8x2_t &src2, int n) +{ + const int16x8x2_t result = {{// First 8 elements + mul_S16_S16_S16_n_loop<is_scale255, is_sat>(src1.val[0], src2.val[0], n), + // Second 8 elements + mul_S16_S16_S16_n_loop<is_scale255, is_sat>(src1.val[1], src2.val[1], n)}}; + + return result; +} + +template <bool is_scale255, bool is_sat> +void mul_S16_S16_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n) +{ + // Create input windows + Window win = window; + Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(src1, input1_win); + Iterator input2(src2, input2_win); + Iterator dst(out, win); + + const int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr()); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int16x8x2_t ta1 = {{ + vld1q_s16(input1_ptr + x), + vld1q_s16(input1_ptr + x + 8), + }}; + const int16x8x2_t ta2 = {{ + vld1q_s16(input2_ptr + x), + vld1q_s16(input2_ptr + x + 8), + }}; + const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n); + + vst1q_s16(output_ptr + x, result.val[0]); + vst1q_s16(output_ptr + x + 8, result.val[1]); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x)); + + if (is_scale255) + { + float tmp_f = static_cast<float>(tmp) * scale255_constant; + + tmp = static_cast<int32_t>(tmp_f + 0.5f); + } + else + { + if (tmp >= 0) + { + tmp >>= n; + } + else + { + uint32_t mask = (1u << n) - 1; + tmp = (tmp + static_cast<int32_t>(mask)) >> n; + } + } + if (is_sat) + { + tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? 
SHRT_MIN : tmp); + } + *(output_ptr + x) = static_cast<int16_t>(tmp); + } + }, + input1, input2, dst); +} + +template <bool is_sat> +inline int32x4_t mul_S32_S32_S32_n_loop(const int32x4_t &src1, const int32x4_t &src2, int n) +{ + const int32x2_t input1_1 = vget_low_s32(src1); + const int32x2_t input2_1 = vget_low_s32(src2); + const int32x2_t input1_2 = vget_high_s32(src1); + const int32x2_t input2_2 = vget_high_s32(src2); + + int64x2_t tmp_1 = vmull_s32(input1_1, input2_1); + int64x2_t tmp_2 = vmull_s32(input1_2, input2_2); + + // Apply scaling, conversion and rounding (round to zero) + // Right shift amount + const int64x2_t vn = vdupq_n_s64(-n); + // Left shift amount + const int64x2_t vnl = vdupq_n_s64(n); + // Calculate conversion bit + const uint64x2_t tmp_1_u = vreinterpretq_u64_s64(tmp_1); + const uint64x2_t sign_1 = vshrq_n_u64(tmp_1_u, 63); + const int64x2_t sign_1_s = vreinterpretq_s64_u64(sign_1); + const int64x2_t convert_1 = vsubq_s64(vshlq_s64(sign_1_s, vnl), sign_1_s); + + const uint64x2_t tmp_2_u = vreinterpretq_u64_s64(tmp_2); + const uint64x2_t sign_2 = vshrq_n_u64(tmp_2_u, 63); + const int64x2_t sign_2_s = vreinterpretq_s64_u64(sign_2); + const int64x2_t convert_2 = vsubq_s64(vshlq_s64(sign_2_s, vnl), sign_2_s); + if (is_sat) + { + tmp_1 = vqshlq_s64(vaddq_s64(tmp_1, convert_1), vn); + tmp_2 = vqshlq_s64(vaddq_s64(tmp_2, convert_2), vn); + return vcombine_s32(vqmovn_s64(tmp_1), vqmovn_s64(tmp_2)); + } + else + { + tmp_1 = vshlq_s64(vaddq_s64(tmp_1, convert_1), vn); + tmp_2 = vshlq_s64(vaddq_s64(tmp_2, convert_2), vn); + return vcombine_s32(vmovn_s64(tmp_1), vmovn_s64(tmp_2)); + } +} + +template <bool is_sat> +inline int32x4x2_t mul_S32_S32_S32_n_k(const int32x4x2_t &src1, const int32x4x2_t &src2, int n) +{ + const int32x4x2_t result = {{// First 4 elements + mul_S32_S32_S32_n_loop<is_sat>(src1.val[0], src2.val[0], n), + // Second 4 elements + mul_S32_S32_S32_n_loop<is_sat>(src1.val[1], src2.val[1], n)}}; + + return result; +} + +template <bool is_sat> +void mul_S32_S32_S32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n) +{ + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const int window_step_x = 8; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x(); + + if (is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? 
src2 : src1; + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator dst(out, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast<const int32_t *>(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr()); + + const int32_t broadcast_value = *reinterpret_cast<const int32_t *>(broadcast_input.ptr()); + const auto broadcast_value_vec = vdupq_n_s32(broadcast_value); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int32x4x2_t broadcast_v = {{ + broadcast_value_vec, + broadcast_value_vec, + }}; + const int32x4x2_t non_broadcast_v = {{ + vld1q_s32(non_broadcast_input_ptr + x), + vld1q_s32(non_broadcast_input_ptr + x + 4), + }}; + const int32x4x2_t result = mul_S32_S32_S32_n_k<is_sat>(broadcast_v, non_broadcast_v, n); + + vst1q_s32(output_ptr + x, result.val[0]); + vst1q_s32(output_ptr + x + 4, result.val[1]); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + int64_t tmp = + static_cast<int64_t>(broadcast_value) * static_cast<int64_t>(*(non_broadcast_input_ptr + x)); + + if (tmp >= 0) + { + tmp >>= n; + } + else + { + uint64_t mask = ((uint64_t)1u << n) - 1; + tmp = (tmp + static_cast<int64_t>(mask)) >> n; + } + if (is_sat) + { + tmp = utility::clamp<int64_t, int32_t>(tmp); + } + *(output_ptr + x) = static_cast<int32_t>(tmp); + } + }, + broadcast_input, non_broadcast_input, dst); + } + else + { + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(src1, input1_win); + Iterator input2(src2, input2_win); + Iterator dst(out, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast<const int32_t *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const int32_t *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr()); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int32x4x2_t ta1 = {{ + vld1q_s32(input1_ptr + x), + vld1q_s32(input1_ptr + x + 4), + }}; + const int32x4x2_t ta2 = {{ + vld1q_s32(input2_ptr + x), + vld1q_s32(input2_ptr + x + 4), + }}; + const int32x4x2_t result = mul_S32_S32_S32_n_k<is_sat>(ta1, ta2, n); + + vst1q_s32(output_ptr + x, result.val[0]); + vst1q_s32(output_ptr + x + 4, result.val[1]); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + int64_t tmp = static_cast<int64_t>(*(input1_ptr + x)) * static_cast<int64_t>(*(input2_ptr + x)); + + if (tmp >= 0) + { + tmp >>= n; + } + else + { + uint64_t mask = ((uint64_t)1u << n) - 1; + tmp = (tmp + static_cast<int64_t>(mask)) >> n; + } + if (is_sat) + { + tmp = utility::clamp<int64_t, int32_t>(tmp); + } + *(output_ptr + x) = static_cast<int32_t>(tmp); + } + }, + input1, input2, dst); + } +} + +void c_mul_F32_F32_F32_n(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window) +{ + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + 
Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + constexpr int window_step_x = 8 / sizeof(float); + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x(); + + using ExactTagType = typename wrapper::traits::neon_vector<float, 2>::tag_type; + + if (is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1; + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator dst(out, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast<const float *>(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast<float *>(dst.ptr()); + + const float broadcast_value = *reinterpret_cast<const float *>(broadcast_input.ptr()); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto a = wrapper::vloadq(non_broadcast_input_ptr + 2 * x); + float32x4_t b = vdupq_n_f32(broadcast_value); + + const float32x4_t mask = {-1.0f, 1.0f, -1.0f, 1.0f}; + const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{}); + const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{}); + const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{}); + const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{}); + + const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10); + const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11); + + float32x4_t res = wrapper::vmul(tmp0, b); + b = wrapper::vmul(b, mask); + + res = wrapper::vmla(res, tmp1, b); + wrapper::vstore(output_ptr + 2 * x, res); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto non_broadcast_value0 = *(non_broadcast_input_ptr + 2 * x); + const auto non_broadcast_value1 = *(non_broadcast_input_ptr + 2 * x + 1); + auto res1 = broadcast_value * (non_broadcast_value0 - non_broadcast_value1); + auto res2 = broadcast_value * (non_broadcast_value1 + non_broadcast_value0); + *(output_ptr + 2 * x) = res1; + *(output_ptr + 2 * x + 1) = res2; + } + }, + broadcast_input, non_broadcast_input, dst); + } + else + { + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(src1, input1_win); + Iterator input2(src2, input2_win); + Iterator dst(out, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast<const float *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const 
float *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<float *>(dst.ptr()); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const float32x4_t a = wrapper::vloadq(input1_ptr + 2 * x); + float32x4_t b = wrapper::vloadq(input2_ptr + 2 * x); + + const float32x4_t mask = {-1.0f, 1.0f, -1.0f, 1.0f}; + const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{}); + const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{}); + const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{}); + const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{}); + + const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10); + const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11); + + float32x4_t res = wrapper::vmul(tmp0, b); + + b = wrapper::vrev64(b); + b = wrapper::vmul(b, mask); + + res = wrapper::vmla(res, tmp1, b); + wrapper::vstore(output_ptr + 2 * x, res); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto a0 = *(input1_ptr + 2 * x); + const auto a1 = *(input1_ptr + 2 * x + 1); + const auto b0 = *(input2_ptr + 2 * x); + const auto b1 = *(input2_ptr + 2 * x + 1); + auto res1 = a0 * b0 - a1 * b1; + auto res2 = a0 * b1 + a1 * b0; + *(output_ptr + 2 * x) = res1; + *(output_ptr + 2 * x + 1) = res2; + } + }, + input1, input2, dst); + } +} + +template <bool is_scale255, bool is_sat> +void mul_U8_U8_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n) +{ + // Create input windows + Window win = window; + Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(src1, input1_win); + Iterator input2(src2, input2_win); + Iterator dst(out, win); + + const int window_step_x = 16 / sizeof(uint8_t); + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr()); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t bv = wrapper::vloadq(input2_ptr + x); + const uint8x16_t av = wrapper::vloadq(input1_ptr + x); + + uint16x8_t tmp_low = vmovl_u8(vget_low_u8(av)); + uint16x8_t tmp_high = vmovl_u8(vget_high_u8(av)); + tmp_low = vmulq_u16(tmp_low, vmovl_u8(vget_low_u8(bv))); + tmp_high = vmulq_u16(tmp_high, vmovl_u8(vget_high_u8(bv))); + + if (is_scale255) + { + tmp_low = scale255_U16_U16(tmp_low); + tmp_high = scale255_U16_U16(tmp_high); + } + else + { + const int16x8_t vn = vdupq_n_s16(-n); + + if (is_sat) + { + tmp_low = vqshlq_u16(tmp_low, vn); + tmp_high = vqshlq_u16(tmp_high, vn); + } + else + { + tmp_low = vshlq_u16(tmp_low, vn); + tmp_high = vshlq_u16(tmp_high, vn); + } + } + + if (is_sat) + { + static const uint16x8_t 
max = vdupq_n_u16(SHRT_MAX); + + tmp_low = vminq_u16(tmp_low, max); + tmp_high = vminq_u16(tmp_high, max); + } + + vst1q_s16(output_ptr + x, vreinterpretq_s16_u16(tmp_low)); + vst1q_s16(output_ptr + x + 8, vreinterpretq_s16_u16(tmp_high)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x)); + + if (is_scale255) + { + float tmp_f = static_cast<float>(tmp) * scale255_constant; + tmp = static_cast<int32_t>(tmp_f + 0.5f); + } + else + { + tmp >>= n; + } + + if (is_sat) + { + tmp = (tmp > SHRT_MAX) ? SHRT_MAX : tmp; + } + + *(output_ptr + x) = static_cast<int16_t>(tmp); + } + }, + input1, input2, dst); +} + +template <bool is_scale255, bool is_sat> +void mul_S16_U8_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n) +{ + // Create input windows + Window win = window; + Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(src1, input1_win); + Iterator input2(src2, input2_win); + Iterator dst(out, win); + + const int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr()); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int16x8x2_t ta1 = {{ + vld1q_s16(input1_ptr + x), + vld1q_s16(input1_ptr + x + 8), + }}; + const uint8x8x2_t ta2u = {{ + vld1_u8(input2_ptr + x), + vld1_u8(input2_ptr + x + 8), + }}; + const int16x8x2_t ta2 = { + {vreinterpretq_s16_u16(vmovl_u8(ta2u.val[0])), vreinterpretq_s16_u16(vmovl_u8(ta2u.val[1]))}}; + + const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n); + + vst1q_s16(output_ptr + x, result.val[0]); + vst1q_s16(output_ptr + x + 8, result.val[1]); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x)); + + if (is_scale255) + { + float tmp_f = static_cast<float>(tmp) * scale255_constant; + + tmp = static_cast<int32_t>(tmp_f + 0.5f); + } + else + { + if (tmp >= 0) + { + tmp >>= n; + } + else + { + uint32_t mask = (1u << n) - 1; + tmp = (tmp + static_cast<int32_t>(mask)) >> n; + } + } + if (is_sat) + { + tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? 
SHRT_MIN : tmp); + } + *(output_ptr + x) = static_cast<int16_t>(tmp); + } + }, + input1, input2, dst); +} + +template <bool is_scale255, bool is_sat> +void mul_U8_S16_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n) +{ + // Simply swap the two input buffers + mul_S16_U8_S16<is_scale255, is_sat>(src2, src1, out, window, n); +} +} // namespace + +void CpuMulKernel::configure(ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy) +{ + ARM_COMPUTE_UNUSED(rounding_policy); + ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); + + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy)); + + const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape()); + + // Auto initialize dst if not initialized + set_shape_if_empty(*dst, out_shape); + + _scale = scale; + _scale_exponent = 0; + _func_quantized = nullptr; + _func_int = nullptr; + _func_float = nullptr; + + bool is_scale_255 = false; + // Check and validate scaling factor + if (std::abs(scale - scale255_constant) < 0.00001f) + { + is_scale_255 = true; + } + else + { + int exponent = 0; + + std::frexp(scale, &exponent); + + // Store the positive exponent. We know that we compute 1/2^n + // Additionally we need to subtract 1 to compensate that frexp used a mantissa of 0.5 + _scale_exponent = std::abs(exponent - 1); + } + + const DataType dt_input1 = src1->data_type(); + const DataType dt_input2 = src2->data_type(); + const DataType dt_output = dst->data_type(); + const bool is_sat = (overflow_policy == ConvertPolicy::SATURATE); + + switch (dt_input1) + { + case DataType::QASYMM8: + if (dt_input2 == DataType::QASYMM8 && dt_output == DataType::QASYMM8) + { + if (mul_q8_neon_fixedpoint_possible(src1, src2, dst, scale)) + { + _func_quantized = &mul_q8_neon_fixedpoint<uint8_t>; + } + else + { + _func_quantized = &mul_saturate_quantized_8<uint8_t>; + } + } + break; + case DataType::QASYMM8_SIGNED: + if (dt_input2 == DataType::QASYMM8_SIGNED) + { + if (mul_q8_neon_fixedpoint_possible(src1, src2, dst, scale)) + { + _func_quantized = &mul_q8_neon_fixedpoint<int8_t>; + } + else + { + _func_quantized = &mul_saturate_quantized_8<int8_t>; + } + } + break; + case DataType::QSYMM16: + if (dt_input2 == DataType::QSYMM16 && dt_output == DataType::QSYMM16) + { + _func_quantized = &mul_saturate_QSYMM16_QSYMM16_QSYMM16; + } + else if (dt_input2 == DataType::QSYMM16 && dt_output == DataType::S32) + { + _func_int = &mul_QSYMM16_QSYMM16_S32; + } + break; + case DataType::S16: + if (DataType::U8 == dt_input2 && DataType::S16 == dt_output) + { + if (is_scale_255) + { + _func_int = is_sat ? &mul_S16_U8_S16<true, true> : &mul_S16_U8_S16<true, false>; + } + else + { + _func_int = is_sat ? &mul_S16_U8_S16<false, true> : &mul_S16_U8_S16<false, false>; + } + } + if (DataType::S16 == dt_input2 && DataType::S16 == dt_output) + { + if (is_scale_255) + { + _func_int = is_sat ? &mul_S16_S16_S16<true, true> : &mul_S16_S16_S16<true, false>; + } + else + { + _func_int = is_sat ? &mul_S16_S16_S16<false, true> : &mul_S16_S16_S16<false, false>; + } + } + break; + case DataType::S32: + if (DataType::S32 == dt_input2 && DataType::S32 == dt_output) + { + _func_int = is_sat ? &mul_S32_S32_S32<true> : &mul_S32_S32_S32<false>; + } + break; + case DataType::U8: + if (DataType::U8 == dt_input2 && DataType::U8 == dt_output) + { + if (is_scale_255) + { + _func_int = is_sat ? 
&mul_U8_U8_U8<true, true> : &mul_U8_U8_U8<true, false>; + } + else + { + _func_int = is_sat ? &mul_U8_U8_U8<false, true> : &mul_U8_U8_U8<false, false>; + } + } + else if (DataType::U8 == dt_input2 && DataType::S16 == dt_output) + { + if (is_scale_255) + { + _func_int = is_sat ? &mul_U8_U8_S16<true, true> : &mul_U8_U8_S16<true, false>; + } + else + { + _func_int = is_sat ? &mul_U8_U8_S16<false, true> : &mul_U8_U8_S16<false, false>; + } + } + else if (DataType::S16 == dt_input2 && DataType::S16 == dt_output) + { + if (is_scale_255) + { + _func_int = is_sat ? &mul_U8_S16_S16<true, true> : &mul_U8_S16_S16<true, false>; + } + else + { + _func_int = is_sat ? &mul_U8_S16_S16<false, true> : &mul_U8_S16_S16<false, false>; + } + } + break; + case DataType::F16: + _func_float = REGISTER_FP16_NEON(cpu::mul_F16_F16_F16); + break; + case DataType::F32: + _func_float = REGISTER_FP32_NEON(cpu::mul_F32_F32_F32); + break; + default: + ARM_COMPUTE_ERROR("You called with the wrong img formats"); + } + + // Configure kernel window + Window win; + std::tie(win, _split_dimension) = calculate_squashed_or_max_window(*src1, *src2); + + ICpuKernel::configure(win); +} + +size_t CpuMulKernel::get_mws(const CPUInfo &platform, size_t thread_count) const +{ + ARM_COMPUTE_UNUSED(thread_count); + +#if defined(ENABLE_FP32_KERNELS) + if (this->_func_float == &mul_F32_F32_F32) + { + size_t mws = ICPPKernel::default_mws; + if (platform.get_cpu_model() == CPUModel::N1) + { + mws = default_mws_N1_fp32_neon; + } + else if (platform.get_cpu_model() == CPUModel::V1) + { + mws = default_mws_V1_fp32_neon; + } + else + { + if (_split_dimension == Window::DimX) + { + // Don't split the work load too small if the tensor has been reinterpreted as 1D. + // This number is loosely chosen as threading overhead in each platform varies wildly. + return default_mws_other_platforms_1d_tensor; + } + return default_mws; + } + + // tensor is 1D or was re-interpreted as 1D + if (this->window().shape().num_dimensions() == 1) + { + return mws; + } + else + { + // scale mws down by the number of elements along all the dimensions (x, z, w, etc) except the one + // that we parallelize along (the y dimension). This allows for parallelization when the Y_SIZE is small + // but the other sizes are large, which boosts performance. + mws = static_cast<size_t>(mws / (this->window().num_iterations_total() / this->window().num_iterations(1))); + return std::max(static_cast<size_t>(1), mws); + } + } +#else /* ENABLE_FP32_KERNELS */ + ARM_COMPUTE_UNUSED(platform); +#endif /* ENABLE_FP32_KERNELS */ + if (_split_dimension == Window::DimX) + { + // Don't split the work load too small if the tensor has been reinterpreted as 1D. + // This number is loosely chosen as threading overhead in each platform varies wildly. 
+ return default_mws_other_platforms_1d_tensor; + } + return default_mws; +} + +Status CpuMulKernel::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy)); + + return Status{}; +} + +void CpuMulKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + + auto src1 = tensors.get_const_tensor(TensorType::ACL_SRC_0); + auto src2 = tensors.get_const_tensor(TensorType::ACL_SRC_1); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + if (_func_quantized != nullptr) + { + (*_func_quantized)(src1, src2, dst, window, _scale); + } + else if (_func_int != nullptr) + { + (*_func_int)(src1, src2, dst, window, _scale_exponent); + } + else + { + ARM_COMPUTE_ERROR_ON(_func_float == nullptr); + (*_func_float)(src1, src2, dst, window, _scale); + } +} + +const char *CpuMulKernel::name() const +{ + return "CpuMulKernel"; +} + +namespace +{ +Status validate_arguments_complex(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 2, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 2, DataType::F32); + + const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape()); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); + + // Validate in case of configured dst + if (dst->total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 2, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), + "Wrong shape for dst"); + } + + return Status{}; +} +} // namespace + +void CpuComplexMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_complex(src1, src2, dst)); + + const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape()); + + // Auto initialize dst if not initialized + const TensorInfo out_info(out_shape, src1->num_channels(), src1->data_type()); + auto_init_if_empty(*dst, out_info); + + // Configure kernel window + Window win = calculate_max_window(out_shape); + + ICpuKernel::configure(win); +} + +Status CpuComplexMulKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_complex(src1, src2, dst)); + + return Status{}; +} + +void CpuComplexMulKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + + auto src1 = tensors.get_const_tensor(TensorType::ACL_SRC_0); + auto src2 = tensors.get_const_tensor(TensorType::ACL_SRC_1); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + c_mul_F32_F32_F32_n(src1, src2, dst, window); +} + +const char *CpuComplexMulKernel::name() const +{ + return 
"CpuComplexMulKernel"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuMulKernel.h b/src/cpu/kernels/CpuMulKernel.h new file mode 100644 index 0000000000..7eaf287507 --- /dev/null +++ b/src/cpu/kernels/CpuMulKernel.h @@ -0,0 +1,182 @@ +/* + * Copyright (c) 2016-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_MUL_KERNEL_H +#define ARM_COMPUTE_CPU_MUL_KERNEL_H + +#include "arm_compute/core/Rounding.h" + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Interface for the kernel to perform multiplication between two tensors */ +class CpuMulKernel : public ICpuKernel<CpuMulKernel> +{ +public: + CpuMulKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuMulKernel); + /** Initialise the kernel's input, dst and border mode. + * + * Valid configurations (Src1,Src2) -> Dst : + * + * Support: Broadcast? Scale=1/255? + * - (U8,U8) -> U8, S16 N Y + * - (U8,S16) -> S16 N Y + * - (S16,U8) -> S16 N Y + * - (S16,S16) -> S16 N Y + * - (S32,S32) -> S32 Y N + * - (F16,F16) -> F16 N Y + * - (F32,F32) -> F32 Y Y + * - (QASYMM8,QASYMM8) -> QASYMM8 Y Y + * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED Y Y + * - (QSYMM16,QSYMM16) -> QSYMM16, S32 N Y + * + * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported. + * For all other scale values only round to zero (implemented as round towards minus infinity) is supported. + * + * @param[in] src1 First input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32 + * @param[in] src2 Second input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32 + * @param[out] dst Dst tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32 + * @param[in] scale Scale to apply after multiplication. + * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. + * If both @p src1, @p src2 and @p dst are of datatype S32, scale cannot be 1/255 + * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if any of the inputs is of quantized datatype + * @param[in] rounding_policy Rounding policy. 
+ */ + void configure(ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuMulKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy); + + // Inherited methods overridden + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + + /** Return minimum workload size of the relevant kernel + * + * @param[in] platform The CPU platform used to create the context. + * @param[in] thread_count Number of threads in the execution. + * + * @return[out] mws Minimum workload size for requested configuration. + */ + size_t get_mws(const CPUInfo &platform, size_t thread_count) const override; + + /** Get the preferred dimension in which the scheduler splits the work into multiple jobs. + * + * @return The split dimension hint. + */ + size_t get_split_dimension_hint() const + { + return _split_dimension; + } + +private: + /** Common signature for all the specialised multiplication functions with integer scaling factor + * + * @param[in] src1 Src1 tensor object. + * @param[in] src2 Src2 tensor object. + * @param[out] dst Dst tensor object. + * @param[in] window Region on which to execute the kernel + * @param[in] scale Integer scale factor. + */ + using MulFunctionInt = + void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, int scale); + /** Common signature for all the specialised multiplication functions with float scaling factor + * + * @param[in] src1 Src1 tensor object. + * @param[in] src2 Src2 tensor object. + * @param[out] dst Dst tensor object. + * @param[in] window Region on which to execute the kernel + * @param[in] scale Float scale factor. + */ + using MulFunctionFloat = + void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, float scale); + /** Common signature for all the specialised QASYMM8 multiplication functions with float scaling factor + * + * @param[in] src1 Src1 tensor object. + * @param[in] src2 Src2 tensor object. + * @param[out] dst Dst tensor object. + * @param[in] window Region on which to execute the kernel + * @param[in] scale Float scale factor. + * + */ + using MulFunctionQuantized = + void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, float scale); + + MulFunctionFloat *_func_float{nullptr}; + MulFunctionInt *_func_int{nullptr}; + MulFunctionQuantized *_func_quantized{nullptr}; + float _scale{0}; + int _scale_exponent{0}; + size_t _split_dimension{Window::DimY}; +}; + +/** Interface for the complex pixelwise multiplication kernel. */ +class CpuComplexMulKernel : public ICpuKernel<CpuComplexMulKernel> +{ +public: + CpuComplexMulKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuComplexMulKernel); + /** Initialise the kernel's src, dst and border mode. + * + * @param[in] src1 An src tensor. Data types supported: F32. Number of channels supported: 2 (complex tensor). + * @param[in] src2 An src tensor. Data types supported: same as @p src1. Number of channels supported: same as @p src1. + * @param[out] dst The dst tensor, Data types supported: same as @p src1. Number of channels supported: same as @p src1. 
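+ *
+ * A possible configuration sketch (illustrative only; the 2-channel F32 shapes below are assumptions):
+ * @code
+ * // Complex tensors are represented as 2-channel F32 tensors (real, imaginary)
+ * TensorInfo a(TensorShape(8U, 8U), 2, DataType::F32);
+ * TensorInfo b(TensorShape(8U, 8U), 2, DataType::F32);
+ * TensorInfo out(TensorShape(8U, 8U), 2, DataType::F32);
+ *
+ * CpuComplexMulKernel cmul;
+ * cmul.configure(&a, &b, &out);
+ * @endcode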
+ */ + void configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuComplexMulKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_MUL_KERNEL_H */ diff --git a/src/cpu/kernels/CpuPermuteKernel.cpp b/src/cpu/kernels/CpuPermuteKernel.cpp new file mode 100644 index 0000000000..b444a25ff7 --- /dev/null +++ b/src/cpu/kernels/CpuPermuteKernel.cpp @@ -0,0 +1,288 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/cpu/kernels/CpuPermuteKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" + +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +namespace +{ +#include "src/core/NEON/kernels/convolution/common/shims.hpp" +} // namespace + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +inline bool is_permutation_supported(const PermutationVector &v) +{ + static const std::array<PermutationVector, 2> permutations2 = {{ + PermutationVector(0U, 1U), + PermutationVector(1U, 0U), + }}; + static const std::array<PermutationVector, 6> permutations3 = {{ + PermutationVector(2U, 0U, 1U), + PermutationVector(1U, 2U, 0U), + PermutationVector(0U, 1U, 2U), + PermutationVector(0U, 2U, 1U), + PermutationVector(1U, 0U, 2U), + PermutationVector(2U, 1U, 0U), + }}; + static const std::array<PermutationVector, 24> permutations4 = { + {PermutationVector(0U, 1U, 2U, 3U), PermutationVector(1U, 0U, 2U, 3U), PermutationVector(2U, 0U, 1U, 3U), + PermutationVector(0U, 2U, 1U, 3U), PermutationVector(1U, 2U, 0U, 3U), PermutationVector(2U, 1U, 0U, 3U), + PermutationVector(2U, 1U, 3U, 0U), PermutationVector(1U, 2U, 3U, 0U), PermutationVector(3U, 2U, 1U, 0U), + PermutationVector(2U, 3U, 1U, 0U), PermutationVector(1U, 3U, 2U, 0U), PermutationVector(3U, 1U, 2U, 0U), + PermutationVector(3U, 0U, 2U, 1U), PermutationVector(0U, 3U, 2U, 1U), PermutationVector(2U, 3U, 0U, 1U), + PermutationVector(3U, 2U, 0U, 1U), PermutationVector(0U, 2U, 3U, 1U), PermutationVector(2U, 0U, 3U, 1U), + PermutationVector(1U, 0U, 3U, 2U), PermutationVector(0U, 1U, 3U, 2U), PermutationVector(3U, 1U, 0U, 2U), + PermutationVector(1U, 3U, 0U, 2U), PermutationVector(0U, 3U, 1U, 2U), PermutationVector(3U, 0U, 1U, 2U)}}; + + return (permutations2.end() != std::find(permutations2.begin(), permutations2.end(), v)) || + (permutations3.end() != std::find(permutations3.begin(), permutations3.end(), v)) || + (permutations4.end() != std::find(permutations4.begin(), permutations4.end(), v)); +} + +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm) +{ + ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!is_permutation_supported(perm), "PermutationVector not supported."); + + const TensorShape dst_shape = misc::shape_calculator::compute_permutation_output_shape(*src, perm); + + // Validate configured destination + if (dst->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + } + + return Status{}; +} + +template <typename T> +void run_permute(const Window &window, const ITensor *src, const ITensor *dst, const PermutationVector &perm) +{ + const DataLayout src_layout = src->info()->data_layout(); + + // Source window + Window window_src = window; + + // we only support these two configs in src/core/NEON/kernels/convolution/common/shims.hpp, for all others + // we have to fall back to C++ + if ((src_layout == DataLayout::NCHW && perm == PermutationVector{2U, 0U, 1U}) || + (src_layout == DataLayout::NHWC && perm == PermutationVector{1U, 2U, 0U})) + 
{ + window_src.set(Window::DimX, + Window::Dimension(window.x().start(), window.x().end(), window.x().end() - window.x().start())); + window_src.set(Window::DimY, + Window::Dimension(window.y().start(), window.y().end(), window.y().end() - window.y().start())); + window_src.set(Window::DimZ, + Window::Dimension(window.z().start(), window.z().end(), window.z().end() - window.z().start())); + window_src.set(3, Window::Dimension(window[3].start(), window[3].end(), window[3].end() - window[3].start())); + } + + // Destination window + Window window_dst(window); + const Window::Dimension zero_window = Window::Dimension(0, 0, 0); + for (size_t d = 0; d <= dst->info()->num_dimensions(); ++d) + { + window_dst.set(d, zero_window); + } + + // Create iterators + Iterator src_it(src, window_src); + Iterator dst_it(dst, window_dst); + + int in_row_stride = 0; + int in_col_stride = 0; + int in_channel_stride = 0; + int in_batch_stride = 0; + int n_cols = 0; + int n_rows = 0; + int n_channels = 0; + int n_batches = 0; + + switch (src_layout) + { + case DataLayout::NCHW: + { + in_row_stride = src->info()->strides_in_bytes().y() / sizeof(T); + in_channel_stride = src->info()->strides_in_bytes().z() / sizeof(T); + in_batch_stride = src->info()->strides_in_bytes()[3] / sizeof(T); + n_cols = src->info()->tensor_shape().x(); + n_rows = window_src.y().step(); + n_channels = src->info()->tensor_shape().z(); + n_batches = src->info()->tensor_shape()[3]; + break; + } + case DataLayout::NHWC: + { + in_col_stride = src->info()->strides_in_bytes().y() / sizeof(T); + in_row_stride = src->info()->strides_in_bytes().z() / sizeof(T); + in_batch_stride = src->info()->strides_in_bytes()[3] / sizeof(T); + n_channels = src->info()->tensor_shape().x(); + n_cols = window_src.y().step(); + n_rows = src->info()->tensor_shape().z(); + n_batches = src->info()->tensor_shape()[3]; + break; + } + default: + { + ARM_COMPUTE_ERROR("Invalid source data layout."); + break; + } + } + + // CHW -> HWC + if (src_layout == DataLayout::NCHW && perm == PermutationVector{2U, 0U, 1U}) + { + const int out_channel_stride = dst->info()->strides_in_bytes().x() / sizeof(T); + const int out_col_stride = dst->info()->strides_in_bytes().y() / sizeof(T); + const int out_row_stride = dst->info()->strides_in_bytes().z() / sizeof(T); + const int out_batch_stride = dst->info()->strides_in_bytes()[3] / sizeof(T); + execute_window_loop( + window_src, + [&](const Coordinates &id) + { + const int idx = id[0] * out_col_stride + id[1] * out_row_stride + id[2] * out_channel_stride; + reorder::nchw_to_nhwc(reinterpret_cast<const T *>(src_it.ptr()), + reinterpret_cast<T *>(dst_it.ptr()) + idx, n_batches, n_channels, n_rows, n_cols, + in_batch_stride, in_channel_stride, in_row_stride, out_batch_stride, + out_row_stride, out_col_stride); + }, + src_it, dst_it); + } + // HWC -> CHW + else if (src_layout == DataLayout::NHWC && perm == PermutationVector{1U, 2U, 0U}) + { + const int out_col_stride = dst->info()->strides_in_bytes().x() / sizeof(T); + const int out_row_stride = dst->info()->strides_in_bytes().y() / sizeof(T); + const int out_channel_stride = dst->info()->strides_in_bytes().z() / sizeof(T); + const int out_batch_stride = dst->info()->strides_in_bytes()[3] / sizeof(T); + execute_window_loop( + window_src, + [&](const Coordinates &id) + { + const int idx = id[0] * out_channel_stride + id[1] * out_col_stride + id[2] * out_row_stride; + reorder::nhwc_to_nchw(reinterpret_cast<const T *>(src_it.ptr()), + reinterpret_cast<T *>(dst_it.ptr()) + idx, n_batches, n_rows, 
n_cols, n_channels, + in_batch_stride, in_row_stride, in_col_stride, out_batch_stride, + out_channel_stride, out_row_stride); + }, + src_it, dst_it); + } + else + { + // All other cases fall back to C++ + // Permute strides + Strides strides = dst->info()->strides_in_bytes(); + Strides perm_strides = strides; + permute_strides(perm_strides, perm); + const int perm_stride_3 = src->info()->num_dimensions() >= 4 ? perm_strides[3] : 0; + execute_window_loop( + window, + [&](const Coordinates &id) + { + const int idx = + id[0] * perm_strides[0] + id[1] * perm_strides[1] + id[2] * perm_strides[2] + id[3] * perm_stride_3; + *(reinterpret_cast<T *>(dst_it.ptr() + idx)) = *(reinterpret_cast<const T *>(src_it.ptr())); + }, + src_it, dst_it); + } +} +} // namespace + +void CpuPermuteKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + const TensorShape dst_shape = misc::shape_calculator::compute_permutation_output_shape(*src, perm); + // Destination auto inizialitation if not yet initialized + auto_init_if_empty(*dst, src->clone()->set_tensor_shape(dst_shape)); + + // Perform validation step + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, perm)); + + _perm = perm; + + // Configure kernel window + Window win = calculate_max_window(*src, Steps()); + + // This kernel doesn't need padding so update_window_and_padding() can be skipped + + ICpuKernel::configure(win); +} + +Status CpuPermuteKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, perm)); + return Status{}; +} + +void CpuPermuteKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + + const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + switch (src->info()->element_size()) + { + case 1: + run_permute<uint8_t>(window, src, dst, _perm); + break; + case 2: + run_permute<uint16_t>(window, src, dst, _perm); + break; + case 4: + run_permute<uint32_t>(window, src, dst, _perm); + break; + default: + ARM_COMPUTE_ERROR("Element size not supported"); + break; + } +} + +const char *CpuPermuteKernel::name() const +{ + return "CpuPermuteKernel"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuPermuteKernel.h b/src/cpu/kernels/CpuPermuteKernel.h new file mode 100644 index 0000000000..0cb2faf223 --- /dev/null +++ b/src/cpu/kernels/CpuPermuteKernel.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2018-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_PERMUTE_KERNEL_H +#define ARM_COMPUTE_CPU_PERMUTE_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Kernel to perform tensor permutation given a permutation vector */ +class CpuPermuteKernel : public ICpuKernel<CpuPermuteKernel> +{ +public: + CpuPermuteKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuPermuteKernel); + /** Configure kernel for a given list of arguments + * + * @note Arbitrary permutation vectors are supported with rank not greater than 4 + * + * @param[in] src Srouce tensor to permute. Data types supported: All + * @param[out] dst Destination tensor. Data types supported: Same as @p src + * @param[in] perm Permutation vector + */ + void configure(const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuPermuteKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + +private: + PermutationVector _perm{}; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_PERMUTE_KERNEL_H */ diff --git a/src/cpu/kernels/CpuPool2dKernel.cpp b/src/cpu/kernels/CpuPool2dKernel.cpp new file mode 100644 index 0000000000..2c9627bdee --- /dev/null +++ b/src/cpu/kernels/CpuPool2dKernel.cpp @@ -0,0 +1,451 @@ +/* + * Copyright (c) 2017-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/cpu/kernels/CpuPool2dKernel.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/kernels/pool2d/neon/list.h" + +#include <arm_neon.h> + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +using namespace misc::shape_calculator; + +static const std::vector<CpuPool2dKernel::PoolingKernel> available_kernels = { + {"neon_qu8_nhwc_poolMxN", + [](const PoolDataTypeISASelectorData &data) + { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::QASYMM8)); }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::poolingMxN_qasymm8_neon_nhwc)}, + {"neon_qs8_nhwc_poolMxN", + [](const PoolDataTypeISASelectorData &data) + { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::QASYMM8_SIGNED)); }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::poolingMxN_qasymm8_signed_neon_nhwc)}, + {"neon_f16_nhwc_poolMxN", + [](const PoolDataTypeISASelectorData &data) + { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::F16)) && data.isa.fp16; }, + REGISTER_FP16_NEON(arm_compute::cpu::poolingMxN_fp16_neon_nhwc)}, + {"neon_fp32_nhwc_poolMxN", + [](const PoolDataTypeISASelectorData &data) + { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::F32)); }, + REGISTER_FP32_NEON(arm_compute::cpu::poolingMxN_fp32_neon_nhwc)}, +#if defined(ENABLE_NCHW_KERNELS) + {"neon_qu8_nchw_pool2", + [](const PoolDataTypeISASelectorData &data) + { + return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8) && + (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2) && (data.pool_stride_x < 3)); + }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::pooling2_quantized_neon_nchw<uint8_t>)}, + {"neon_qu8_nchw_pool3", + [](const PoolDataTypeISASelectorData &data) + { + return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8) && + (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3) && (data.pool_stride_x < 3)); + }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::pooling3_quantized_neon_nchw<uint8_t>)}, + {"neon_qu8_nchw_poolMxN", + [](const PoolDataTypeISASelectorData &data) + { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8)); }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::poolingMxN_quantized_neon_nchw<uint8_t>)}, + {"neon_qs8_nchw_pool2", + [](const PoolDataTypeISASelectorData &data) + { + return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED) && + (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2) && (data.pool_stride_x < 3)); + }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::pooling2_quantized_neon_nchw<int8_t>)}, + {"neon_qs8_nchw_pool3", + [](const PoolDataTypeISASelectorData &data) + { + return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED) && + (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3) && (data.pool_stride_x < 3)); + }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::pooling3_quantized_neon_nchw<int8_t>)}, + {"neon_qs8_nchw_poolMxN", + [](const PoolDataTypeISASelectorData &data) + { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED)); }, + 
REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::poolingMxN_quantized_neon_nchw<int8_t>)}, + {"neon_fp16_nchw_pool2", + [](const PoolDataTypeISASelectorData &data) + { + return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16 && data.isa.fp16) && + (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2)); + }, + REGISTER_FP16_NEON(arm_compute::cpu::pooling2_fp16_neon_nchw)}, + {"neon_fp16_nchw_pool3", + [](const PoolDataTypeISASelectorData &data) + { + return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16 && data.isa.fp16) && + (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3)); + }, + REGISTER_FP16_NEON(arm_compute::cpu::pooling3_fp16_neon_nchw)}, + {"neon_fp16_nchw_poolMxN", + [](const PoolDataTypeISASelectorData &data) + { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16 && data.isa.fp16)); }, + REGISTER_FP16_NEON(arm_compute::cpu::poolingMxN_fp16_neon_nchw)}, + {"neon_fp32_nchw_pool2", + [](const PoolDataTypeISASelectorData &data) + { + return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) && + (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2)); + }, + REGISTER_FP32_NEON(arm_compute::cpu::pooling2_fp32_neon_nchw)}, + {"neon_fp32_nchw_pool3", + [](const PoolDataTypeISASelectorData &data) + { + return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) && + (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3)); + }, + REGISTER_FP32_NEON(arm_compute::cpu::pooling3_fp32_neon_nchw)}, + {"neon_fp32_nchw_pool7", + [](const PoolDataTypeISASelectorData &data) + { + return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) && + (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 7)); + }, + REGISTER_FP32_NEON(arm_compute::cpu::pooling7_fp32_neon_nchw)}, + {"neon_fp32_nchw_poolMxN", + [](const PoolDataTypeISASelectorData &data) + { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32)); }, + REGISTER_FP32_NEON(arm_compute::cpu::poolingMxN_fp32_neon_nchw)}, +#endif /* defined(ENABLE_NCHW_KERNELS) */ +}; + +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *dst, + const PoolingLayerInfo &pool_info, + const ITensorInfo *indices, + Size2D pool_size) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON(pool_size.x() == 0); + ARM_COMPUTE_RETURN_ERROR_ON(pool_size.y() == 0); + + int pool_stride_x = 0; + int pool_stride_y = 0; + int output_width = 0; + int output_height = 0; + PoolingType pool_type = pool_info.pool_type; + const PadStrideInfo pad_stride_info = pool_info.pad_stride_info; + const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? 
src->data_layout() : pool_info.data_layout; + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + (!is_data_type_float(src->data_type())) && (is_pool_region_entirely_outside_input(pool_info)), + "Pooling region that is entirely outside input tensor is unsupported for non-float types"); + + std::tie(output_width, output_height) = + scaled_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height], pool_size.x(), + pool_size.y(), pool_info.pad_stride_info); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1), + "Calculated output dimension size is invalid"); + + TensorInfo out_info(TensorInfo(compute_pool_shape(*src, pool_info), 1, dst->data_type())); + std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride(); + + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); + if (indices) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32, DataType::F16); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX, + "Pooling indices only supported for MAX pooling method"); + } + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(pool_type == PoolingType::L2 && is_data_type_quantized(src->data_type())); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + is_data_type_quantized(src->data_type()) && !pool_info.exclude_padding && + (pool_info.pool_type == PoolingType::AVG) && pool_info.pad_stride_info.has_padding() && + (src->data_layout() == DataLayout::NHWC), + "exclude_padding equal false is not supported for AVG Pooling with padding on quantized types"); + + if (dst->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &out_info); + if (indices) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + ((pool_size != Size2D(2, 2)) && !pool_info.use_kernel_indices), + "Pooling indices returning source tensor coordinates is only supported for pool size 2x2"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_info.use_kernel_indices && (src->data_layout() != DataLayout::NHWC), + "Pooling kernel indices only supported for NHWC"); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(indices, &out_info); + } + } + + const auto *uk = CpuPool2dKernel::get_implementation(PoolDataTypeISASelectorData{ + src->data_type(), src->data_layout(), pool_stride_x, pool_size, CPUInfo::get().get_isa()}); + ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); + + return Status{}; +} + +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, + ITensorInfo *dst, + ITensorInfo *indices, + const PoolingLayerInfo &pool_info, + unsigned int &num_elems_processed_per_iteration, + int pool_size_x, + int pool_size_y) +{ + // dst auto inizialitation if not yet initialized + auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_pool_shape(*src, pool_info))); + if (indices) + { + // Indices auto inizialitation if not yet initialized + auto_init_if_empty(*indices, (src->clone()->set_tensor_shape(compute_pool_shape(*src, pool_info))) + .set_data_type(DataType::U32) /* we store the offset to the element */); + } + const auto 
data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout; + + int pool_stride_x = 0; + int pool_stride_y = 0; + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const PadStrideInfo pad_stride_info = pool_info.pad_stride_info; + + std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride(); + const bool is_square = pool_size_x == pool_size_y; + const unsigned int pooled_w = dst->dimension(idx_width); + const unsigned int pooled_h = dst->dimension(idx_height); + + //If it's not squared and optimized will be executed the MxN + num_elems_processed_per_iteration = 1; + + if (is_square) + { + switch (src->data_type()) + { + case DataType::QASYMM8: + case DataType::QASYMM8_SIGNED: + switch (pool_size_x) + { + case 2: + num_elems_processed_per_iteration = (pool_stride_x == 2) ? 8 : 15; + break; + case 3: + num_elems_processed_per_iteration = (pool_stride_x == 2) ? 7 : 14; + break; + default: + break; + } + break; + case DataType::F16: + num_elems_processed_per_iteration = 1; + break; + case DataType::F32: + num_elems_processed_per_iteration = 1; + break; + default: + ARM_COMPUTE_ERROR("Element size not supported"); + break; + } + } + + bool window_changed = false; + Window win{}; + // Upper limit for the number of right/bottom border elements that are accessed + TensorShape dst_shape{src->tensor_shape()}; + dst_shape.set(0, pooled_w); + dst_shape.set(1, pooled_h); + TensorInfo dst_info(src->clone()->set_tensor_shape(dst_shape)); + win = calculate_max_window(dst_info, Steps(num_elems_processed_per_iteration)); + + Status err = + (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + return std::make_pair(err, win); +} +} // namespace + +void CpuPool2dKernel::configure(ITensorInfo *src, + ITensorInfo *dst, + const PoolingLayerInfo &pool_info, + ITensorInfo *indices) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + const PadStrideInfo pad_stride_info = pool_info.pad_stride_info; + const bool is_global_pooling = pool_info.is_global_pooling; + + // Get data layout + const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout; + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + + // Update pool size in case of global pooling + const Size2D pool_size(is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width, + is_global_pooling ? 
src->dimension(idx_height) : pool_info.pool_size.height); + + // Perform validation step + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, pool_info, indices, pool_size)); + + const auto *uk = CpuPool2dKernel::get_implementation( + PoolDataTypeISASelectorData{src->data_type(), src->data_layout(), (int)pad_stride_info.stride().first, + pool_size, CPUInfo::get().get_isa()}); + ARM_COMPUTE_ERROR_ON(uk == nullptr); + + // Set instance variables + _pool_info = pool_info; + _data_layout = src->data_layout(); + _pool_size = pool_size; + _pool_stride_x = pad_stride_info.stride().first; + _run_method = uk->ukernel; + _name = std::string("CpuPool2dKernel").append("/").append(uk->name); + + if (_data_layout == DataLayout::NHWC) + { + // Configure kernel window + Window win = calculate_max_window(*dst, Steps()); + ICpuKernel::configure(win); + } + else + { + // Configure kernel window + auto win_config = validate_and_configure_window( + src, dst, indices, pool_info, _num_elems_processed_per_iteration, pool_size.x(), pool_size.y()); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICpuKernel::configure(win_config.second); + } +} + +Status CpuPool2dKernel::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const PoolingLayerInfo &pool_info, + const ITensorInfo *indices) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); + + unsigned int num_elems_processed_per_iteration = 0; + + const bool is_global_pooling = pool_info.is_global_pooling; + + // Get data layout + const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout; + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + + unsigned int pool_size_x = is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width; + unsigned int pool_size_y = is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height; + + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, pool_info, indices, Size2D(pool_size_x, pool_size_y))); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), + (indices) ? indices->clone().get() : nullptr, pool_info, + num_elems_processed_per_iteration, pool_size_x, + pool_size_y) + .first); + + return Status{}; +} + +void CpuPool2dKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_run_method == nullptr); + + const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC_0); + ITensor *dst = tensors.get_tensor(TensorType::ACL_DST_0); + ITensor *indices = tensors.get_tensor(TensorType::ACL_DST_1); + + const unsigned int pool_stride_x = _pool_info.pad_stride_info.stride().first; + const unsigned int pool_stride_y = _pool_info.pad_stride_info.stride().second; + const unsigned int pool_size = _pool_info.pool_size.width; + + Window window_src(window); + if (_data_layout == DataLayout::NCHW) + { + // Set step for src in x and y direction for the src + unsigned int window_x_inc = 0; + switch (src->info()->data_type()) + { + case DataType::QASYMM8: + case DataType::QASYMM8_SIGNED: + { + window_x_inc = pool_stride_x; + if ((pool_size == 2 || pool_size == 3) && pool_stride_x < 3) + { + window_x_inc = (pool_stride_x == 2) ? 
_num_elems_processed_per_iteration * 2 + : _num_elems_processed_per_iteration; + } + break; + } + + case DataType::F16: + case DataType::F32: + { + window_x_inc = pool_stride_x; + break; + } + default: + { + ARM_COMPUTE_ERROR("Not supported"); + } + } + window_src.set(Window::DimX, Window::Dimension(window.x().start() * pool_stride_x, + window.x().end() * pool_stride_x, window_x_inc)); + window_src.set(Window::DimY, Window::Dimension(window.y().start() * pool_stride_y, + window.y().end() * pool_stride_y, pool_stride_y)); + } + else + { + window_src.set(Window::DimX, Window::Dimension(0, 1, 1)); + window_src.set(Window::DimY, Window::Dimension(0, src->info()->dimension(1), pool_stride_x)); + window_src.set(Window::DimZ, Window::Dimension(0, src->info()->dimension(2), pool_stride_y)); + } + _run_method(src, dst, indices, _pool_info, window_src, window); +} + +const char *CpuPool2dKernel::name() const +{ + return _name.c_str(); +} + +const std::vector<CpuPool2dKernel::PoolingKernel> &CpuPool2dKernel::get_available_kernels() +{ + return available_kernels; +} + +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuPool2dKernel.h b/src/cpu/kernels/CpuPool2dKernel.h new file mode 100644 index 0000000000..859de8cc5f --- /dev/null +++ b/src/cpu/kernels/CpuPool2dKernel.h @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2017-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_POOL2D_KERNEL_H +#define ARM_COMPUTE_CPU_POOL2D_KERNEL_H + +#include "arm_compute/core/Types.h" + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Interface for the pooling layer kernel */ +class CpuPool2dKernel : public ICpuKernel<CpuPool2dKernel> +{ +private: + using PoolingKernelPtr = std::add_pointer<void( + const ITensor *, ITensor *, ITensor *, PoolingLayerInfo &, const Window &, const Window &)>::type; + +public: + CpuPool2dKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuPool2dKernel); + /** Configure kernel for a given list of arguments + * + * @note F16 are supported for pool sizes 2 and 3 only + * + * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[out] dst Destination tensor info. Data types supported: Same as @p src. + * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. 
+ * @param[out] indices (optional) The indices of the maximal values. Data type supported: U32. + */ + void + configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuPool2dKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, + const ITensorInfo *dst, + const PoolingLayerInfo &pool_info, + const ITensorInfo *indices = nullptr); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + + struct PoolingKernel + { + const char *name; + const PoolDataTypeISASelectorPtr is_selected; + PoolingKernelPtr ukernel; + }; + + static const std::vector<PoolingKernel> &get_available_kernels(); + +private: + PoolingLayerInfo _pool_info{}; + DataLayout _data_layout{DataLayout::UNKNOWN}; + unsigned int _num_elems_processed_per_iteration{0}; + Size2D _pool_size{}; + int _pool_stride_x{}; + PoolingKernelPtr _run_method{nullptr}; + std::string _name{}; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_POOL2D_KERNEL_H */ diff --git a/src/cpu/kernels/CpuPool3dKernel.cpp b/src/cpu/kernels/CpuPool3dKernel.cpp new file mode 100644 index 0000000000..8b484d4e0b --- /dev/null +++ b/src/cpu/kernels/CpuPool3dKernel.cpp @@ -0,0 +1,186 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/cpu/kernels/CpuPool3dKernel.h" + +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" + +#include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/cpu/kernels/pool3d/list.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +using namespace misc::shape_calculator; + +static const std::vector<CpuPool3dKernel::Pooling3dKernel> available_kernels = { + {"neon_qu8_ndhwc_poolMxNxD", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8); }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_q8_pool3d)}, + {"neon_qs8_ndhwc_poolMxNxD", + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED); }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_q8_signed_pool3d)}, + {"neon_fp16_ndhwc_poolMxNxD", + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16 && data.isa.fp16); }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_pool3d)}, + {"neon_fp32_ndhwc_poolMxNxD", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32); }, + REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_pool3d)}}; + +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const Pooling3dLayerInfo &pool_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_layout() != DataLayout::NDHWC, "Only NDHWC layout supported"); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32, DataType::QASYMM8, + DataType::QASYMM8_SIGNED); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG((!is_data_type_float(src->data_type())) && + (!pool_info.exclude_padding && (pool_info.pool_type == PoolingType::AVG)), + "Exclude padding is unsupported for non-float types for Avg op"); + + const auto data_layout = src->data_layout(); + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const int idx_depth = get_data_layout_dimension_index(data_layout, DataLayoutDimension::DEPTH); + + const bool is_global_pooling = pool_info.is_global_pooling; + const unsigned int pool_size_x = is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width; + const unsigned int pool_size_y = is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height; + const unsigned int pool_size_z = is_global_pooling ? 
src->dimension(idx_depth) : pool_info.pool_size.depth; + + const unsigned int stride_x = pool_info.stride.x(); + const unsigned int stride_y = pool_info.stride.y(); + const unsigned int stride_z = pool_info.stride.z(); + + ARM_COMPUTE_RETURN_ERROR_ON((pool_size_x == 0) || (pool_size_y == 0) || (pool_size_z == 0)); + ARM_COMPUTE_RETURN_ERROR_ON((stride_x == 0) || (stride_y == 0) || (stride_z == 0)); + + int output_width = 0; + int output_height = 0; + int output_depth = 0; + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_pool_3d_region_entirely_outside_input(pool_info), + "Pooling region that is entirely outside input tensor is unsupported"); + + std::tie(output_width, output_height, output_depth) = + scaled_3d_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height], + src->tensor_shape()[idx_depth], pool_size_x, pool_size_y, pool_size_z, pool_info); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1 || output_depth < 1), + "Calculated output dimension size is invalid"); + + if (dst->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst); + TensorInfo out_info( + TensorInfo(compute_pool3d_shape(src->tensor_shape(), pool_info), 1, dst->data_type(), DataLayout::NDHWC)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &out_info); + } + + const auto *uk = + CpuPool3dKernel::get_implementation(DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()}); + ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); + + return Status{}; +} +} //namespace + +void CpuPool3dKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const Pooling3dLayerInfo &pool_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + + // Perform validation step + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, pool_info)); + + // dst auto initialization if not yet initialized + auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_pool3d_shape(src->tensor_shape(), pool_info))); + + // Get data layout + const auto data_layout = src->data_layout(); + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const int idx_depth = get_data_layout_dimension_index(data_layout, DataLayoutDimension::DEPTH); + + // Update pool size in case of global pooling + const bool is_global_pooling = pool_info.is_global_pooling; + const Size3D pool_size(is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width, + is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height, + is_global_pooling ?
src->dimension(idx_depth) : pool_info.pool_size.depth); + + const auto *uk = + CpuPool3dKernel::get_implementation(DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()}); + ARM_COMPUTE_ERROR_ON(uk == nullptr); + + // Set instance variables + _pool_info = pool_info; + _run_method = uk->ukernel; + _name = std::string("CpuPool3dKernel").append("/").append(uk->name); + + // Configure kernel window + Window win = calculate_max_window(*dst, Steps()); + ICpuKernel::configure(win); +} + +Status CpuPool3dKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const Pooling3dLayerInfo &pool_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); + + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, pool_info)); + + return Status{}; +} + +void CpuPool3dKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_run_method == nullptr); + + const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC_0); + ITensor *dst = tensors.get_tensor(TensorType::ACL_DST_0); + + _run_method(src, dst, _pool_info, window); +} + +const char *CpuPool3dKernel::name() const +{ + return _name.c_str(); +} + +const std::vector<CpuPool3dKernel::Pooling3dKernel> &CpuPool3dKernel::get_available_kernels() +{ + return available_kernels; +} + +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuPool3dKernel.h b/src/cpu/kernels/CpuPool3dKernel.h new file mode 100644 index 0000000000..bd1ff61046 --- /dev/null +++ b/src/cpu/kernels/CpuPool3dKernel.h @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_POOL3D_KERNEL_H +#define ARM_COMPUTE_CPU_POOL3D_KERNEL_H + +#include "arm_compute/core/Types.h" + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Interface for the kernel to perform Pooling 3D. 
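+ *
+ * A minimal usage sketch (illustrative only: the tensor shape and pooling parameters below are
+ * made-up values and not taken from the library's tests):
+ * @code
+ * TensorInfo src(TensorShape(8U, 16U, 16U, 16U, 2U), 1, DataType::F32, DataLayout::NDHWC);
+ * TensorInfo dst{}; // left empty: configure() auto-initialises it from src and pool_info
+ * Pooling3dLayerInfo pool_info{};
+ * pool_info.pool_type = PoolingType::MAX;
+ * pool_info.pool_size = Size3D(2, 2, 2);
+ * pool_info.stride    = Size3D(2, 2, 2);
+ * const Status st = CpuPool3dKernel::validate(&src, &dst, pool_info);
+ * CpuPool3dKernel pool3d;
+ * pool3d.configure(&src, &dst, pool_info);
+ * @endcode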
*/ +class CpuPool3dKernel : public ICpuKernel<CpuPool3dKernel> +{ +private: + /* Template function for Pooling 3D NDHWC */ + using Pooling3dKernelPtr = + std::add_pointer<void(const ITensor *, ITensor *, Pooling3dLayerInfo &, const Window &)>::type; + +public: + CpuPool3dKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuPool3dKernel); + /** Set the src, dst tensor and pooling info. + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |F16 |F16 | + * |F32 |F32 | + * |QASYMM8 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED | + * + * @param[in] src Source tensor info. Data types supported: F16/F32/QASYMM8/QASYMM8_SIGNED. + * @param[out] dst Destination tensor info. Data types supported: Same as @p src. + * @param[in] pool_info Contains pooling operation information described in @ref Pooling3dLayerInfo. + */ + void configure(const ITensorInfo *src, ITensorInfo *dst, const Pooling3dLayerInfo &pool_info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuPool3dKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const Pooling3dLayerInfo &pool_info); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + + struct Pooling3dKernel + { + const char *name; + const DataTypeISASelectorPtr is_selected; + Pooling3dKernelPtr ukernel; + }; + + static const std::vector<Pooling3dKernel> &get_available_kernels(); + +private: + Pooling3dLayerInfo _pool_info{}; + Pooling3dKernelPtr _run_method{nullptr}; + std::string _name{}; +}; + +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /*ARM_COMPUTE_CPU_POOL3D_KERNEL_H */ diff --git a/src/cpu/kernels/CpuQuantizeKernel.cpp b/src/cpu/kernels/CpuQuantizeKernel.cpp new file mode 100644 index 0000000000..ed4675ae3d --- /dev/null +++ b/src/cpu/kernels/CpuQuantizeKernel.cpp @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2017-2022, 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/cpu/kernels/CpuQuantizeKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/kernels/quantize/generic/neon/list.h" + +#include <arm_neon.h> +#include <map> + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ + +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QSYMM8, DataType::QASYMM8, + DataType::QASYMM8_SIGNED, DataType::QASYMM16); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); + + return Status{}; +} + +} // namespace + +void CpuQuantizeKernel::configure(const ITensorInfo *src, ITensorInfo *dst) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst)); + + static const std::map<std::string, QuantizeFunctionExecutorPtr> quant_map = { + {"op_QASYMM8_QASYMM8", REGISTER_INTEGER_NEON(u8_u8_run_quantize_qasymm8)}, + {"op_QASYMM8_QASYMM8_SIGNED", REGISTER_INTEGER_NEON(u8_i8_run_quantize_qasymm8)}, + {"op_QASYMM8_QASYMM16", REGISTER_INTEGER_NEON(u8_run_quantize_qasymm16)}, + + {"op_QASYMM8_SIGNED_QASYMM8", REGISTER_INTEGER_NEON(i8_u8_run_quantize_qasymm8)}, + {"op_QASYMM8_SIGNED_QASYMM8_SIGNED", REGISTER_INTEGER_NEON(i8_i8_run_quantize_qasymm8)}, + {"op_QASYMM8_SIGNED_QASYMM16", REGISTER_INTEGER_NEON(i8_run_quantize_qasymm16)}, + + // Functions for offset only requantization + {"op_OFFSET_ONLY_QASYMM8_QASYMM8", REGISTER_INTEGER_NEON(u8_u8_run_requantize_offset_only)}, + {"op_OFFSET_ONLY_QASYMM8_QASYMM8_SIGNED", REGISTER_INTEGER_NEON(u8_i8_run_requantize_offset_only)}, + {"op_OFFSET_ONLY_QASYMM8_SIGNED_QASYMM8", REGISTER_INTEGER_NEON(i8_u8_run_requantize_offset_only)}, + {"op_OFFSET_ONLY_QASYMM8_SIGNED_QASYMM8_SIGNED", REGISTER_INTEGER_NEON(i8_i8_run_requantize_offset_only)}, + + // Functions for offset uint8 to int8 and vice versa quantization (no scale changes) + {"op_OFFSET_ONLY_CONVERT_QASYMM8_SIGNED_QASYMM8", + REGISTER_INTEGER_NEON(i8_u8_run_requantize_offset_only_convert)}, + {"op_OFFSET_ONLY_CONVERT_QASYMM8_QASYMM8_SIGNED", + REGISTER_INTEGER_NEON(u8_i8_run_requantize_offset_only_convert)}, + + {"op_F32_QSYMM8", REGISTER_FP32_NEON(fp32_i8_run_quantize_qsymm8)}, + {"op_F32_QASYMM8", REGISTER_FP32_NEON(fp32_u8_run_quantize_qasymm8)}, + {"op_F32_QASYMM8_SIGNED", REGISTER_FP32_NEON(fp32_i8_run_quantize_qasymm8)}, + {"op_F32_QASYMM16", REGISTER_FP32_NEON(fp32_run_quantize_qasymm16)}, + +#ifdef ARM_COMPUTE_ENABLE_FP16 + {"op_F16_QASYMM8", REGISTER_FP16_NEON(fp16_u8_run_quantize_qasymm8)}, + {"op_F16_QASYMM8_SIGNED", REGISTER_FP16_NEON(fp16_i8_run_quantize_qasymm8)}, + {"op_F16_QASYMM16", REGISTER_FP16_NEON(fp16_run_quantize_qasymm16)}, +#endif /* ARM_COMPUTE_ENABLE_FP16 */ + }; + + std::string function_to_call("op_"); + + // For offset only functions - must be 8-bit and have identical scale values. 
+ if (src->quantization_info().scale() == dst->quantization_info().scale() && + (is_data_type_quantized_asymmetric_char(src->data_type()) && + is_data_type_quantized_asymmetric_char(dst->data_type()))) + { + function_to_call += "OFFSET_ONLY_"; + // For optimized datatype conversion 8-bit re-quantization offset only functions. + // These must have an offset of exactly 128 to match requirements - has specific circumstances to match use case. + auto uqinfo = + compute_requantization_scale_offset(src->quantization_info().uniform(), dst->quantization_info().uniform()); + const auto src_dt = src->data_type(); + if (src->data_type() != dst->data_type() && ((src_dt == DataType::QASYMM8_SIGNED && uqinfo.offset == 128) || + (src_dt == DataType::QASYMM8 && uqinfo.offset == -128))) + { + function_to_call += "CONVERT_"; + } + } + + // Specify datatype for function + function_to_call += string_from_data_type(src->data_type()) + "_"; + function_to_call += string_from_data_type(dst->data_type()); + + auto it = quant_map.find(function_to_call); + + if (it == quant_map.end()) + { + ARM_COMPUTE_ERROR("Unsupported combination of input and output data types"); + } + _func = it->second; + + // Calculate window. Squash if possible. + Window win; + std::tie(win, _split_dimension) = calculate_squashed_or_max_window(*src); + + ICpuKernel::configure(win); +} + +Status CpuQuantizeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst)); + return Status{}; +} + +void CpuQuantizeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_func == nullptr); + + const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + (*_func)(src, dst, window); +} + +const char *CpuQuantizeKernel::name() const +{ + return "CpuQuantizeKernel"; +} + +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuQuantizeKernel.h b/src/cpu/kernels/CpuQuantizeKernel.h new file mode 100644 index 0000000000..750310c811 --- /dev/null +++ b/src/cpu/kernels/CpuQuantizeKernel.h @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2017-2022, 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ACL_SRC_CPU_KERNELS_CPUQUANTIZEKERNEL_H +#define ACL_SRC_CPU_KERNELS_CPUQUANTIZEKERNEL_H + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Interface for the quantization layer kernel. + * + * @note The implementation supports only 3D input tensors + */ +class CpuQuantizeKernel : public ICpuKernel<CpuQuantizeKernel> +{ +public: + CpuQuantizeKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuQuantizeKernel); + /** Set the input, output. + * + * @param[in] src Source tensor info. The dimensions over the third will be interpreted as batches. Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16. + * @param[out] dst Destination tensor info with the same dimensions of input. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16. + * + * @note Output auto initialization is not supported by this kernel + */ + void configure(const ITensorInfo *src, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuQuantizeKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst); + + /** Get the preferred dimension in which the scheduler splits the work into multiple jobs. + * + * @return The split dimension hint. + */ + size_t get_split_dimension_hint() const + { + return _split_dimension; + } + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + +private: + /** Common signature for all the specialised @ref CpuQuantizeKernel functions + * + * @param[in] window Region on which to execute the kernel. + */ + using QuantizeFunctionExecutorPtr = void (*)(const ITensor *src, ITensor *dst, const Window &window); + QuantizeFunctionExecutorPtr _func{nullptr}; + size_t _split_dimension{Window::DimY}; +}; + +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif // ACL_SRC_CPU_KERNELS_CPUQUANTIZEKERNEL_H diff --git a/src/cpu/kernels/CpuReshapeKernel.cpp b/src/cpu/kernels/CpuReshapeKernel.cpp new file mode 100644 index 0000000000..241e58fbce --- /dev/null +++ b/src/cpu/kernels/CpuReshapeKernel.cpp @@ -0,0 +1,271 @@ +/* + * Copyright (c) 2017-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/cpu/kernels/CpuReshapeKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" + +#include "src/core/helpers/Utils.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/INEKernel.h" + +#include <cstdint> + +/** [NEReshapeLayerKernel Kernel] **/ +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions. + ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); + + if (dst->tensor_shape().total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() != dst->tensor_shape().total_size()); + } + + return Status{}; +} + +template <typename T> +void reshape_tensor_per_element(const Window &window, const ITensor *src, ITensor *dst) +{ + const TensorShape &src_shape = src->info()->tensor_shape(); + const TensorShape &dst_shape = dst->info()->tensor_shape(); + + Iterator dst_it(dst, window); + + execute_window_loop( + window, + [&](const Coordinates &dst_coord) + { + Coordinates src_coord = index2coords(src_shape, coords2index(dst_shape, dst_coord)); + const auto output_ptr = dst->ptr_to_element(dst_coord); + const auto input_ptr = src->ptr_to_element(src_coord); + + *reinterpret_cast<T *>(output_ptr) = *reinterpret_cast<T *>(input_ptr); + }, + dst_it); +} + +void reshape_tensor_per_element_selector(const Window &window, const ITensor *src, ITensor *dst) +{ + switch (src->info()->data_type()) + { + case DataType::U8: + case DataType::S8: + case DataType::QSYMM8: + case DataType::QASYMM8: + case DataType::QASYMM8_SIGNED: + case DataType::QSYMM8_PER_CHANNEL: + reshape_tensor_per_element<uint8_t>(window, src, dst); + break; + case DataType::U16: + case DataType::S16: + case DataType::F16: + reshape_tensor_per_element<uint16_t>(window, src, dst); + break; + case DataType::U32: + case DataType::S32: + case DataType::F32: + reshape_tensor_per_element<uint32_t>(window, src, dst); + break; + case DataType::U64: + case DataType::S64: + case DataType::F64: + reshape_tensor_per_element<uint64_t>(window, src, dst); + break; + default: + ARM_COMPUTE_ERROR("Unsupported data type!"); + } +} + +void reshape_tensor_per_row(const Window &window, const ITensor *src, ITensor *dst) +{ + const TensorShape &src_shape = src->info()->tensor_shape(); + const TensorShape &dst_shape = dst->info()->tensor_shape(); + Coordinates src_coord{}; + Coordinates dst_coord{}; + + const auto element_size = dst->info()->element_size(); + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const auto src_row_size = static_cast<int>(src_shape[0]); + const auto row_size_in_bytes = src_row_size * element_size; + + auto output_ptr = dst->ptr_to_element(dst_coord); + auto input_ptr = src->ptr_to_element(src_coord); + + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator dst_it(dst, win); + execute_window_loop( + win, + [&](Coordinates &id) + { + dst_coord = id; + + 
for (int x = window_start_x; x < window_end_x; x += src_row_size) + { + src_coord = index2coords(src_shape, coords2index(dst_shape, dst_coord)); + output_ptr = dst->ptr_to_element(dst_coord); + input_ptr = src->ptr_to_element(src_coord); + + std::memcpy(output_ptr, input_ptr, row_size_in_bytes); + + dst_coord.increment(Window::DimX, src_row_size); + } + }, + dst_it); +} + +void reshape_tensor_per_window(const Window &window, const ITensor *src, ITensor *dst) +{ + Iterator src_it(src, window); + Iterator dst_it(dst, window); + + const size_t element_size = dst->info()->element_size(); + const auto window_size = window.x().end() - window.x().start(); + const auto window_size_in_bytes = window_size * element_size; + + const auto input_ptr = src_it.ptr(); + const auto output_ptr = dst_it.ptr(); + + std::memcpy(output_ptr, input_ptr, window_size_in_bytes); +} +} // namespace + +void CpuReshapeKernel::configure(const ITensorInfo *src, ITensorInfo *dst) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst)); + ARM_COMPUTE_UNUSED(src); + + _reshape_tensor_fn = reshape_tensor_per_element_selector; + // Configure kernel window + Window win = calculate_max_window(*dst); + + ICpuKernel::configure(win); +} + +Status CpuReshapeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst)); + return Status{}; +} + +void CpuReshapeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + + const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + _reshape_tensor_fn(window, src, dst); +} + +const char *CpuReshapeKernel::name() const +{ + return "CpuReshapeKernel"; +} + +size_t CpuReshapeKernel::get_mws(const CPUInfo &platform, size_t thread_count) const +{ + ARM_COMPUTE_UNUSED(thread_count); + ARM_COMPUTE_UNUSED(platform); + + return ICPPKernel::default_mws; +} + +void CpuReshapeKernel::prepare(ITensorPack &tensors) +{ + const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + const ITensorInfo *src_info = src->info(); + const ITensorInfo *dst_info = dst->info(); + + // Calculate kernel window based on the padding info + Window win; + + const bool src_has_holes = has_holes(*src_info, src_info->num_dimensions() - 1); + const bool dst_has_holes = has_holes(*dst_info, dst_info->num_dimensions() - 1); + const bool src_has_holes_in_x = has_holes(*src_info, Window::DimX); + const bool dst_has_holes_in_x = has_holes(*dst_info, Window::DimX); + const auto src_row_size = static_cast<int>(src_info->tensor_shape()[0]); + const auto dst_row_size = static_cast<int>(dst_info->tensor_shape()[0]); + + if (!src_has_holes && !dst_has_holes) + { + std::tie(win, _split_dimension) = calculate_squashed_or_max_window(*dst_info); + /* + Copy the tensor per window. 
If the src and dst tensors + are contiguous memory allocations without any holes or + padding, then the tensor is squashed to 1D window and + we can use a single memcpy call to copy the whole + window in reshape_tensor_per_window fn + */ + _reshape_tensor_fn = reshape_tensor_per_window; + } + else + { + win = calculate_max_window(*dst_info); + /* + Copy tensor row by row if src and dst have no holes in X + dim and they have the same number of elements in their rows + */ + if (!src_has_holes_in_x && !dst_has_holes_in_x && (src_row_size == dst_row_size)) + { + _reshape_tensor_fn = reshape_tensor_per_row; + } + else + { + /* + Fall back to the element wise copy + */ + _reshape_tensor_fn = reshape_tensor_per_element_selector; + } + } + + ICPPKernel::configure(win); +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +/** [NEReshapeLayerKernel Kernel] **/ diff --git a/src/cpu/kernels/CpuReshapeKernel.h b/src/cpu/kernels/CpuReshapeKernel.h new file mode 100644 index 0000000000..ce566fd9e2 --- /dev/null +++ b/src/cpu/kernels/CpuReshapeKernel.h @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2017-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_RESHAPE_KERNEL_H +#define ARM_COMPUTE_CPU_RESHAPE_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Interface for the kernel to perform tensor reshaping */ +class CpuReshapeKernel : public ICpuKernel<CpuReshapeKernel> +{ +public: + CpuReshapeKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuReshapeKernel); + /** Configure kernel for a given list of arguments + * + * @param[in] src Source tensor info. Data type supported: All + * @param[out] dst Destination tensor info.
Data type supported: Same as @p input + */ + void configure(const ITensorInfo *src, ITensorInfo *dst); + + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuReshapeKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + + /** Prepare the reshape kernel for execution (Only executed once) by calculating max or squashed window and selecting the _reshape_tensor_fn based on the presence of holes + * + * @param[in] tensors Pack of input and output tensors + * + */ + void prepare(ITensorPack &tensors); + + /** Return minimum workload size of the relevant kernel + * + * @param[in] platform The CPU platform used to create the context. + * @param[in] thread_count Number of threads in the execution. + * + * @return[out] small_network_mws Minimum workload size for requested configuration. + */ + size_t get_mws(const CPUInfo &platform, size_t thread_count) const override; + + /** Get the preferred dimension in which the scheduler splits the work into multiple jobs. + * + * @return The split dimension. + */ + size_t get_split_dimension() const + { + return _split_dimension; + } + +private: + size_t _split_dimension{Window::DimY}; + + std::function<void(const Window &window, const ITensor *src, ITensor *dst)> _reshape_tensor_fn{}; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_RESHAPE_KERNEL_H */ diff --git a/src/cpu/kernels/CpuScaleKernel.cpp b/src/cpu/kernels/CpuScaleKernel.cpp new file mode 100644 index 0000000000..7cf8916e9b --- /dev/null +++ b/src/cpu/kernels/CpuScaleKernel.cpp @@ -0,0 +1,538 @@ +/* + * Copyright (c) 2016-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
+ */ +#include "src/cpu/kernels/CpuScaleKernel.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/utils/InterpolationPolicyUtils.h" +#include "arm_compute/core/Window.h" + +#include "src/core/common/Registrars.h" +#include "src/core/helpers/ScaleHelpers.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/cpu/kernels/scale/neon/list.h" +#include "src/cpu/kernels/scale/sve/list.h" +#include "support/Rounding.h" + +#include <arm_neon.h> +#include <map> + +namespace arm_compute +{ +namespace cpu +{ + +#ifdef ENABLE_NCHW_KERNELS +void scale_area_nchw_u8(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + InterpolationPolicy policy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) +{ + ARM_COMPUTE_UNUSED(dx, dy, offsets, policy, border_mode, constant_border_value, sampling_offset, align_corners); + using namespace scale_helpers; + + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::U8); + + // Don't increment in width/height/channels for the input tensor + // A pointer to the start of this plane is needed as base for the precomputed offsets + Window win_in(window); + win_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + win_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + Iterator src_i(src, win_in); + Iterator dst_i(dst, window); + + const auto wr = + scale_utils::calculate_resize_ratio(src->info()->dimension(0), dst->info()->dimension(0), align_corners); + const auto hr = + scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners); + const auto w = src->info()->dimension(0); + const auto h = src->info()->dimension(1); + const size_t in_stride = src->info()->strides_in_bytes()[1]; + + execute_window_loop( + window, + [&](const Coordinates &id) + { + const auto in_ptr = reinterpret_cast<const uint8_t *>(src_i.ptr()); + + uint8x8_t tmp0 = vdup_n_u8(0); + tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x(), id.y()), tmp0, 0); + tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 1, id.y()), tmp0, 1); + tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 2, id.y()), tmp0, 2); + tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 3, id.y()), tmp0, 3); + tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 4, id.y()), tmp0, 4); + tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 5, id.y()), tmp0, 5); + tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 6, id.y()), tmp0, 6); + tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 7, id.y()), tmp0, 7); + + uint8x8_t tmp1 = vdup_n_u8(0); + tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 8, id.y()), tmp1, 0); + tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 9, id.y()), tmp1, 1); + tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 10, id.y()), tmp1, 2); + tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 11, id.y()), tmp1, 3); + tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 12, id.y()), tmp1, 4); + tmp1 = 
vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 13, id.y()), tmp1, 5); + tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 14, id.y()), tmp1, 6); + tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 15, id.y()), tmp1, 7); + + vst1q_u8(dst_i.ptr(), vcombine_u8(tmp0, tmp1)); + }, + src_i, dst_i); +} + +template <typename T> +void scale_bilinear_qasymm_nchw(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + InterpolationPolicy policy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) +{ + ARM_COMPUTE_UNUSED(policy); + // Get data layout and width/height indices + const int idx_width = get_data_layout_dimension_index(DataLayout::NCHW, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(DataLayout::NCHW, DataLayoutDimension::HEIGHT); + + // Compute the ratio between source height and destination height + const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(idx_height), + dst->info()->dimension(idx_height), align_corners); + Window win_off; + win_off.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_off.set(Window::DimY, Window::Dimension(0, 0, 0)); + + // Don't increment in X and Y direction for the input tensor + // A pointer to the start of this plane is needed as base for the precomputed offsets + Window win_in(window); + win_in.set(idx_width, Window::Dimension(0, 0, 0)); + win_in.set(idx_height, Window::Dimension(0, 0, 0)); + + for (size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d) + { + win_off.set(d, Window::Dimension(0, 0, 0)); + } + + Iterator src_i(src, win_in); + Iterator dst_i(dst, window); + + const int32_t in_dim_w = src->info()->dimension(idx_width); + const int32_t in_dim_h = src->info()->dimension(idx_height); + const int32_t stride_w = src->info()->strides_in_bytes()[idx_width]; + const int32_t stride_h = src->info()->strides_in_bytes()[idx_height]; + + const UniformQuantizationInfo iq_info = src->info()->quantization_info().uniform(); + const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); + + if (border_mode == BorderMode::CONSTANT) + { +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + using ConstType = typename std::conditional<std::is_same<T, float16_t>::value, half, T>::type; +#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + using ConstType = T; +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + const T const_border_value = static_cast<T>(constant_border_value.get<ConstType>()); + execute_window_loop( + window, + [&](const Coordinates &id) + { + const int32_t index_h = std::floor((id[idx_height] + sampling_offset) * hr - sampling_offset); + const int32_t index_w = *(reinterpret_cast<const int32_t *>( + offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); + const auto dx_val = + *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); + const auto dy_val = + *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); + const auto pixel_row_ptr = reinterpret_cast<const T *>(src_i.ptr()); + + const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h) + ? 
(*(pixel_row_ptr + index_w * stride_w + index_h * stride_h)) + : const_border_value; + const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h) + ? (*(pixel_row_ptr + (index_w + 1) * stride_w + index_h * stride_h)) + : const_border_value; + const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h && index_h < in_dim_h - 1) + ? (*(pixel_row_ptr + index_w * stride_w + (index_h + 1) * stride_h)) + : const_border_value; + const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h && index_h < in_dim_h - 1) + ? (*(pixel_row_ptr + (index_w + 1) * stride_w + (index_h + 1) * stride_h)) + : const_border_value; + + const float inp00 = Qasymm8QuantizationHelper<T>::dequantize(a00, iq_info); + const float inp01 = Qasymm8QuantizationHelper<T>::dequantize(a01, iq_info); + const float inp10 = Qasymm8QuantizationHelper<T>::dequantize(a10, iq_info); + const float inp11 = Qasymm8QuantizationHelper<T>::dequantize(a11, iq_info); + *reinterpret_cast<T *>(dst_i.ptr()) = Qasymm8QuantizationHelper<T>::quantize( + scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info); + }, + src_i, dst_i); + } + else if (border_mode == BorderMode::REPLICATE) + { + execute_window_loop( + window, + [&](const Coordinates &id) + { + const int index_h = std::floor((id[idx_height] + sampling_offset) * hr - sampling_offset); + const int32_t index_w = *(reinterpret_cast<const int32_t *>( + offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); + const auto dx_val = + *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); + const auto dy_val = + *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); + const auto pixel_row_ptr = reinterpret_cast<const T *>(src_i.ptr()); + + auto clamped_w = utility::clamp<int>(index_w, 0, in_dim_w - 1); + auto clamped_w1 = utility::clamp<int>(index_w + 1, 0, in_dim_w - 1); + auto clamped_h = utility::clamp<int>(index_h, 0, in_dim_h - 1); + auto clamped_h1 = utility::clamp<int>(index_h + 1, 0, in_dim_h - 1); + + const auto a00 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h * stride_h); + const auto a01 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h * stride_h); + const auto a10 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h1 * stride_h); + const auto a11 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h1 * stride_h); + + const float inp00 = Qasymm8QuantizationHelper<T>::dequantize(a00, iq_info); + const float inp01 = Qasymm8QuantizationHelper<T>::dequantize(a01, iq_info); + const float inp10 = Qasymm8QuantizationHelper<T>::dequantize(a10, iq_info); + const float inp11 = Qasymm8QuantizationHelper<T>::dequantize(a11, iq_info); + *reinterpret_cast<T *>(dst_i.ptr()) = Qasymm8QuantizationHelper<T>::quantize( + scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info); + }, + src_i, dst_i); + } + else + { + ARM_COMPUTE_ERROR("Not implemented"); + } +} + +/** function to perform scale using bilinear interpolation on the given window */ +template <typename T> +void scale_bilinear_nchw(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + InterpolationPolicy policy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) +{ + ARM_COMPUTE_UNUSED(policy); + arm_compute::cpu::scale_bilinear_nchw<T>(src, dst, dx, dy, offsets, border_mode, 
constant_border_value, + sampling_offset, align_corners, window); +} + +template <typename T> +void scale_nearest_nchw(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + InterpolationPolicy policy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) +{ + ARM_COMPUTE_UNUSED(policy, border_mode); + arm_compute::cpu::scale_nearest_nchw<T>(src, dst, dx, dy, offsets, constant_border_value, sampling_offset, + align_corners, window); +} + +#endif // ENABLE_NCHW_KERNELS + +namespace kernels +{ +namespace +{ +static const std::vector<CpuScaleKernel::ScaleKernel> available_kernels = { + {"sve_fp16_scale", + [](const ScaleKernelDataTypeISASelectorData &data) + { + return data.dt == DataType::F16 && data.isa.sve && data.isa.fp16 && + data.interpolation_policy != InterpolationPolicy::BILINEAR; + }, + REGISTER_FP16_SVE(arm_compute::cpu::fp16_sve_scale)}, + {"sve_fp32_scale", + [](const ScaleKernelDataTypeISASelectorData &data) + { return data.dt == DataType::F32 && data.isa.sve && data.interpolation_policy != InterpolationPolicy::BILINEAR; }, + REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_scale)}, + {"sve_qu8_scale", + [](const ScaleKernelDataTypeISASelectorData &data) { + return data.dt == DataType::QASYMM8 && data.isa.sve && + data.interpolation_policy != InterpolationPolicy::BILINEAR; + }, + REGISTER_QASYMM8_SVE(arm_compute::cpu::qasymm8_sve_scale)}, + {"sve_qs8_scale", + [](const ScaleKernelDataTypeISASelectorData &data) + { + return data.dt == DataType::QASYMM8_SIGNED && data.isa.sve && + data.interpolation_policy != InterpolationPolicy::BILINEAR; + }, + REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::qasymm8_signed_sve_scale)}, + {"sve_u8_scale", + [](const ScaleKernelDataTypeISASelectorData &data) + { return data.dt == DataType::U8 && data.isa.sve && data.interpolation_policy != InterpolationPolicy::BILINEAR; }, + REGISTER_INTEGER_SVE(arm_compute::cpu::u8_sve_scale)}, + {"sve_s16_scale", + [](const ScaleKernelDataTypeISASelectorData &data) + { return data.dt == DataType::S16 && data.isa.sve && data.interpolation_policy != InterpolationPolicy::BILINEAR; }, + REGISTER_INTEGER_SVE(arm_compute::cpu::s16_sve_scale)}, + {"neon_fp16_scale", + [](const ScaleKernelDataTypeISASelectorData &data) { return data.dt == DataType::F16 && data.isa.fp16; }, + REGISTER_FP16_NEON(arm_compute::cpu::fp16_common_neon_scale)}, + {"neon_fp32_scale", [](const ScaleKernelDataTypeISASelectorData &data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(arm_compute::cpu::common_neon_scale<float>)}, + {"neon_qu8_scale", [](const ScaleKernelDataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8; }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::qasymm8_neon_scale)}, + {"neon_qs8_scale", + [](const ScaleKernelDataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8_SIGNED; }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::qasymm8_signed_neon_scale)}, + {"neon_u8_scale", [](const ScaleKernelDataTypeISASelectorData &data) { return data.dt == DataType::U8; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::u8_neon_scale)}, + {"neon_s8_scale", [](const ScaleKernelDataTypeISASelectorData &data) { return data.dt == DataType::S8; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::s8_neon_scale)}, + {"neon_s16_scale", [](const ScaleKernelDataTypeISASelectorData &data) { return data.dt == DataType::S16; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::s16_neon_scale)}, +}; + +Status 
validate_arguments(const ITensorInfo *src, + const ITensorInfo *dx, + const ITensorInfo *dy, + const ITensorInfo *offsets, + ITensorInfo *dst, + const ScaleKernelInfo &info) +{ + const auto *uk = CpuScaleKernel::get_implementation( + ScaleKernelDataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa(), info.interpolation_policy}); + + ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON(dst == src); + ARM_COMPUTE_RETURN_ERROR_ON(src->num_channels() != 1); + ARM_COMPUTE_RETURN_ERROR_ON(info.sampling_policy != SamplingPolicy::CENTER && + info.sampling_policy != SamplingPolicy::TOP_LEFT); + ARM_COMPUTE_UNUSED(info.constant_border_value); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.use_padding, "Padding is not supported"); + + const DataLayout data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout; + const auto width_index = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const auto height_index = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const auto output_width = dst->dimension(width_index); + const auto output_height = dst->dimension(height_index); + ARM_COMPUTE_RETURN_ERROR_ON(output_width == 0); + ARM_COMPUTE_RETURN_ERROR_ON(output_height == 0); + + ARM_COMPUTE_RETURN_ERROR_ON((src->data_type() == DataType::S8) && + (data_layout != DataLayout::NHWC || + info.interpolation_policy != InterpolationPolicy::BILINEAR || + info.border_mode != BorderMode::REPLICATE)); + + if (info.interpolation_policy == InterpolationPolicy::NEAREST_NEIGHBOR && offsets != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(offsets, 1, DataType::S32); + } + + if (info.interpolation_policy == InterpolationPolicy::BILINEAR && offsets != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(offsets, 1, DataType::S32); + if (dx != nullptr && dy != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dx, 1, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dy, 1, DataType::F32); + } + } + + ARM_COMPUTE_RETURN_ERROR_ON(info.align_corners && + !scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy)); + + if (info.interpolation_policy == InterpolationPolicy::AREA) + { + ARM_COMPUTE_RETURN_ERROR_ON(data_layout != DataLayout::NCHW); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::U8); + } + + return Status{}; +} +} // namespace + +void CpuScaleKernel::configure(const ITensorInfo *src, + const ITensorInfo *dx, + const ITensorInfo *dy, + const ITensorInfo *offsets, + ITensorInfo *dst, + const ScaleKernelInfo &info) +{ + ARM_COMPUTE_UNUSED(dx, dy, offsets); + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + // Perform validation step + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dx, dy, offsets, dst, info)); + + const auto *uk = CpuScaleKernel::get_implementation( + ScaleKernelDataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa(), info.interpolation_policy}); + ARM_COMPUTE_ERROR_ON_NULLPTR(uk); + + _run_method = uk->ukernel; + _name = std::string("CpuScaleKernel") + .append("/") + .append(uk->name) + .append("_") + .append(string_from_interpolation_policy(info.interpolation_policy)); + + // Get data layout and width/height indices + _data_layout = info.data_layout == DataLayout::UNKNOWN ? 
src->data_layout() : info.data_layout; + const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); + + _policy = info.interpolation_policy; + _border_mode = info.border_mode; + _constant_border_value = info.constant_border_value; + _align_corners = info.align_corners; + + if (info.sampling_policy == SamplingPolicy::CENTER) + { + _sampling_offset = 0.5f; + } + + // Compute the ratio between source width/height and destination width/height + const auto wr = + scale_utils::calculate_resize_ratio(src->dimension(idx_width), dst->dimension(idx_width), _align_corners); + const auto hr = + scale_utils::calculate_resize_ratio(src->dimension(idx_height), dst->dimension(idx_height), _align_corners); + + // Area interpolation behaves as Nearest Neighbour in case of up-sampling + _policy = (_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) ? InterpolationPolicy::NEAREST_NEIGHBOR + : _policy; + + if (_border_mode == BorderMode::UNDEFINED) + { + _border_mode = BorderMode::CONSTANT; + _constant_border_value = PixelValue(); + } + +#ifdef ENABLE_NCHW_KERNELS + // Configure scale function to run + if (_data_layout == DataLayout::NCHW) + { + std::string function_to_call("scale_"); + function_to_call += string_from_data_type(src->data_type()) + "_"; + function_to_call += string_from_data_layout(_data_layout) + "_"; + function_to_call += string_from_interpolation_policy(_policy); + + const static std::map<std::string, ScaleKernelPtr> map_nchw_function = { + {"scale_U8_NCHW_AREA_CONSTANT", &arm_compute::cpu::scale_area_nchw_u8}, + {"scale_U8_NCHW_AREA_CONSTANT", &arm_compute::cpu::scale_area_nchw_u8}, + {"scale_U8_NCHW_BILINEAR", &arm_compute::cpu::scale_bilinear_nchw<uint8_t>}, + {"scale_U8_NCHW_NEAREST_NEIGHBOUR", &arm_compute::cpu::scale_nearest_nchw<uint8_t>}, + {"scale_QASYMM8_NCHW_BILINEAR", &arm_compute::cpu::scale_bilinear_qasymm_nchw<uint8_t>}, + {"scale_QASYMM8_NCHW_NEAREST_NEIGHBOUR", &arm_compute::cpu::scale_nearest_nchw<uint8_t>}, + {"scale_QASYMM8_SIGNED_NCHW_BILINEAR", &arm_compute::cpu::scale_bilinear_qasymm_nchw<int8_t>}, + {"scale_QASYMM8_SIGNED_NCHW_NEAREST_NEIGHBOUR", &arm_compute::cpu::scale_nearest_nchw<int8_t>}, + {"scale_S16_NCHW_BILINEAR", &arm_compute::cpu::scale_bilinear_nchw<int16_t>}, + {"scale_S16_NCHW_NEAREST_NEIGHBOUR", &arm_compute::cpu::scale_nearest_nchw<int16_t>}, + {"scale_F16_NCHW_BILINEAR", REGISTER_FP16_NEON(arm_compute::cpu::fp16_bilinear_neon_scale_nchw)}, + {"scale_F16_NCHW_NEAREST_NEIGHBOUR", REGISTER_FP16_NEON(arm_compute::cpu::fp16_nearest_neon_scale_nchw)}, + {"scale_F32_NCHW_BILINEAR", &arm_compute::cpu::scale_bilinear_nchw<float>}, + {"scale_F32_NCHW_NEAREST_NEIGHBOUR", &arm_compute::cpu::scale_nearest_nchw<float>}, + }; + auto it = map_nchw_function.find(function_to_call); + if (it != map_nchw_function.end()) + { + _nchw_func = it->second; + } + } +#endif // ENABLE_NCHW_KERNELS + + // Configure window + Window win = calculate_max_window(*dst, Steps()); + ICpuKernel::configure(win); +} + +Status CpuScaleKernel::validate(const ITensorInfo *input, + const ITensorInfo *dx, + const ITensorInfo *dy, + const ITensorInfo *offsets, + ITensorInfo *output, + const ScaleKernelInfo &info) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, dx, dy, offsets, output, info)); + return Status{}; +} + +void CpuScaleKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + 
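+ // Dispatch according to the data layout fixed at configure() time: the NCHW-specific paths run through
+ // _nchw_func, while NHWC uses the generic _run_method ukernel. Both take the auxiliary offsets/dx/dy
+ // tensors retrieved from the pack (ACL_INT_0..2) alongside the interpolation and border parameters.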
ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_nchw_func == nullptr && _data_layout == DataLayout::NCHW); + ARM_COMPUTE_ERROR_ON(_run_method == nullptr && _data_layout == DataLayout::NHWC); + + const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + const auto dx = tensors.get_const_tensor(TensorType::ACL_INT_0); + const auto dy = tensors.get_const_tensor(TensorType::ACL_INT_1); + const auto offsets = tensors.get_const_tensor(TensorType::ACL_INT_2); + + if (_data_layout == DataLayout::NCHW) + { + _nchw_func(src, dst, offsets, dx, dy, _policy, _border_mode, _constant_border_value, _sampling_offset, + _align_corners, window); + } + else + { + _run_method(src, dst, offsets, dx, dy, _policy, _border_mode, _constant_border_value, _sampling_offset, + _align_corners, window); + } +} + +const char *CpuScaleKernel::name() const +{ + return _name.c_str(); +} + +const std::vector<CpuScaleKernel::ScaleKernel> &CpuScaleKernel::get_available_kernels() +{ + return available_kernels; +} + +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuScaleKernel.h b/src/cpu/kernels/CpuScaleKernel.h new file mode 100644 index 0000000000..f2cad5e899 --- /dev/null +++ b/src/cpu/kernels/CpuScaleKernel.h @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2016-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ACL_SRC_CPU_KERNELS_CPUSCALEKERNEL_H +#define ACL_SRC_CPU_KERNELS_CPUSCALEKERNEL_H + +#include "arm_compute/core/KernelDescriptors.h" + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Arm(R) Neon(TM) kernel to perform scaling on a tensor */ +class CpuScaleKernel : public ICpuKernel<CpuScaleKernel> +{ +private: + /** Scale function to use for the particular function to use */ + using ScaleKernelPtr = std::add_pointer<void(const ITensor *, + ITensor *, + const ITensor *, + const ITensor *, + const ITensor *, + InterpolationPolicy, + BorderMode, + PixelValue, + float, + bool, + const Window &)>::type; + +public: + CpuScaleKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuScaleKernel); + /** Initialise the kernel's inputs, output and interpolation policy + * + * @note dx, dy and offsets have the same dimensions (width and height) of the output tensor + * @note Using @p policy Area only supports data layout NCHW and input data type U8. + * @note Using S8 data type only supports NHWC, @p border_mode Replicate, and @p policy Bilinear + * + * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/U8/S8/S16/F16/F32. + * @param[in] dx Distance x tensor info. Pixel's distance between the X real coordinate and the smallest X following integer. Data type supported: F32 + * @param[in] dy Distance y tensor info. Pixel's distance between the Y real coordinate and the smallest Y following integer. Data type supported: F32 + * @param[in] offsets Offset tensor info. Offset to access the pixel with NEAREST interpolation or the top-left pixel with BILINEAR interpolation in the input tensor. Data type supported: S32. + * @param[out] dst Destination tensor info. Data types supported: Same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane. 
+ * @param[in] info @ref ScaleKernelInfo to use for configuration + */ + void configure(const ITensorInfo *src, + const ITensorInfo *dx, + const ITensorInfo *dy, + const ITensorInfo *offsets, + ITensorInfo *dst, + const ScaleKernelInfo &info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuScaleKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, + const ITensorInfo *dx, + const ITensorInfo *dy, + const ITensorInfo *offsets, + ITensorInfo *dst, + const ScaleKernelInfo &info); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + + struct ScaleKernel + { + const char *name; + const ScaleKernelDataTypeISASelectorDataPtr is_selected; + ScaleKernelPtr ukernel; + }; + + static const std::vector<ScaleKernel> &get_available_kernels(); + +private: + ScaleKernelPtr _nchw_func{nullptr}; + InterpolationPolicy _policy{}; + BorderMode _border_mode{}; + PixelValue _constant_border_value{}; + float _sampling_offset{0}; + bool _align_corners{false}; + DataLayout _data_layout{DataLayout::UNKNOWN}; + ScaleKernelPtr _run_method{nullptr}; + std::string _name{}; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif // ACL_SRC_CPU_KERNELS_CPUSCALEKERNEL_H diff --git a/src/cpu/kernels/CpuSoftmaxKernel.cpp b/src/cpu/kernels/CpuSoftmaxKernel.cpp new file mode 100644 index 0000000000..b7e395fb79 --- /dev/null +++ b/src/cpu/kernels/CpuSoftmaxKernel.cpp @@ -0,0 +1,292 @@ +/* + * Copyright (c) 2017-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/cpu/kernels/CpuSoftmaxKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/Utils.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/cpu/kernels/softmax/list.h" + +#include <vector> + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ + +/* Softmax */ +static const std::vector<typename CpuSoftmaxKernel::SoftmaxKernel> available_kernels = { + {"sme2_fp32_softmax", + [](const SoftmaxKernelDataTypeISASelectorData &data) + { return (!data.is_log && data.dt == DataType::F32 && data.isa.sme2 && data.axis == 0); }, + REGISTER_FP32_SME2(sme2_fp32_softmax)}, + {"neon_fp32_softmax", + [](const SoftmaxKernelDataTypeISASelectorData &data) { return (!data.is_log && data.dt == DataType::F32); }, + REGISTER_FP32_NEON(neon_fp32_softmax<false>)}, + {"sme2_fp16_softmax", + [](const SoftmaxKernelDataTypeISASelectorData &data) + { return (!data.is_log && data.dt == DataType::F16 && data.isa.sme2 && data.axis == 0); }, + REGISTER_FP16_SME2(sme2_fp16_softmax)}, + {"neon_fp16_softmax", + [](const SoftmaxKernelDataTypeISASelectorData &data) + { return (!data.is_log && data.dt == DataType::F16) && data.isa.fp16; }, + REGISTER_FP16_NEON(neon_fp16_softmax<false>)}, + {"sme2_qu8_softmax_lut_512VL", + [](const SoftmaxKernelDataTypeISASelectorData &data) + { + return (!data.is_log && data.dt == DataType::QASYMM8 && data.isa.sme2 && data.axis == 0 && + data.sme2_vector_length == 512); + }, + REGISTER_QASYMM8_SME2(sme2_qasymm8_softmax_lut_512VL)}, + {"neon_qu8_softmax", + [](const SoftmaxKernelDataTypeISASelectorData &data) { return (!data.is_log && data.dt == DataType::QASYMM8); }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_softmax<false>)}, + {"sme2_qs8_softmax_lut_512VL", + [](const SoftmaxKernelDataTypeISASelectorData &data) + { + return (!data.is_log && data.dt == DataType::QASYMM8_SIGNED && data.isa.sme2 && data.axis == 0 && + data.sme2_vector_length == 512); + }, + REGISTER_QASYMM8_SIGNED_SME2(sme2_qasymm8_signed_softmax_lut_512VL)}, + {"neon_qs8_softmax", + [](const SoftmaxKernelDataTypeISASelectorData &data) + { return (!data.is_log && data.dt == DataType::QASYMM8_SIGNED); }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_softmax<false>)}, + {"neon_fp32_log_softmax", + [](const SoftmaxKernelDataTypeISASelectorData &data) { return (data.is_log && data.dt == DataType::F32); }, + REGISTER_FP32_NEON(neon_fp32_softmax<true>)}, + {"neon_fp16_log_softmax", + [](const SoftmaxKernelDataTypeISASelectorData &data) + { return (data.is_log && data.dt == DataType::F16) && data.isa.fp16; }, + REGISTER_FP16_NEON(neon_fp16_softmax<true>)}, + {"neon_qu8_log_softmax", + [](const SoftmaxKernelDataTypeISASelectorData &data) { return (data.is_log && data.dt == DataType::QASYMM8); }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_softmax<true>)}, + {"neon_qs8_log_softmax", + [](const SoftmaxKernelDataTypeISASelectorData &data) + { return (data.is_log && data.dt == DataType::QASYMM8_SIGNED); }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_softmax<true>)}, +}; + +void init_lut(std::vector<float> &lut, DataType type, float scale, 
float beta) +{ + if (type == DataType::QASYMM8) + { + for (int i = 0; i < 256; ++i) + { + lut.push_back(std::exp(-scale * beta * i)); + } + } + else if (type == DataType::QASYMM8_SIGNED) + { + for (int i = -128; i < 128; ++i) + { + lut.push_back(std::exp(-scale * beta * i)); + } + } + else + { + ARM_COMPUTE_ERROR("Invalid datatype for QASYMM8/QASYMM8_SIGNED softmax"); + } +} + +Status validate_arguments_softmax( + const ITensorInfo &src, const ITensorInfo &dst, float beta, int axis, const ITensorInfo &tmp, bool is_log) +{ + ARM_COMPUTE_UNUSED(beta); + // Check input + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); + + ARM_COMPUTE_RETURN_ERROR_ON(axis < 0 || axis > 3); + + const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src.data_type()); + + // Check output if configured + if (dst.total_size() != 0) + { + const QuantizationInfo output_quantization = + is_quantized_asymmetric ? arm_compute::get_softmax_output_quantization_info(src.data_type(), is_log) + : dst.quantization_info(); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &dst); + ARM_COMPUTE_RETURN_ERROR_ON(dst.quantization_info() != output_quantization); + } + + // Check tmp if configured + if (tmp.total_size() != 0) + { + // We have temporary storage only if src data type is quantized. + // Therefore, tmp data type must be F32 + ARM_COMPUTE_RETURN_ERROR_ON(tmp.data_type() != DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(!is_quantized_asymmetric); + + // We could potentially reduce tmp memory if we could predict or make an assumption + // on the maximum number of threads that will run in parallel. + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &tmp); + } + + return Status{}; +} +} // namespace + +const std::vector<typename CpuSoftmaxKernel::SoftmaxKernel> &CpuSoftmaxKernel::get_available_kernels() +{ + return available_kernels; +} + +void CpuSoftmaxKernel::configure( + const ITensorInfo *src, ITensorInfo *dst, float beta, bool is_log, int axis, ITensorInfo *tmp) +{ + _axis = axis; + + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst, tmp); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_softmax(*src, *dst, beta, axis, *tmp, is_log)); + + // Configure kernel window + const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src->data_type()); + + // Output auto initialization if not yet initialized + const QuantizationInfo output_quantization = + is_quantized_asymmetric ? arm_compute::get_softmax_output_quantization_info(src->data_type(), is_log) + : dst->quantization_info(); + auto_init_if_empty(*dst, TensorInfo(*src).set_quantization_info(output_quantization).reset_padding()); + + // Tmp auto initialization if not yet initialized and src is quantized + if (is_quantized_asymmetric) + { + auto_init_if_empty(*tmp, TensorInfo(*src).set_data_type(DataType::F32).reset_padding()); + } + + const auto *uk = CpuSoftmaxKernel::get_implementation(SoftmaxKernelDataTypeISASelectorData{ + src->data_type(), CPUInfo::get().get_isa(), is_log, axis, CPUInfo::get().get_sme2_vector_length()}); + ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); + + std::string kernel_name = is_log ? 
std::string("CpuLogSoftmaxKernel") : std::string("CpuSoftmaxKernel"); + + _beta = beta; + _run_method = uk->ukernel; + _name = kernel_name.append("/").append(uk->name); + + Window win; + + int vec_size = 16 / dst->element_size(); + + if (_axis == 0) + { + win = calculate_max_window(*dst, Steps()); + + /// TODO:Check dimensions > 0 for holes only. For this, we need + /// a utility function checking if there are holes after some dimension. + if (!has_holes(*dst, dst->num_dimensions() - 1)) + { + win = win.collapse(win, Window::DimY); + } + } + else if (_axis > 0 && _axis <= 3) + { + win = calculate_max_window(*dst, Steps(vec_size)); + } + else + { + ARM_COMPUTE_ERROR("Invalid axis"); + } + + win.set(_axis, Window::Dimension(0, 1, 1)); + + ICpuKernel<CpuSoftmaxKernel>::configure(win); + + const std::string uk_name = uk->name; + if (uk_name == "sme2_qu8_softmax_lut_512VL" || uk_name == "sme2_qs8_softmax_lut_512VL") + { + const float scale = src->quantization_info().uniform().scale; + init_lut(_lut, src->data_type(), scale, beta); + } +} + +Status CpuSoftmaxKernel::validate( + const ITensorInfo *src, const ITensorInfo *dst, float beta, int axis, bool is_log, const ITensorInfo *tmp) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst, tmp); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_softmax(*src, *dst, beta, axis, *tmp, is_log)); + + return Status{}; +} + +void CpuSoftmaxKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel<CpuSoftmaxKernel>::window(), window); + ARM_COMPUTE_ERROR_ON(_run_method == nullptr); + + const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0); + auto dst = tensors.get_tensor(TensorType::ACL_DST_0); + + if (is_data_type_quantized_asymmetric(src->info()->data_type())) + { + auto tmp = tensors.get_tensor(TensorType::ACL_DST_1); + unsigned int num_elems_processed_per_iteration; + if (_axis == 0) + { + num_elems_processed_per_iteration = src->info()->valid_region().shape[_axis]; + } + else + { + //16 QASYMM8/QASYMM8_SIGNED elements can fit into the 16-byte vectors. + num_elems_processed_per_iteration = 16; + } + const unsigned int tmp_size_for_thread = tmp->info()->element_size() * num_elems_processed_per_iteration; + + void *tmp_for_thread = tmp->buffer() + (info.thread_id * tmp_size_for_thread); + _run_method(src, tmp_for_thread, dst, _beta, _axis, window, _lut.data()); + } + else + { + _run_method(src, nullptr, dst, _beta, _axis, window, nullptr); + } +} + +const char *CpuSoftmaxKernel::name() const +{ + return _name.c_str(); +} + +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuSoftmaxKernel.h b/src/cpu/kernels/CpuSoftmaxKernel.h new file mode 100644 index 0000000000..676e79782b --- /dev/null +++ b/src/cpu/kernels/CpuSoftmaxKernel.h @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2017-2024 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_CPU_KERNELS_CPUSOFTMAXKERNEL_H +#define ACL_SRC_CPU_KERNELS_CPUSOFTMAXKERNEL_H + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Interface for softmax computation */ +class CpuSoftmaxKernel : public ICpuKernel<CpuSoftmaxKernel> +{ +private: + using SoftmaxKernelPtr = std::add_pointer<void( + const ITensor *, void *const, ITensor *, float, int, const Window &, const float *)>::type; + +public: + CpuSoftmaxKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuSoftmaxKernel); + + /** Set the input and output tensors. + * + * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[out] dst Destination tensor info. Data types supported: same as @p input. + * @param[in] beta A scaling factor for the exponent. + * @param[in] is_log True if the operation is log-softmax. + * @param[in] axis The axis along which to perform the softmax operation. + * + * @param tmp Auxiliary tensor info. Must be type F32 and same shape as the input. + */ + void configure(const ITensorInfo *src, ITensorInfo *dst, float beta, bool is_log, int axis, ITensorInfo *tmp); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuSoftmaxKernel::configure() + * + * @return a status + */ + static Status + validate(const ITensorInfo *src, const ITensorInfo *dst, float beta, int axis, bool is_log, const ITensorInfo *tmp); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + + struct SoftmaxKernel + { + const char *name; + const SoftmaxKernelDataTypeISASelectorDataPtr is_selected; + SoftmaxKernelPtr ukernel; + }; + + static const std::vector<SoftmaxKernel> &get_available_kernels(); + +private: + float _beta{1.0f}; + SoftmaxKernelPtr _run_method{nullptr}; + std::string _name{}; + int _axis{}; + std::vector<float> _lut = {}; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif // ACL_SRC_CPU_KERNELS_CPUSOFTMAXKERNEL_H diff --git a/src/cpu/kernels/CpuSubKernel.cpp b/src/cpu/kernels/CpuSubKernel.cpp new file mode 100644 index 0000000000..c8706ff651 --- /dev/null +++ b/src/cpu/kernels/CpuSubKernel.cpp @@ -0,0 +1,222 @@ +/* + * Copyright (c) 2021-2023 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuSubKernel.h" + +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" + +#include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/cpu/kernels/add/generic/neon/impl.h" +#include "src/cpu/kernels/sub/neon/impl.h" +#include "src/cpu/kernels/sub/neon/list.h" + +#if defined(ENABLE_FP32_KERNELS) +namespace +{ +static constexpr size_t default_mws_N1_fp32_neon = 24385; +static constexpr size_t default_mws_V1_fp32_neon = 40520; +} // namespace +#endif /* ENABLE_FP32_KERNELS */ + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +using CpuSubKernelDataTypeISASelectorData = CpuAddKernelDataTypeISASelectorData; +using CpuSubKernelDataTypeISASelectorDataPtr = CpuAddKernelDataTypeISASelectorDataPtr; + +static const std::vector<CpuSubKernel::SubKernel> available_kernels = { + {"neon_fp32_sub", [](const CpuSubKernelDataTypeISASelectorData &data) { return (data.dt == DataType::F32); }, + REGISTER_FP32_NEON(arm_compute::cpu::sub_same_neon<float>)}, + {"neon_fp16_sub", + [](const CpuSubKernelDataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.fp16; }, + REGISTER_FP16_NEON(arm_compute::cpu::sub_same_neon_fp16)}, + {"neon_u8_sub", [](const CpuSubKernelDataTypeISASelectorData &data) { return (data.dt == DataType::U8); }, + REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon<uint8_t>)}, + {"neon_s16_sub", [](const CpuSubKernelDataTypeISASelectorData &data) { return (data.dt == DataType::S16); }, + REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon<int16_t>)}, + {"neon_s32_sub", [](const CpuSubKernelDataTypeISASelectorData &data) { return (data.dt == DataType::S32); }, + REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon<int32_t>)}, + {"neon_qu8_sub_fixedpoint", + [](const CpuSubKernelDataTypeISASelectorData &data) + { return ((data.dt == DataType::QASYMM8) && data.can_use_fixedpoint); }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::sub_qasymm8_neon_fixedpoint)}, + {"neon_qs8_sub_fixedpoint", + [](const CpuSubKernelDataTypeISASelectorData &data) + { return ((data.dt == DataType::QASYMM8_SIGNED) && data.can_use_fixedpoint); }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::sub_qasymm8_signed_neon_fixedpoint)}, + {"neon_qu8_sub", [](const 
CpuSubKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8); }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::sub_qasymm8_neon)}, + {"neon_qs8_sub", + [](const CpuSubKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED); }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::sub_qasymm8_signed_neon)}, + {"neon_qs16_sub", [](const CpuSubKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QSYMM16); }, + REGISTER_QSYMM16_NEON(arm_compute::cpu::sub_qsymm16_neon)}, +}; + +inline Status +validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst, ConvertPolicy policy) +{ + ARM_COMPUTE_UNUSED(policy); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src0); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::U8, DataType::QASYMM8, + DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::S16, + DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &src1); + + const auto can_use_fixedpoint = sub_q8_neon_fixedpoint_possible(&src0, &src1, &dst); + const auto uk = CpuSubKernel::get_implementation<CpuSubKernelDataTypeISASelectorData>( + CpuSubKernelDataTypeISASelectorData{src0.data_type(), CPUInfo::get().get_isa(), can_use_fixedpoint}); + + ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); + + const TensorShape out_shape = TensorShape::broadcast_shape(src0.tensor_shape(), src1.tensor_shape()); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(src0.data_type()) && (policy == ConvertPolicy::WRAP), + "Convert policy cannot be WRAP if datatype is quantized"); + + // Validate in case of configured dst + if (dst.total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &dst); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0), + "Wrong shape for dst"); + } + return Status{}; +} +} // namespace + +void CpuSubKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst, policy)); + + const TensorShape &out_shape = TensorShape::broadcast_shape(src0->tensor_shape(), src1->tensor_shape()); + + // Auto initialize dst if not initialized + set_shape_if_empty(*dst, out_shape); + set_data_type_if_unknown(*dst, src0->data_type()); + + const auto can_use_fixedpoint = sub_q8_neon_fixedpoint_possible(src0, src1, dst); + const auto uk = CpuSubKernel::get_implementation<CpuSubKernelDataTypeISASelectorData>( + CpuSubKernelDataTypeISASelectorData{src0->data_type(), CPUInfo::get().get_isa(), can_use_fixedpoint}); + + ARM_COMPUTE_ERROR_ON_NULLPTR(uk); + + _policy = policy; + _run_method = uk->ukernel; + _name = std::string("CpuSubKernel").append("/").append(uk->name); + + // CpuSubKernel doesn't need padding so update_window_and_padding() can be skipped + Window win; + std::tie(win, _split_dimension) = calculate_squashed_or_max_window(*src0, *src1); + + ICpuKernel::configure(win); +} + +size_t CpuSubKernel::get_mws(const CPUInfo &platform, size_t thread_count) const +{ + ARM_COMPUTE_UNUSED(thread_count); + +#if defined(ENABLE_FP32_KERNELS) + if (this->_run_method == &sub_same_neon<float>) + { + size_t mws = ICPPKernel::default_mws; + if (platform.get_cpu_model() == CPUModel::N1) + { + mws = 
default_mws_N1_fp32_neon; + } + else if (platform.get_cpu_model() == CPUModel::V1) + { + mws = default_mws_V1_fp32_neon; + } + else + { + return ICPPKernel::default_mws; + } + + // tensor is 1D or was re-interpreted as 1D + if (this->window().shape().num_dimensions() == 1) + { + return mws; + } + else + { + // scale mws down by the number of elements along all the dimensions (x, z, w, etc) except the one + // that we parallelize along (the y dimension). This allows for parallelization when the Y_SIZE is small + // but the other sizes are large, which boosts performance. + mws = static_cast<size_t>(mws / (this->window().num_iterations_total() / this->window().num_iterations(1))); + return std::max(static_cast<size_t>(1), mws); + } + } +#else /* ENABLE_FP32_KERNELS */ + ARM_COMPUTE_UNUSED(platform); +#endif /* ENABLE_FP32_KERNELS */ + return ICPPKernel::default_mws; +} + +Status +CpuSubKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*src0, *src1, *dst, policy)); + + return Status{}; +} + +void CpuSubKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_run_method == nullptr); + + const ITensor *src0 = tensors.get_const_tensor(TensorType::ACL_SRC_0); + const ITensor *src1 = tensors.get_const_tensor(TensorType::ACL_SRC_1); + ITensor *dst = tensors.get_tensor(TensorType::ACL_DST); + + _run_method(src0, src1, dst, _policy, window); +} + +const char *CpuSubKernel::name() const +{ + return _name.c_str(); +} + +const std::vector<CpuSubKernel::SubKernel> &CpuSubKernel::get_available_kernels() +{ + return available_kernels; +} + +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuSubKernel.h b/src/cpu/kernels/CpuSubKernel.h new file mode 100644 index 0000000000..5fa0dc411a --- /dev/null +++ b/src/cpu/kernels/CpuSubKernel.h @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2016-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ARM_COMPUTE_CPU_SUB_KERNEL_H +#define ARM_COMPUTE_CPU_SUB_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Interface for the kernel to perform subtraction between two tensors */ +class CpuSubKernel : public ICpuKernel<CpuSubKernel> +{ +private: + using SubKernelPtr = std::add_pointer<void( + const ITensor *, const ITensor *, ITensor *, const ConvertPolicy &, const Window &)>::type; + using CpuSubKernelDataTypeISASelectorDataPtr = CpuAddKernelDataTypeISASelectorDataPtr; + +public: + CpuSubKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuSubKernel); + + /** Initialise the kernel's src and dst. + * + * Valid configurations (src0,src1) -> dst : + * + * - (U8,U8) -> U8 + * - (QASYMM8, QASYMM8) -> QASYMM8 + * - (QASYMM8_SIGNED, QASYMM8_SIGNED) -> QASYMM8_SIGNED + * - (S16,S16) -> S16 + * - (S32,S32) -> S32 + * - (F16,F16) -> F16 + * - (F32,F32) -> F32 + * + * @param[in] src0 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32 + * @param[in] src1 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32 + * @param[out] dst The dst tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32. + * @param[in] policy Overflow policy. Convert policy cannot be WRAP if datatype is quantized. + */ + void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuSubKernel::configure() + * + * @return a status + */ + static Status + validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + + /** Return minimum workload size of the relevant kernel + * + * @param[in] platform The CPU platform used to create the context. + * @param[in] thread_count Number of threads in the execution. + * + * @return[out] mws Minimum workload size for requested configuration. + */ + size_t get_mws(const CPUInfo &platform, size_t thread_count) const override; + + struct SubKernel + { + const char *name; + const CpuSubKernelDataTypeISASelectorDataPtr is_selected; + SubKernelPtr ukernel; + }; + + static const std::vector<SubKernel> &get_available_kernels(); + + size_t get_split_dimension() const + { + return _split_dimension; + } + +private: + ConvertPolicy _policy{}; + SubKernelPtr _run_method{nullptr}; + std::string _name{}; + size_t _split_dimension{Window::DimY}; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_SUB_KERNEL_H */ diff --git a/src/cpu/kernels/CpuTransposeKernel.cpp b/src/cpu/kernels/CpuTransposeKernel.cpp new file mode 100644 index 0000000000..0f762ba041 --- /dev/null +++ b/src/cpu/kernels/CpuTransposeKernel.cpp @@ -0,0 +1,819 @@ +/* + * Copyright (c) 2021, 2023 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuTransposeKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" + +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +#include <arm_neon.h> + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +unsigned int num_elems_processed(size_t element_size) +{ + switch (element_size) + { + case 1: + return 8; + case 2: + return 4; + case 4: +#ifdef __aarch64__ + return 8; +#else // __aarch64__ + return 4; +#endif // __aarch64__ + default: + break; + } + + ARM_COMPUTE_ERROR("Element size not supported"); +} + +void transpose_8bit_elements(const ITensor *in, ITensor *out, const Window &window) +{ + const int window_step_x = 8; + const int window_step_y = 8; + const int window_start_x = window.x().start(); + const int window_end_x = window.x().end(); + const int window_start_y = window.y().start(); + const int window_end_y = std::min(window.y().end(), static_cast<int>(in->info()->dimension(1))); + const int window_end_y_multiple_of = ((window_end_y - window_start_y) / window_step_y) * window_step_y; + const size_t input_stride_in_bytes = in->info()->strides_in_bytes()[1]; + const size_t output_stride_in_bytes = out->info()->strides_in_bytes()[1]; + + // Check if we need a left-over loop for the y dimension + bool left_over_loop_y = (((window_end_y - window_start_y) % window_step_y) != 0); + + Window window_in(window); + window_in.set(Window::DimX, Window::Dimension(0, 1, 1)); + if (left_over_loop_y) + { + // Check if window_end_y_multiple_of is greater than window_start_y + if (window_end_y_multiple_of > window_start_y) + { + window_in.set(Window::DimY, Window::Dimension(window_start_y, window_end_y_multiple_of, window_step_y)); + } + else + { + window_in.set(Window::DimY, Window::Dimension(0, 0, 1)); + } + } + + Window window_out(window); + window_out.set(Window::DimX, Window::Dimension(0, 0, 0)); + window_out.set(Window::DimY, Window::Dimension(0, 0, 0)); + + Iterator output(out, window_out); + + // Run the SIMD path if and only if the input is not a row-vector + if (in->info()->dimension(1) != 1) + { + Iterator input(in, window_in); + 
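// Each iteration of the loop below transposes one 8x8 tile of uint8_t elements with vtrn; columns that do not fill a full tile are handled by the left-over loop further down. +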
execute_window_loop( + window_in, + [&](const Coordinates &id) + { + // Compute 8x8 elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x8_t row0 = + vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 0 * input_stride_in_bytes)); + const uint8x8_t row1 = + vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 1 * input_stride_in_bytes)); + const uint8x8_t row2 = + vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 2 * input_stride_in_bytes)); + const uint8x8_t row3 = + vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 3 * input_stride_in_bytes)); + const uint8x8_t row4 = + vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 4 * input_stride_in_bytes)); + const uint8x8_t row5 = + vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 5 * input_stride_in_bytes)); + const uint8x8_t row6 = + vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 6 * input_stride_in_bytes)); + const uint8x8_t row7 = + vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 7 * input_stride_in_bytes)); + + // Transpose 2x2 + const uint8x8x2_t k0_u8 = vtrn_u8(row0, row1); + const uint8x8x2_t k1_u8 = vtrn_u8(row2, row3); + const uint8x8x2_t k2_u8 = vtrn_u8(row4, row5); + const uint8x8x2_t k3_u8 = vtrn_u8(row6, row7); + + // Transpose 4x4 + const uint16x4x2_t k0_u16 = + vtrn_u16(vreinterpret_u16_u8(k0_u8.val[0]), vreinterpret_u16_u8(k1_u8.val[0])); + const uint16x4x2_t k1_u16 = + vtrn_u16(vreinterpret_u16_u8(k0_u8.val[1]), vreinterpret_u16_u8(k1_u8.val[1])); + const uint16x4x2_t k2_u16 = + vtrn_u16(vreinterpret_u16_u8(k2_u8.val[0]), vreinterpret_u16_u8(k3_u8.val[0])); + const uint16x4x2_t k3_u16 = + vtrn_u16(vreinterpret_u16_u8(k2_u8.val[1]), vreinterpret_u16_u8(k3_u8.val[1])); + + // Transpose 8x8 + const uint32x2x2_t k0_u32 = + vtrn_u32(vreinterpret_u32_u16(k0_u16.val[0]), vreinterpret_u32_u16(k2_u16.val[0])); + const uint32x2x2_t k1_u32 = + vtrn_u32(vreinterpret_u32_u16(k0_u16.val[1]), vreinterpret_u32_u16(k2_u16.val[1])); + const uint32x2x2_t k2_u32 = + vtrn_u32(vreinterpret_u32_u16(k1_u16.val[0]), vreinterpret_u32_u16(k3_u16.val[0])); + const uint32x2x2_t k3_u32 = + vtrn_u32(vreinterpret_u32_u16(k1_u16.val[1]), vreinterpret_u32_u16(k3_u16.val[1])); + + // Compute destination address + const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + x * output_stride_in_bytes; + + vst1_u8( + reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), + vreinterpret_u8_u16(vreinterpret_u16_u32(k0_u32.val[0]))); + vst1_u8( + reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), + vreinterpret_u8_u16(vreinterpret_u16_u32(k2_u32.val[0]))); + vst1_u8( + reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), + vreinterpret_u8_u16(vreinterpret_u16_u32(k1_u32.val[0]))); + vst1_u8( + reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), + vreinterpret_u8_u16(vreinterpret_u16_u32(k3_u32.val[0]))); + vst1_u8( + reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 4 * output_stride_in_bytes), + vreinterpret_u8_u16(vreinterpret_u16_u32(k0_u32.val[1]))); + vst1_u8( + reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 5 * output_stride_in_bytes), + vreinterpret_u8_u16(vreinterpret_u16_u32(k2_u32.val[1]))); + vst1_u8( + reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 6 * output_stride_in_bytes), + 
vreinterpret_u8_u16(vreinterpret_u16_u32(k1_u32.val[1]))); + vst1_u8( + reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 7 * output_stride_in_bytes), + vreinterpret_u8_u16(vreinterpret_u16_u32(k3_u32.val[1]))); + } + + // Compute left-over elements along the x dimension (1x8) + for (; x < window_end_x; ++x) + { + const uint8_t val0 = *(input.ptr() + x + 0 * input_stride_in_bytes); + const uint8_t val1 = *(input.ptr() + x + 1 * input_stride_in_bytes); + const uint8_t val2 = *(input.ptr() + x + 2 * input_stride_in_bytes); + const uint8_t val3 = *(input.ptr() + x + 3 * input_stride_in_bytes); + const uint8_t val4 = *(input.ptr() + x + 4 * input_stride_in_bytes); + const uint8_t val5 = *(input.ptr() + x + 5 * input_stride_in_bytes); + const uint8_t val6 = *(input.ptr() + x + 6 * input_stride_in_bytes); + const uint8_t val7 = *(input.ptr() + x + 7 * input_stride_in_bytes); + + uint8x8_t result = vdup_n_u8(0); + result = vset_lane_u8(val0, result, 0); + result = vset_lane_u8(val1, result, 1); + result = vset_lane_u8(val2, result, 2); + result = vset_lane_u8(val3, result, 3); + result = vset_lane_u8(val4, result, 4); + result = vset_lane_u8(val5, result, 5); + result = vset_lane_u8(val6, result, 6); + result = vset_lane_u8(val7, result, 7); + + // Compute destination address + const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + x * output_stride_in_bytes; + + vst1_u8(output.ptr() + dst_offset_in_bytes, result); + } + }, + input, output); + } + + if (left_over_loop_y) + { + window_in.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), 1)); + window_in.set(Window::DimY, Window::Dimension(window_end_y_multiple_of, window_end_y, 1)); + + Iterator input(in, window_in); + Iterator output(out, window_out); + + // Compute left-over elements along the y dimension (1x1) + execute_window_loop( + window_in, + [&](const Coordinates &id) + { + const uint8_t val0 = *input.ptr(); + + // Compute destination address + const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + id.x() * output_stride_in_bytes; + + *(output.ptr() + dst_offset_in_bytes) = val0; + }, + input, output); + } +} + +void transpose_16bit_elements(const ITensor *in, ITensor *out, const Window &window) +{ + const int window_step_x = 4; + const int window_step_y = 4; + const int window_start_x = window.x().start(); + const int window_end_x = window.x().end(); + const int window_start_y = window.y().start(); + const int window_end_y = std::min(window.y().end(), static_cast<int>(in->info()->dimension(1))); + const int window_end_y_multiple_of = ((window_end_y - window_start_y) / window_step_y) * window_step_y; + const size_t input_stride_in_bytes = in->info()->strides_in_bytes()[1]; + const size_t output_stride_in_bytes = out->info()->strides_in_bytes()[1]; + + // Check if we need a left-over loop for the y dimension + bool left_over_loop_y = (((window_end_y - window_start_y) % window_step_y) != 0); + + Window window_in(window); + window_in.set(Window::DimX, Window::Dimension(0, 1, 1)); + if (left_over_loop_y) + { + // Check if window_end_y_multiple_of is greater than window_start_y + if (window_end_y_multiple_of > window_start_y) + { + window_in.set(Window::DimY, Window::Dimension(window_start_y, window_end_y_multiple_of, window_step_y)); + } + else + { + window_in.set(Window::DimY, Window::Dimension(0, 0, 1)); + } + } + + Window window_out(window); + window_out.set(Window::DimX, Window::Dimension(0, 0, 0)); + window_out.set(Window::DimY, Window::Dimension(0, 0, 0)); + + Iterator output(out, 
window_out); + + // Run the SIMD path if and only if the input is not a row-vector + if (in->info()->dimension(1) != 1) + { + Iterator input(in, window_in); + execute_window_loop( + window_in, + [&](const Coordinates &id) + { + // Compute 4x4 elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint16x4_t row0 = + vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 0 * input_stride_in_bytes) + x); + const uint16x4_t row1 = + vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 1 * input_stride_in_bytes) + x); + const uint16x4_t row2 = + vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 2 * input_stride_in_bytes) + x); + const uint16x4_t row3 = + vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 3 * input_stride_in_bytes) + x); + + // Transpose 2x2 + const uint16x4x2_t k0_u16 = vtrn_u16(row0, row1); + const uint16x4x2_t k1_u16 = vtrn_u16(row2, row3); + + // Transpose 4x4 + const uint32x2x2_t k0_u32 = + vtrn_u32(vreinterpret_u32_u16(k0_u16.val[0]), vreinterpret_u32_u16(k1_u16.val[0])); + const uint32x2x2_t k1_u32 = + vtrn_u32(vreinterpret_u32_u16(k0_u16.val[1]), vreinterpret_u32_u16(k1_u16.val[1])); + + // Compute destination address + const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + x * output_stride_in_bytes; + + vst1_u16( + reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), + vreinterpret_u16_u32(k0_u32.val[0])); + vst1_u16( + reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), + vreinterpret_u16_u32(k1_u32.val[0])); + vst1_u16( + reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), + vreinterpret_u16_u32(k0_u32.val[1])); + vst1_u16( + reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), + vreinterpret_u16_u32(k1_u32.val[1])); + } + + // Compute left-over elements (1x4) + for (; x < window_end_x; ++x) + { + const uint16_t val0 = *(reinterpret_cast<uint16_t *>(input.ptr() + 0 * input_stride_in_bytes) + x); + const uint16_t val1 = *(reinterpret_cast<uint16_t *>(input.ptr() + 1 * input_stride_in_bytes) + x); + const uint16_t val2 = *(reinterpret_cast<uint16_t *>(input.ptr() + 2 * input_stride_in_bytes) + x); + const uint16_t val3 = *(reinterpret_cast<uint16_t *>(input.ptr() + 3 * input_stride_in_bytes) + x); + + uint16x4_t result = vdup_n_u16(0); + result = vset_lane_u16(val0, result, 0); + result = vset_lane_u16(val1, result, 1); + result = vset_lane_u16(val2, result, 2); + result = vset_lane_u16(val3, result, 3); + + // Compute destination address + const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + x * output_stride_in_bytes; + + vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes), result); + } + }, + input, output); + } + + if (left_over_loop_y) + { + window_in.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), 1)); + window_in.set(Window::DimY, Window::Dimension(window_end_y_multiple_of, window_end_y, 1)); + + Iterator input(in, window_in); + Iterator output(out, window_out); + + // Compute left-over elements along the y dimension (1x1) + execute_window_loop( + window_in, + [&](const Coordinates &id) + { + const uint16_t val0 = *(reinterpret_cast<uint16_t *>(input.ptr())); + + // Compute destination address + const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + id.x() * output_stride_in_bytes; + + *(reinterpret_cast<uint16_t *>(output.ptr() 
+ dst_offset_in_bytes)) = val0; + }, + input, output); + } +} + +#ifdef __aarch64__ +inline uint32x4x2_t vld1q_u32_x2_(const uint32_t *ptr) +{ + // gcc-7 doesn't support vld1q_u32_x2 instruction + return {vld1q_u32(ptr), vld1q_u32(ptr + 4)}; +} + +inline void vst1q_u32_x2_(const uint32_t *ptr, const uint32x4x2_t &val) +{ + // gcc-7 doesn't support vst1q_u32_x2 instruction + vst1q_u32(const_cast<uint32_t *>(ptr), val.val[0]); + vst1q_u32(const_cast<uint32_t *>(ptr + 4), val.val[1]); +} + +void transpose_32bit_elements(const ITensor *in, ITensor *out, const Window &window) +{ + constexpr int window_step_x = 8; + constexpr int window_step_y = 8; + const int window_start_x = window.x().start(); + const int window_end_x = window.x().end(); + const int window_start_y = window.y().start(); + const int window_end_y = std::min(window.y().end(), static_cast<int>(in->info()->dimension(1))); + const int window_end_y_multiple_of = ((window_end_y - window_start_y) / window_step_y) * window_step_y; + const size_t input_stride_in_bytes = in->info()->strides_in_bytes()[1]; + const size_t output_stride_in_bytes = out->info()->strides_in_bytes()[1]; + + // Check if we need a left-over loop for the y dimension + bool left_over_loop_y = (((window_end_y - window_start_y) % window_step_y) != 0); + + Window window_in(window); + window_in.set(Window::DimX, Window::Dimension(0, 1, 1)); + if (left_over_loop_y) + { + // Check if window_end_y_multiple_of is greater than window_start_y + if (window_end_y_multiple_of > window_start_y) + { + window_in.set(Window::DimY, Window::Dimension(window_start_y, window_end_y_multiple_of, window_step_y)); + } + else + { + window_in.set(Window::DimY, Window::Dimension(0, 0, 1)); + } + } + + Window window_out(window); + window_out.set(Window::DimX, Window::Dimension(0, 0, 0)); + window_out.set(Window::DimY, Window::Dimension(0, 0, 0)); + + Iterator output(out, window_out); + + // Run the SIMD path if and only if the input is not a row-vector + if (in->info()->dimension(1) != 1) + { + Iterator input(in, window_in); + execute_window_loop( + window_in, + [&](const Coordinates &id) + { + // Compute 8x8 elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + // Load + const uint32x4x2_t row0 = + vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 0 * input_stride_in_bytes) + x); + const uint32x4x2_t row1 = + vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 1 * input_stride_in_bytes) + x); + const uint32x4x2_t row2 = + vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 2 * input_stride_in_bytes) + x); + const uint32x4x2_t row3 = + vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 3 * input_stride_in_bytes) + x); + const uint32x4x2_t row4 = + vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 4 * input_stride_in_bytes) + x); + const uint32x4x2_t row5 = + vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 5 * input_stride_in_bytes) + x); + const uint32x4x2_t row6 = + vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 6 * input_stride_in_bytes) + x); + const uint32x4x2_t row7 = + vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 7 * input_stride_in_bytes) + x); + + // Transpose 2x4 + const uint32x4x2_t k0_u32 = {vtrn1q_u32(row0.val[0], row1.val[0]), + vtrn2q_u32(row0.val[0], row1.val[0])}; + const uint32x4x2_t k1_u32 = {vtrn1q_u32(row0.val[1], row1.val[1]), + vtrn2q_u32(row0.val[1], row1.val[1])}; + const uint32x4x2_t k2_u32 = 
{vtrn1q_u32(row2.val[0], row3.val[0]), + vtrn2q_u32(row2.val[0], row3.val[0])}; + const uint32x4x2_t k3_u32 = {vtrn1q_u32(row2.val[1], row3.val[1]), + vtrn2q_u32(row2.val[1], row3.val[1])}; + const uint32x4x2_t k4_u32 = {vtrn1q_u32(row4.val[0], row5.val[0]), + vtrn2q_u32(row4.val[0], row5.val[0])}; + const uint32x4x2_t k5_u32 = {vtrn1q_u32(row4.val[1], row5.val[1]), + vtrn2q_u32(row4.val[1], row5.val[1])}; + const uint32x4x2_t k6_u32 = {vtrn1q_u32(row6.val[0], row7.val[0]), + vtrn2q_u32(row6.val[0], row7.val[0])}; + const uint32x4x2_t k7_u32 = {vtrn1q_u32(row6.val[1], row7.val[1]), + vtrn2q_u32(row6.val[1], row7.val[1])}; + + // Transpose 2x2 + const uint64x2x2_t k0_u64 = { + vtrn1q_u64(vreinterpretq_u64_u32(k0_u32.val[0]), vreinterpretq_u64_u32(k2_u32.val[0])), + vtrn2q_u64(vreinterpretq_u64_u32(k0_u32.val[0]), vreinterpretq_u64_u32(k2_u32.val[0]))}; + const uint64x2x2_t k1_u64 = { + vtrn1q_u64(vreinterpretq_u64_u32(k0_u32.val[1]), vreinterpretq_u64_u32(k2_u32.val[1])), + vtrn2q_u64(vreinterpretq_u64_u32(k0_u32.val[1]), vreinterpretq_u64_u32(k2_u32.val[1]))}; + const uint64x2x2_t k2_u64 = { + vtrn1q_u64(vreinterpretq_u64_u32(k1_u32.val[0]), vreinterpretq_u64_u32(k3_u32.val[0])), + vtrn2q_u64(vreinterpretq_u64_u32(k1_u32.val[0]), vreinterpretq_u64_u32(k3_u32.val[0]))}; + const uint64x2x2_t k3_u64 = { + vtrn1q_u64(vreinterpretq_u64_u32(k1_u32.val[1]), vreinterpretq_u64_u32(k3_u32.val[1])), + vtrn2q_u64(vreinterpretq_u64_u32(k1_u32.val[1]), vreinterpretq_u64_u32(k3_u32.val[1]))}; + const uint64x2x2_t k4_u64 = { + vtrn1q_u64(vreinterpretq_u64_u32(k4_u32.val[0]), vreinterpretq_u64_u32(k6_u32.val[0])), + vtrn2q_u64(vreinterpretq_u64_u32(k4_u32.val[0]), vreinterpretq_u64_u32(k6_u32.val[0]))}; + const uint64x2x2_t k5_u64 = { + vtrn1q_u64(vreinterpretq_u64_u32(k4_u32.val[1]), vreinterpretq_u64_u32(k6_u32.val[1])), + vtrn2q_u64(vreinterpretq_u64_u32(k4_u32.val[1]), vreinterpretq_u64_u32(k6_u32.val[1]))}; + const uint64x2x2_t k6_u64 = { + vtrn1q_u64(vreinterpretq_u64_u32(k5_u32.val[0]), vreinterpretq_u64_u32(k7_u32.val[0])), + vtrn2q_u64(vreinterpretq_u64_u32(k5_u32.val[0]), vreinterpretq_u64_u32(k7_u32.val[0]))}; + const uint64x2x2_t k7_u64 = { + vtrn1q_u64(vreinterpretq_u64_u32(k5_u32.val[1]), vreinterpretq_u64_u32(k7_u32.val[1])), + vtrn2q_u64(vreinterpretq_u64_u32(k5_u32.val[1]), vreinterpretq_u64_u32(k7_u32.val[1]))}; + + // Swap blocks + const uint32x4x2_t col0 = {vreinterpretq_u32_u64(k0_u64.val[0]), + vreinterpretq_u32_u64(k4_u64.val[0])}; + const uint32x4x2_t col1 = {vreinterpretq_u32_u64(k1_u64.val[0]), + vreinterpretq_u32_u64(k5_u64.val[0])}; + const uint32x4x2_t col2 = {vreinterpretq_u32_u64(k0_u64.val[1]), + vreinterpretq_u32_u64(k4_u64.val[1])}; + const uint32x4x2_t col3 = {vreinterpretq_u32_u64(k1_u64.val[1]), + vreinterpretq_u32_u64(k5_u64.val[1])}; + const uint32x4x2_t col4 = {vreinterpretq_u32_u64(k2_u64.val[0]), + vreinterpretq_u32_u64(k6_u64.val[0])}; + const uint32x4x2_t col5 = {vreinterpretq_u32_u64(k3_u64.val[0]), + vreinterpretq_u32_u64(k7_u64.val[0])}; + const uint32x4x2_t col6 = {vreinterpretq_u32_u64(k2_u64.val[1]), + vreinterpretq_u32_u64(k6_u64.val[1])}; + const uint32x4x2_t col7 = {vreinterpretq_u32_u64(k3_u64.val[1]), + vreinterpretq_u32_u64(k7_u64.val[1])}; + + // Compute destination address + const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes; + + // Store + vst1q_u32_x2_( + reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), + col0); + vst1q_u32_x2_( + reinterpret_cast<uint32_t 
*>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), + col1); + vst1q_u32_x2_( + reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), + col2); + vst1q_u32_x2_( + reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), + col3); + vst1q_u32_x2_( + reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 4 * output_stride_in_bytes), + col4); + vst1q_u32_x2_( + reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 5 * output_stride_in_bytes), + col5); + vst1q_u32_x2_( + reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 6 * output_stride_in_bytes), + col6); + vst1q_u32_x2_( + reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 7 * output_stride_in_bytes), + col7); + } + + // Compute left-over elements (8x1) + for (; x < window_end_x; ++x) + { + const uint32_t val0 = *(reinterpret_cast<uint32_t *>(input.ptr() + 0 * input_stride_in_bytes) + x); + const uint32_t val1 = *(reinterpret_cast<uint32_t *>(input.ptr() + 1 * input_stride_in_bytes) + x); + const uint32_t val2 = *(reinterpret_cast<uint32_t *>(input.ptr() + 2 * input_stride_in_bytes) + x); + const uint32_t val3 = *(reinterpret_cast<uint32_t *>(input.ptr() + 3 * input_stride_in_bytes) + x); + const uint32_t val4 = *(reinterpret_cast<uint32_t *>(input.ptr() + 4 * input_stride_in_bytes) + x); + const uint32_t val5 = *(reinterpret_cast<uint32_t *>(input.ptr() + 5 * input_stride_in_bytes) + x); + const uint32_t val6 = *(reinterpret_cast<uint32_t *>(input.ptr() + 6 * input_stride_in_bytes) + x); + const uint32_t val7 = *(reinterpret_cast<uint32_t *>(input.ptr() + 7 * input_stride_in_bytes) + x); + + uint32x4_t result0 = vdupq_n_u32(0); + uint32x4_t result1 = vdupq_n_u32(0); + result0 = vsetq_lane_u32(val0, result0, 0); + result0 = vsetq_lane_u32(val1, result0, 1); + result0 = vsetq_lane_u32(val2, result0, 2); + result0 = vsetq_lane_u32(val3, result0, 3); + result1 = vsetq_lane_u32(val4, result1, 0); + result1 = vsetq_lane_u32(val5, result1, 1); + result1 = vsetq_lane_u32(val6, result1, 2); + result1 = vsetq_lane_u32(val7, result1, 3); + + // Compute destination address + const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes; + + vst1q_u32_x2_(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes), {result0, result1}); + } + }, + input, output); + } + + if (left_over_loop_y) + { + window_in.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), 1)); + window_in.set(Window::DimY, Window::Dimension(window_end_y_multiple_of, window_end_y, 1)); + + Iterator input(in, window_in); + Iterator output(out, window_out); + + // Compute left-over elements along the y dimension (1x1) + execute_window_loop( + window_in, + [&](const Coordinates &id) + { + const uint32_t val0 = *(reinterpret_cast<uint32_t *>(input.ptr())); + + // Compute destination address + const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + id.x() * output_stride_in_bytes; + + *(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes)) = val0; + }, + input, output); + } +} +#else // __aarch64__ +void transpose_32bit_elements(const ITensor *in, ITensor *out, const Window &window) +{ + const int window_step_x = 4; + const int window_step_y = 4; + const int window_start_x = window.x().start(); + const int window_end_x = window.x().end(); + const int window_start_y = window.y().start(); + const int window_end_y = std::min(window.y().end(), 
static_cast<int>(in->info()->dimension(1))); + const int window_end_y_multiple_of = ((window_end_y - window_start_y) / window_step_y) * window_step_y; + const size_t input_stride_in_bytes = in->info()->strides_in_bytes()[1]; + const size_t output_stride_in_bytes = out->info()->strides_in_bytes()[1]; + + // Check if we need a left-over loop for the y dimension + bool left_over_loop_y = (((window_end_y - window_start_y) % window_step_y) != 0); + + Window window_in(window); + window_in.set(Window::DimX, Window::Dimension(0, 1, 1)); + if (left_over_loop_y) + { + // Check if window_end_y_multiple_of is greater than window_start_y + if (window_end_y_multiple_of > window_start_y) + { + window_in.set(Window::DimY, Window::Dimension(window_start_y, window_end_y_multiple_of, window_step_y)); + } + else + { + window_in.set(Window::DimY, Window::Dimension(0, 0, 1)); + } + } + + Window window_out(window); + window_out.set(Window::DimX, Window::Dimension(0, 0, 0)); + window_out.set(Window::DimY, Window::Dimension(0, 0, 0)); + + Iterator output(out, window_out); + + // Run the SIMD path if and only if the input is not a row-vector + if (in->info()->dimension(1) != 1) + { + Iterator input(in, window_in); + execute_window_loop( + window_in, + [&](const Coordinates &id) + { + // Compute 4x4 elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint32x4_t row0 = + vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 0 * input_stride_in_bytes) + x); + const uint32x4_t row1 = + vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 1 * input_stride_in_bytes) + x); + const uint32x4_t row2 = + vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 2 * input_stride_in_bytes) + x); + const uint32x4_t row3 = + vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 3 * input_stride_in_bytes) + x); + + // Transpose 2x2 + const uint32x2x2_t k0_u32 = vtrn_u32(vget_low_u32(row0), vget_low_u32(row1)); + const uint32x2x2_t k1_u32 = vtrn_u32(vget_high_u32(row2), vget_high_u32(row3)); + const uint32x2x2_t k2_u32 = vtrn_u32(vget_high_u32(row0), vget_high_u32(row1)); + const uint32x2x2_t k3_u32 = vtrn_u32(vget_low_u32(row2), vget_low_u32(row3)); + + // Compute destination address + const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes; + + // Swap block 01 with block 10 and store + vst1q_u32( + reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), + vcombine_u32(k0_u32.val[0], k3_u32.val[0])); + vst1q_u32( + reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), + vcombine_u32(k0_u32.val[1], k3_u32.val[1])); + vst1q_u32( + reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), + vcombine_u32(k2_u32.val[0], k1_u32.val[0])); + vst1q_u32( + reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), + vcombine_u32(k2_u32.val[1], k1_u32.val[1])); + } + + // Compute left-over elements (1x4) + for (; x < window_end_x; ++x) + { + const uint32_t val0 = *(reinterpret_cast<uint32_t *>(input.ptr() + 0 * input_stride_in_bytes) + x); + const uint32_t val1 = *(reinterpret_cast<uint32_t *>(input.ptr() + 1 * input_stride_in_bytes) + x); + const uint32_t val2 = *(reinterpret_cast<uint32_t *>(input.ptr() + 2 * input_stride_in_bytes) + x); + const uint32_t val3 = *(reinterpret_cast<uint32_t *>(input.ptr() + 3 * input_stride_in_bytes) + x); + + uint32x4_t result = 
vdupq_n_u32(0); + result = vsetq_lane_u32(val0, result, 0); + result = vsetq_lane_u32(val1, result, 1); + result = vsetq_lane_u32(val2, result, 2); + result = vsetq_lane_u32(val3, result, 3); + + // Compute destination address + const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes; + + vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes), result); + } + }, + input, output); + } + + if (left_over_loop_y) + { + window_in.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), 1)); + window_in.set(Window::DimY, Window::Dimension(window_end_y_multiple_of, window_end_y, 1)); + + Iterator input(in, window_in); + Iterator output(out, window_out); + + // Compute left-over elements along the y dimension (1x1) + execute_window_loop( + window_in, + [&](const Coordinates &id) + { + const uint32_t val0 = *(reinterpret_cast<uint32_t *>(input.ptr())); + + // Compute destination address + const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + id.x() * output_stride_in_bytes; + + *(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes)) = val0; + }, + input, output); + } +} +#endif // __aarch64__ +} // namespace + +void CpuTransposeKernel::configure(const ITensorInfo *src, ITensorInfo *dst) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + + // Destination auto inizialitation if not yet initialized + const TensorShape dst_shape = misc::shape_calculator::compute_transposed_shape(*src); + auto_init_if_empty(*dst, src->clone()->set_tensor_shape(dst_shape)); + + // Explicitly set the tensor shape to preserve dimensions + dst->set_tensor_shape(dst_shape); + + // Perform validation step + ARM_COMPUTE_ERROR_THROW_ON(validate(src, dst)); + + // Note: This kernel performs 16 elements per iteration. + // However, since we use a left-over for loop on both dimensions (X and Y), we cannot have any read or write out of memory + // For this reason num_elems_processed_per_iteration_x is set to 1 + const unsigned int num_elems_processed_per_iteration_x = 1; + const unsigned int num_elems_processed_per_iteration_y = num_elems_processed(src->element_size()); + + // Configure kernel window + Window win = + calculate_max_window(*src, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + + // The CpuTranspose doesn't need padding so update_window_and_padding() can be skipped + Coordinates coord; + coord.set_num_dimensions(dst->num_dimensions()); + dst->set_valid_region(ValidRegion(coord, dst->tensor_shape())); + + ICpuKernel::configure(win); +} + +Status CpuTransposeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); + //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use CPU FP16 instructions. 
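The NEON paths above (and the 8-/16-bit variants earlier in this file) all implement the same element-wise contract, dst(x, y) = src(y, x), using byte strides for the row step and sizeof(element) for the column step. A minimal scalar sketch of that contract, assuming plain row-major 2D buffers addressed through their byte strides; the names here are illustrative and not part of the kernel:

#include <cstddef>
#include <cstdint>

// Reference 2D transpose using the same addressing scheme as the kernel:
// destination offset = y * sizeof(T) + x * dst_stride_in_bytes.
template <typename T>
void transpose2d_reference(const std::uint8_t *src, std::uint8_t *dst,
                           std::size_t rows, std::size_t cols,
                           std::size_t src_stride_in_bytes, std::size_t dst_stride_in_bytes)
{
    for (std::size_t y = 0; y < rows; ++y)
    {
        for (std::size_t x = 0; x < cols; ++x)
        {
            // Read src(y, x) ...
            const T value = reinterpret_cast<const T *>(src + y * src_stride_in_bytes)[x];
            // ... and write it to dst(x, y), mirroring dst_offset_in_bytes above.
            *reinterpret_cast<T *>(dst + x * dst_stride_in_bytes + y * sizeof(T)) = value;
        }
    }
}

The vectorized 8x8 (aarch64) and 4x4 (non-aarch64) blocks simply compute several of these scalar assignments per iteration, which is why the left-over loops along X and Y fall back to the one-element form.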
+ ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); + + // Error if input is not 8 bit, 16bit or 32bit + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->element_size() != 1 && src->element_size() != 2 && src->element_size() != 4, + "Element size not supported"); + + // Validate configured destination + if (dst->total_size() != 0) + { + const TensorShape dst_shape = misc::shape_calculator::compute_transposed_shape(*src); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + } + + return Status{}; +} + +void CpuTransposeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + + const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + switch (src->info()->element_size()) + { + case 1: + transpose_8bit_elements(src, dst, window); + break; + case 2: + transpose_16bit_elements(src, dst, window); + break; + case 4: + transpose_32bit_elements(src, dst, window); + break; + default: + ARM_COMPUTE_ERROR("Element size not supported"); + break; + } +} + +const char *CpuTransposeKernel::name() const +{ + return "CpuTransposeKernel"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuTransposeKernel.h b/src/cpu/kernels/CpuTransposeKernel.h new file mode 100644 index 0000000000..e79a405677 --- /dev/null +++ b/src/cpu/kernels/CpuTransposeKernel.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2021-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_TRANSPOSE_KERNEL_H +#define ARM_COMPUTE_CPU_TRANSPOSE_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Kernel which transposes the elements of a matrix */ +class CpuTransposeKernel : public ICpuKernel<CpuTransposeKernel> +{ +public: + CpuTransposeKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuTransposeKernel); + /** Configure kernel for a given list of arguments + * + * @param[in] src Srouce tensor to permute. Data types supported: All + * @param[out] dst Destination tensor. 
Data types supported: Same as @p src + */ + void configure(const ITensorInfo *src, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuTransposeKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_TRANSPOSE_KERNEL_H */ diff --git a/src/cpu/kernels/CpuWeightsReshapeKernel.cpp b/src/cpu/kernels/CpuWeightsReshapeKernel.cpp new file mode 100644 index 0000000000..297ba63826 --- /dev/null +++ b/src/cpu/kernels/CpuWeightsReshapeKernel.cpp @@ -0,0 +1,174 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuWeightsReshapeKernel.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Validate.h" + +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +TensorShape get_output_shape(const ITensorInfo *src, bool has_bias) +{ + TensorShape output_shape{src->tensor_shape()}; + + output_shape.collapse(3); + const size_t tmp_dim = output_shape[0]; + output_shape.set(0, output_shape[1]); + output_shape.set(1, tmp_dim + (has_bias ? 1 : 0)); + + return output_shape; +} + +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *biases, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions. 
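The get_output_shape() helper above folds each 3D kernel into a single row: for shared 4D weights of shape [kernel_x, kernel_y, IFM, OFM] the reshaped tensor is [OFM, kernel_x * kernel_y * IFM], with one extra row element when a bias is appended. A small worked sketch of that shape computation, with illustrative values only:

#include <cstddef>
#include <iostream>

// Reshaped-weights shape for shared 4D weights [kernel_x, kernel_y, IFM, OFM]:
// dim0 = OFM (one column per output feature map),
// dim1 = kernel_x * kernel_y * IFM, plus 1 when a bias row is appended.
static void print_reshaped_shape(std::size_t kernel_x, std::size_t kernel_y,
                                 std::size_t ifm, std::size_t ofm, bool has_bias)
{
    const std::size_t linearized = kernel_x * kernel_y * ifm;      // collapse(3)
    const std::size_t dim0       = ofm;                            // set(0, output_shape[1])
    const std::size_t dim1       = linearized + (has_bias ? 1 : 0); // set(1, tmp_dim + bias)
    std::cout << "[" << dim0 << ", " << dim1 << "]\n";
}

int main()
{
    print_reshaped_shape(3, 3, 2, 4, true); // 3x3 kernel, IFM = 2, OFM = 4, with bias -> [4, 19]
}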
+ ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); + + if (biases != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(src->data_type())); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases); + ARM_COMPUTE_RETURN_ERROR_ON((src->num_dimensions() == 4) && (biases->num_dimensions() != 1)); + ARM_COMPUTE_RETURN_ERROR_ON((src->num_dimensions() == 5) && (biases->num_dimensions() != 2)); + ARM_COMPUTE_RETURN_ERROR_ON((src->num_dimensions() == 4) && (biases->dimension(0) != src->tensor_shape()[3])); + ARM_COMPUTE_RETURN_ERROR_ON((src->num_dimensions() == 5) && (biases->dimension(0) != src->tensor_shape()[3] || + biases->dimension(1) != src->tensor_shape()[4])); + } + + // Checks performed when output is configured + if (dst->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), + get_output_shape(src, biases != nullptr)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); + } + + return Status{}; +} +} // namespace + +void CpuWeightsReshapeKernel::configure(const ITensorInfo *src, const ITensorInfo *biases, ITensorInfo *dst) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + + // Output tensor auto inizialitation if not yet initialized + auto_init_if_empty(*dst, src->clone()->set_tensor_shape(get_output_shape(src, (biases != nullptr)))); + + // Perform validation step + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, biases, dst)); + + // Configure kernel + Window window = calculate_max_window(*src, Steps()); + window.set(Window::DimX, Window::Dimension(0, src->dimension(0), src->dimension(0))); + window.set(Window::DimY, Window::Dimension(0, src->dimension(1), src->dimension(1))); + window.set(Window::DimZ, Window::Dimension(0, src->dimension(2), src->dimension(2))); + ICpuKernel::configure(window); +} + +Status CpuWeightsReshapeKernel::validate(const ITensorInfo *src, const ITensorInfo *biases, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, biases, dst)); + return Status{}; +} + +void CpuWeightsReshapeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + + auto src = tensors.get_const_tensor(TensorType::ACL_SRC); + auto biases = tensors.get_const_tensor(TensorType::ACL_BIAS); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + const unsigned int kernel_size_x = src->info()->dimension(0); + const unsigned int kernel_size_y = src->info()->dimension(1); + const unsigned int kernel_depth = src->info()->dimension(2); + const unsigned int input_stride_x = src->info()->strides_in_bytes().x(); + const unsigned int input_stride_y = src->info()->strides_in_bytes().y(); + const unsigned int input_stride_z = src->info()->strides_in_bytes().z(); + const unsigned int output_stride_y = dst->info()->strides_in_bytes().y(); + + // Create iterators + Iterator in(src, window); + execute_window_loop( + window, + [&](const Coordinates &id) + { + // Get column index + const int kernel_idx = id[3]; + const int kernel_idz = id[4]; + + // Setup pointers + const uint8_t *tmp_input_ptr = in.ptr(); + uint8_t *tmp_output_ptr = dst->ptr_to_element(Coordinates(kernel_idx, 0, kernel_idz)); + const uint8_t *curr_input_row_ptr = tmp_input_ptr; + const uint8_t *curr_input_depth_ptr = tmp_input_ptr; + + // Linearize volume + for 
(unsigned int d = 0; d < kernel_depth; ++d) + { + for (unsigned int j = 0; j < kernel_size_y; ++j) + { + for (unsigned int i = 0; i < kernel_size_x; ++i) + { + std::memcpy(tmp_output_ptr, tmp_input_ptr, src->info()->element_size()); + tmp_input_ptr += input_stride_x; + tmp_output_ptr += output_stride_y; + } + curr_input_row_ptr += input_stride_y; + tmp_input_ptr = curr_input_row_ptr; + } + curr_input_depth_ptr += input_stride_z; + curr_input_row_ptr = curr_input_depth_ptr; + tmp_input_ptr = curr_input_depth_ptr; + } + + // Add bias + if (biases != nullptr) + { + std::memcpy(tmp_output_ptr, biases->ptr_to_element(Coordinates(kernel_idx, kernel_idz)), + src->info()->element_size()); + } + }, + in); +} +const char *CpuWeightsReshapeKernel::name() const +{ + return "CpuWeightsReshapeKernel"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuWeightsReshapeKernel.h b/src/cpu/kernels/CpuWeightsReshapeKernel.h new file mode 100644 index 0000000000..9310b3c784 --- /dev/null +++ b/src/cpu/kernels/CpuWeightsReshapeKernel.h @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2017-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_WEIGHTSRESHAPE_KERNEL_H +#define ARM_COMPUTE_CPU_WEIGHTSRESHAPE_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Kernel to perform reshaping on the weights used by convolution and locally connected layer + * + * Rearranges each 3-dimensional kernel to a single row leading to a matrix with linearized kernels. + * In combination with the @ref cpu::kernels::CpuIm2ColKernel can transform a convolution to a matrix multiplication. 
+ * + * For example assuming a 3D weight kernel of 3x3 dimensions and depth of 2 we have: + * @f[ + * \left( \begin{array}{ccc} + * a000 & a001 & a002 \\ + * a010 & a011 & a012 \\ + * a020 & a021 & a022 \\ + * \end{array} \right) + * \left( \begin{array}{ccc} + * a100 & a101 & a102 \\ + * a110 & a111 & a112 \\ + * a120 & a121 & a122 \\ + * \end{array} \right) + * \rightarrow + * \left( \begin{array}{ccccccccc} + * a000 & a001 & a002 & a010 & a011 & a012 & a020 & a021 & a022 & a100 & a101 & a102 & a110 & a111 & a112 & a120 & a121 & a122 \\ + * \end{array} \right) + * @f] + */ +class CpuWeightsReshapeKernel : public ICpuKernel<CpuWeightsReshapeKernel> +{ +public: + /** Default constructor */ + CpuWeightsReshapeKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuWeightsReshapeKernel); + /** Set the input and output of the kernel. + * + * @param[in] src The input tensor info to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared, + * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. + * Data types supported: All + * @param[in] biases The shared biases tensor info to append. Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with + * dimensions [OFM, num_patches] if unshared. Data types supported: Same as @p input + * @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types. + * @param[out] dst The output tensor info. Data types supported: Same as @p src + */ + void configure(const ITensorInfo *src, const ITensorInfo *biases, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuWeightsReshapeKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *biases, const ITensorInfo *dst); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_WEIGHTSRESHAPE_KERNEL_H */ diff --git a/src/cpu/kernels/CpuWinogradConv2dKernel.cpp b/src/cpu/kernels/CpuWinogradConv2dKernel.cpp new file mode 100644 index 0000000000..52e3f2549c --- /dev/null +++ b/src/cpu/kernels/CpuWinogradConv2dKernel.cpp @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2017-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/cpu/kernels/CpuWinogradConv2dKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +CpuWinogradConv2dTransformInputKernel::CpuWinogradConv2dTransformInputKernel(arm_conv::winograd::WinogradImpl &w_impl, + arm_conv::ConvolutionArgs &_c_args, + uint32_t nthreads) + : _winograd_impl{w_impl}, _conv_args{_c_args}, _nthreads{nthreads} +{ +} + +void CpuWinogradConv2dTransformInputKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(window); + const ITensor *input_nhwc = tensors.get_const_tensor(TensorType::ACL_SRC); + const ITensor *winograd_input_transform = tensors.get_const_tensor(TensorType::ACL_DST); + const ITensor *workspace = tensors.get_const_tensor(TensorType::ACL_INT); + + const unsigned int width_idx = 1; + const unsigned int height_idx = 2; + const unsigned int batch_idx = 3; + int element_size_in_bytes = input_nhwc->info()->element_size(); + const auto src_strides = input_nhwc->info()->strides_in_bytes(); + + const size_t input_row_stride = src_strides[height_idx] / element_size_in_bytes; + const size_t input_col_stride = src_strides[width_idx] / element_size_in_bytes; + const size_t input_batch_stride = src_strides[batch_idx] / element_size_in_bytes; + const auto input_nhwc_ptr = + reinterpret_cast<const void *>(input_nhwc->buffer() + input_nhwc->info()->offset_first_element_in_bytes()); + auto win_transf_ptr = reinterpret_cast<void *>(winograd_input_transform->buffer() + + winograd_input_transform->info()->offset_first_element_in_bytes()); + + _winograd_impl.input_transform->execute(_conv_args, input_nhwc_ptr, input_batch_stride, input_row_stride, + input_col_stride, win_transf_ptr, _winograd_impl.winograd_spec, + workspace->buffer(), info.thread_id, _nthreads); +} + +CpuWinogradConv2dTransformOutputKernel::CpuWinogradConv2dTransformOutputKernel(arm_conv::winograd::WinogradImpl &w_impl, + arm_conv::ConvolutionArgs &_c_args, + uint32_t nthreads) + : _winograd_impl{w_impl}, _conv_args{_c_args}, _nthreads{nthreads} +{ +} + +// Inherited methods overridden: +void CpuWinogradConv2dTransformOutputKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(window); + const ITensor *dst_nhwc = tensors.get_const_tensor(TensorType::ACL_DST); + const ITensor *winograd_output_transform = tensors.get_const_tensor(TensorType::ACL_SRC_0); + const ITensor *biases = tensors.get_const_tensor(TensorType::ACL_SRC_1); + const ITensor *workspace = tensors.get_tensor(TensorType::ACL_INT); + + const unsigned int width_idx = 1; + const unsigned int height_idx = 2; + const unsigned int batch_idx = 3; + const int element_size_in_bytes = dst_nhwc->info()->element_size(); + const auto dst_strides = dst_nhwc->info()->strides_in_bytes(); + + const size_t out_row_stride = dst_strides[height_idx] / element_size_in_bytes; + const size_t out_col_stride = dst_strides[width_idx] / element_size_in_bytes; + const size_t out_batch_stride = dst_strides[batch_idx] / element_size_in_bytes; + const auto wout_transf_ptr = reinterpret_cast<const void *>( + winograd_output_transform->buffer() + winograd_output_transform->info()->offset_first_element_in_bytes()); + auto dst_nhwc_ptr = + reinterpret_cast<void *>(dst_nhwc->buffer() + 
dst_nhwc->info()->offset_first_element_in_bytes()); + void *biases_data_ptr = nullptr; + if (biases != nullptr) + { + biases_data_ptr = reinterpret_cast<void *>(biases->buffer() + biases->info()->offset_first_element_in_bytes()); + } + + // Output transform + _winograd_impl.output_transform->execute(_conv_args, wout_transf_ptr, _winograd_impl.winograd_spec, biases_data_ptr, + dst_nhwc_ptr, out_batch_stride, out_row_stride, out_col_stride, + workspace->buffer(), info.thread_id, _nthreads); +} + +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuWinogradConv2dKernel.h b/src/cpu/kernels/CpuWinogradConv2dKernel.h new file mode 100644 index 0000000000..8a3b745e85 --- /dev/null +++ b/src/cpu/kernels/CpuWinogradConv2dKernel.h @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2017-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ARM_COMPUTE_CPUWINOGRADCONV2DKERNEL_H +#define ARM_COMPUTE_CPUWINOGRADCONV2DKERNEL_H + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/ITensorPack.h" +#include "arm_compute/core/Steps.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/runtime/Tensor.h" + +#include "src/core/NEON/kernels/assembly/winograd.hpp" +#include "src/core/NEON/kernels/convolution/common/tensor.hpp" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +class CpuWinogradConv2dTransformInputKernel final : public ICpuKernel<CpuWinogradConv2dTransformInputKernel> +{ +public: + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CpuWinogradConv2dTransformInputKernel(const CpuWinogradConv2dTransformInputKernel &) = delete; + + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CpuWinogradConv2dTransformInputKernel &operator=(const CpuWinogradConv2dTransformInputKernel &) = delete; + + /** Prevent instances of this class from being moved it contains references.*/ + CpuWinogradConv2dTransformInputKernel(CpuWinogradConv2dTransformInputKernel &&) = delete; + + /** Prevent instances of this class from being moved it contains references.*/ + CpuWinogradConv2dTransformInputKernel &operator=(CpuWinogradConv2dTransformInputKernel &&) = delete; + + CpuWinogradConv2dTransformInputKernel(arm_conv::winograd::WinogradImpl &w_impl, + arm_conv::ConvolutionArgs &_c_args, + uint32_t nthreads); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + + const char *name() const override + { + return "CpuWinogradConv2dTransformInputKernel"; + } + +private: + arm_conv::winograd::WinogradImpl &_winograd_impl; + arm_conv::ConvolutionArgs &_conv_args; + uint32_t _nthreads; +}; +class CpuWinogradConv2dTransformOutputKernel : public ICpuKernel<CpuWinogradConv2dTransformOutputKernel> +{ +public: + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CpuWinogradConv2dTransformOutputKernel(const CpuWinogradConv2dTransformOutputKernel &) = delete; + + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CpuWinogradConv2dTransformOutputKernel &operator=(const CpuWinogradConv2dTransformOutputKernel &) = delete; + + /** Prevent instances of this class from being moved it contains references.*/ + CpuWinogradConv2dTransformOutputKernel(CpuWinogradConv2dTransformOutputKernel &&) = delete; + + /** Prevent instances of this class from being moved it contains references.*/ + CpuWinogradConv2dTransformOutputKernel &operator=(CpuWinogradConv2dTransformOutputKernel &&) = delete; + + CpuWinogradConv2dTransformOutputKernel(arm_conv::winograd::WinogradImpl &w_impl, + arm_conv::ConvolutionArgs &_c_args, + uint32_t nthreads); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + + const char *name() const override + { + return "CpuWinogradConv2dTransformOutputKernel"; + } + +private: + arm_conv::winograd::WinogradImpl &_winograd_impl; + const arm_conv::ConvolutionArgs &_conv_args; + uint32_t _nthreads; +}; + +} // namespace cpu +} // namespace arm_compute +#endif /*ARM_COMPUTE_CPUWINOGRADCONV2DKERNEL_H*/ diff --git a/src/cpu/kernels/activation/generic/neon/fp16.cpp b/src/cpu/kernels/activation/generic/neon/fp16.cpp new file mode 100644 index 
0000000000..ddc6dc24cd --- /dev/null +++ b/src/cpu/kernels/activation/generic/neon/fp16.cpp @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + +#include "src/cpu/kernels/activation/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace +{ +constexpr ActFpImplParams Fp16Params = {static_cast<float16_t>(1e-7), 8}; +} // namespace + +void neon_fp16_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) +{ + fp_neon_activation_impl<float16_t, Fp16Params>(src, dst, act_info, window); +} +} // namespace cpu +} // namespace arm_compute +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/activation/generic/neon/fp32.cpp b/src/cpu/kernels/activation/generic/neon/fp32.cpp new file mode 100644 index 0000000000..e558f8c73e --- /dev/null +++ b/src/cpu/kernels/activation/generic/neon/fp32.cpp @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "src/cpu/kernels/activation/generic/neon/impl.h" +namespace arm_compute +{ +namespace cpu +{ +namespace +{ +constexpr ActFpImplParams Fp32Params = {static_cast<float>(1e-24), 4}; +} // namespace +void neon_fp32_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) +{ + fp_neon_activation_impl<float, Fp32Params>(src, dst, act_info, window); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/activation/generic/neon/impl.h b/src/cpu/kernels/activation/generic/neon/impl.h new file mode 100644 index 0000000000..afeb6f7f3d --- /dev/null +++ b/src/cpu/kernels/activation/generic/neon/impl.h @@ -0,0 +1,246 @@ +/* + * Copyright (c) 2020-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Window.h" +#include "arm_compute/function_info/ActivationLayerInfo.h" + +#include "src/core/NEON/wrapper/wrapper.h" +namespace arm_compute +{ +namespace cpu +{ +/** Constant parameters needed by the activation implementation. + * These parameters differ for each floating type + * + * @note This are passed as a struct as C++ does not allow float as a template parameter until C++20 + **/ +struct ActFpImplParams +{ + float delta; /**< Minimum delta needed to avoid NaN on corner-cases of elementary functions */ + int step_x; /**< Window step at the x dimension */ +}; + +#ifndef __aarch64__ +inline float32x4_t mask_float_vector(const float32x4_t &in, const uint32x4_t &mask) +{ + auto int_in = vreinterpretq_u32_f32(in); + return vreinterpretq_f32_u32(wrapper::vand(int_in, mask)); +} +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) +inline float16x8_t mask_float_vector(const float16x8_t &in, const uint16x8_t &mask) +{ + auto int_in = vreinterpretq_u16_f16(in); + return vreinterpretq_f16_u16(wrapper::vand(int_in, mask)); +} +#endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) +#endif /* __aarch64__ */ + +template <typename T, const ActFpImplParams &P> +void fp_neon_activation_impl(const ITensor *src, + ITensor *dst, + const ActivationLayerInfo &act_info, + const Window &window) +{ + /** SIMD vector tag type. 
*/ + using ExactTagType = + typename arm_compute::wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>; + constexpr int window_step_x = P.step_x; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const ActivationLayerInfo::ActivationFunction act = act_info.activation(); + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + Iterator input(src, win_collapsed); + Iterator output(dst, win_collapsed); + // In case of non-aarch64, a small delta value is added to the input + // to prevent NAN values caused by zeros in inputs to SQRT. + // In case of aarh64, we call vsqrt directly, so we don't use delta. +#ifndef __aarch64__ + const auto delta = wrapper::vdup_n(static_cast<T>(P.delta), ExactTagType{}); +#else /* #ifndef __aarch64__ */ + const auto const_inv_2 = wrapper::vdup_n(static_cast<T>(0.5f), ExactTagType{}); + const auto const_inv_sqrt_2 = wrapper::vdup_n(static_cast<T>(0.70710678118f), ExactTagType{}); +#endif /* __aarch64__ */ + const auto const_1 = wrapper::vdup_n(static_cast<T>(1.f), ExactTagType{}); + const auto const_0 = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{}); + const auto const_6 = wrapper::vdup_n(static_cast<T>(6.f), ExactTagType{}); + const auto const_3 = wrapper::vdup_n(static_cast<T>(3.f), ExactTagType{}); + const auto const_inv_6 = wrapper::vdup_n(static_cast<T>(0.166666667f), ExactTagType{}); + constexpr float soft_relu_thresh = 12.f; + const auto vsoft_relu_thresh = wrapper::vdup_n(static_cast<T>(soft_relu_thresh), ExactTagType{}); + const auto va = wrapper::vdup_n(static_cast<T>(act_info.a()), ExactTagType{}); + const auto vb = wrapper::vdup_n(static_cast<T>(act_info.b()), ExactTagType{}); + const auto a = static_cast<T>(act_info.a()); + const auto b = static_cast<T>(act_info.b()); + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast<const T *>(input.ptr()); + const auto output_ptr = reinterpret_cast<T *>(output.ptr()); + wrapper::traits::neon_bitvector_t<T, wrapper::traits::BitWidth::W128> tmp; + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vin = wrapper::vloadq(input_ptr + x); + switch (act) + { + case ActivationLayerInfo::ActivationFunction::ABS: + tmp = wrapper::vabs(vin); + break; + case ActivationLayerInfo::ActivationFunction::LINEAR: + tmp = wrapper::vmla(vb, va, vin); + break; + case ActivationLayerInfo::ActivationFunction::LOGISTIC: + tmp = wrapper::vinv(wrapper::vadd(const_1, wrapper::vexpq(wrapper::vneg(vin)))); + break; + case ActivationLayerInfo::ActivationFunction::RELU: + tmp = wrapper::vmax(const_0, vin); + break; + case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: + tmp = wrapper::vmin(va, wrapper::vmax(const_0, vin)); + break; + case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: + tmp = wrapper::vmin(va, wrapper::vmax(vb, vin)); + break; + case ActivationLayerInfo::ActivationFunction::LEAKY_RELU: + tmp = wrapper::vbsl(wrapper::vcgt(vin, const_0), vin, wrapper::vmul(va, vin)); + break; + case ActivationLayerInfo::ActivationFunction::SOFT_RELU: + tmp = wrapper::vbsl(wrapper::vcgt(vin, vsoft_relu_thresh), vin, + wrapper::vlog(wrapper::vadd(const_1, wrapper::vexpq(vin)))); + break; + case ActivationLayerInfo::ActivationFunction::ELU: + tmp = wrapper::vbsl(wrapper::vcge(vin, 
const_0), vin, + wrapper::vmul(va, wrapper::vsub(wrapper::vexpq(vin), const_1))); + break; + case ActivationLayerInfo::ActivationFunction::SQRT: +#ifdef __aarch64__ + tmp = wrapper::vsqrt(vin); +#else /* __aarch64__ */ + { + const auto bitmask = wrapper::vceq(vin, wrapper::vdup_n(0.f, ExactTagType{})); + tmp = wrapper::vinv(wrapper::vinvsqrt(wrapper::vadd(vin, mask_float_vector(delta, bitmask)))); + tmp = mask_float_vector(tmp, wrapper::vnot(bitmask)); + } +#endif /* __aarch64__ */ + break; + case ActivationLayerInfo::ActivationFunction::SQUARE: + tmp = wrapper::vmul(vin, vin); + break; + case ActivationLayerInfo::ActivationFunction::TANH: + tmp = wrapper::vmul(va, wrapper::vtanh(wrapper::vmul(vb, vin))); + break; + case ActivationLayerInfo::ActivationFunction::IDENTITY: + tmp = vin; + break; + case ActivationLayerInfo::ActivationFunction::HARD_SWISH: + tmp = wrapper::vmul( + vin, + wrapper::vmul(const_inv_6, + wrapper::vmin(const_6, wrapper::vmax(const_0, wrapper::vadd(vin, const_3))))); + break; + case ActivationLayerInfo::ActivationFunction::SWISH: + tmp = wrapper::vmul(vin, wrapper::vinv(wrapper::vadd( + const_1, wrapper::vexpq(wrapper::vneg(wrapper::vmul(va, vin)))))); + break; +#ifdef __aarch64__ + case ActivationLayerInfo::ActivationFunction::GELU: + tmp = wrapper::vmul( + vin, + wrapper::vmul(const_inv_2, + wrapper::vadd(const_1, wrapper::verf(wrapper::vmul(vin, const_inv_sqrt_2))))); + break; +#endif /* __aarch64__ */ + default: + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + wrapper::vstore(output_ptr + x, tmp); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const T in = *(reinterpret_cast<const T *>(input_ptr + x)); + T tmp; + switch (act) + { + case ActivationLayerInfo::ActivationFunction::ABS: + tmp = std::abs(in); + break; + case ActivationLayerInfo::ActivationFunction::LINEAR: + tmp = a * in + b; + break; + case ActivationLayerInfo::ActivationFunction::LOGISTIC: + tmp = static_cast<T>(1) / (static_cast<T>(1) + std::exp(-in)); + break; + case ActivationLayerInfo::ActivationFunction::RELU: + tmp = std::max<T>(static_cast<T>(0), in); + break; + case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: + tmp = std::min<T>(a, std::max(static_cast<T>(0), in)); + break; + case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: + tmp = std::min<T>(a, std::max<T>(b, in)); + break; + case ActivationLayerInfo::ActivationFunction::LEAKY_RELU: + tmp = (in > 0) ? in : a * in; + break; + case ActivationLayerInfo::ActivationFunction::SOFT_RELU: + tmp = (in > soft_relu_thresh) ? in : std::log(static_cast<T>(1) + std::exp(in)); + break; + case ActivationLayerInfo::ActivationFunction::ELU: + tmp = (in >= 0) ? 
in : a * (std::exp(in) - 1); + break; + case ActivationLayerInfo::ActivationFunction::SQRT: + tmp = std::sqrt(in); + break; + case ActivationLayerInfo::ActivationFunction::SQUARE: + tmp = in * in; + break; + case ActivationLayerInfo::ActivationFunction::TANH: + tmp = a * std::tanh(b * in); + break; + case ActivationLayerInfo::ActivationFunction::IDENTITY: + tmp = in; + break; + case ActivationLayerInfo::ActivationFunction::HARD_SWISH: + tmp = in * ((std::min(std::max((in + 3), 0.0f), 6.0f)) * 0.166666667f); + break; + case ActivationLayerInfo::ActivationFunction::SWISH: + tmp = in / (static_cast<T>(1) + std::exp(-a * in)); + break; + case ActivationLayerInfo::ActivationFunction::GELU: + tmp = in * static_cast<T>(0.5f * (1.0f + erff(static_cast<float>(in) / 1.41421356237f))); + break; + default: + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + *(output_ptr + x) = tmp; + } + }, + input, output); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/activation/generic/neon/lut.cpp b/src/cpu/kernels/activation/generic/neon/lut.cpp new file mode 100644 index 0000000000..ddd186f9cb --- /dev/null +++ b/src/cpu/kernels/activation/generic/neon/lut.cpp @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2022-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/function_info/ActivationLayerInfo.h" + +#include "src/cpu/kernels/lut/list.h" + +namespace arm_compute +{ +namespace cpu +{ +#ifdef __aarch64__ +void neon_q8_activation_lut(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) +{ + ARM_COMPUTE_ERROR_ON( // LUT does not provide any performance benefit for ReLU as it's a single max() operation + (src->info()->data_type() != DataType::QASYMM8 && src->info()->data_type() != DataType::QASYMM8_SIGNED) || + act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU); + const auto window_end_x = window.x().end(); + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + Iterator input(src, win_collapsed); + Iterator output(dst, win_collapsed); + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); + auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr()); + lut_u8_neon(act_info.lut().data(), 1u, window_end_x, &input_ptr, &output_ptr); + }, + input, output); +} +#endif // __aarch64__ +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/activation/generic/neon/qasymm8.cpp b/src/cpu/kernels/activation/generic/neon/qasymm8.cpp new file mode 100644 index 0000000000..1451301ea2 --- /dev/null +++ b/src/cpu/kernels/activation/generic/neon/qasymm8.cpp @@ -0,0 +1,310 @@ +/* + * Copyright (c) 2020-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Window.h" +#include "arm_compute/function_info/ActivationLayerInfo.h" + +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" + +#include <arm_neon.h> +#include <cmath> +#include <cstddef> +#include <cstdint> + +namespace arm_compute +{ +namespace cpu +{ +void neon_qasymm8_activation(const ITensor *src, + ITensor *dst, + const ActivationLayerInfo &act_info, + const Window &window) +{ + constexpr int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const ActivationLayerInfo::ActivationFunction act = act_info.activation(); + + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(src, win_collapsed); + Iterator output(dst, win_collapsed); + + const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform(); + const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform(); + const qasymm8x16_t va = vdupq_n_u8(quantize_qasymm8(act_info.a(), qi_in)); + const qasymm8x16_t vb = vdupq_n_u8(quantize_qasymm8(act_info.b(), qi_in)); + const qasymm8_t a = quantize_qasymm8(act_info.a(), qi_in); + const qasymm8_t b = quantize_qasymm8(act_info.b(), qi_in); + const qasymm8_t const_0 = quantize_qasymm8(0.f, qi_in); + const qasymm8x16_t vconst_0 = vdupq_n_u8(const_0); + const auto vconst_1 = vdupq_n_f32(1.f); + +#ifndef __aarch64__ + const auto vconst_0_f32 = vdupq_n_f32(0); +#else // #ifndef __aarch64__ + const auto const_inv_2 = vdupq_n_f32(0.5f); + const auto const_inv_sqrt_2 = vdupq_n_f32(0.70710678118f); +#endif // __aarch64__ + const float32x4_t va_f32 = vdupq_n_f32(act_info.a()); + const float32x4_t vb_f32 = vdupq_n_f32(act_info.b()); + const float a_f32 = act_info.a(); + const float b_f32 = act_info.b(); + +#ifndef __aarch64__ + const auto const_6_f32 = vdupq_n_f32(6.f); + const auto const_0_f32 = vdupq_n_f32(0.f); + const auto const_3_f32 = vdupq_n_f32(3.f); + const auto const_inv_6_f32 = vdupq_n_f32(0.166666667f); +#endif // __aarch64__ + + // Initialise scale/offset for re-quantization + float s = qi_in.scale / qi_out.scale; + float o = -qi_in.offset * s + qi_out.offset; + float32x4_t vs = vdupq_n_f32(s); + float32x4_t vo = vdupq_n_f32(o); + + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast<const qasymm8_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<qasymm8_t *>(output.ptr()); + + wrapper::traits::neon_bitvector_t<qasymm8_t, wrapper::traits::BitWidth::W128> tmp; + + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vin = wrapper::vloadq(input_ptr + x); + if (act == ActivationLayerInfo::ActivationFunction::RELU) + { + // Perform activation + tmp = vmaxq_u8(vconst_0, vin); + // Re-quantize to new output space + tmp = vmlaq_qasymm8<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo); + } + else if (act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) + { + // Perform activation + tmp = vminq_u8(va, vmaxq_u8(vconst_0, vin)); + // Re-quantize to new output space + tmp = vmlaq_qasymm8<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo); + } + else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + { + // Perform activation + tmp = vminq_u8(va, 
vmaxq_u8(vb, vin)); + // Re-quantize to new output space + tmp = vmlaq_qasymm8<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo); + } +#ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead. + else if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC) + { + // De-quantize + const auto vin_deq = vdequantize(vin, qi_in); + // Perform activation + const float32x4x4_t tmp_dep = {{ + wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[0])))), + wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[1])))), + wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[2])))), + wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[3])))), + }}; + // Re-quantize to new output space + tmp = vquantize(tmp_dep, qi_out); + } +#endif // __aarch64__ + else if (act == ActivationLayerInfo::ActivationFunction::TANH) + { + // De-quantize + const auto vin_deq = vdequantize(vin, qi_in); + // Perform activation + const float32x4x4_t tmp_dep = {{ + wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))), + wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))), + wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[2], vb_f32))), + wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[3], vb_f32))), + }}; + // Re-quantize to new output space + tmp = vquantize(tmp_dep, qi_out); + } +#ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead. + else if (act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) + { + // De-quantize + const auto vin_deq = vdequantize(vin, qi_in); + // Perform activation + const float32x4x4_t tmp_dep = {{ + wrapper::vmul( + vin_deq.val[0], + wrapper::vmul( + const_inv_6_f32, + wrapper::vmin(const_6_f32, + wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[0], const_3_f32))))), + wrapper::vmul( + vin_deq.val[1], + wrapper::vmul( + const_inv_6_f32, + wrapper::vmin(const_6_f32, + wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[1], const_3_f32))))), + wrapper::vmul( + vin_deq.val[2], + wrapper::vmul( + const_inv_6_f32, + wrapper::vmin(const_6_f32, + wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[2], const_3_f32))))), + wrapper::vmul( + vin_deq.val[3], + wrapper::vmul( + const_inv_6_f32, + wrapper::vmin(const_6_f32, + wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[3], const_3_f32))))), + }}; + // Re-quantize to new output space + tmp = vquantize(tmp_dep, qi_out); + } + else if (act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) + { + const auto vin_deq = vdequantize(vin, qi_in); + + const uint32x4x4_t pos_mask = {{ + wrapper::vcgt(vin_deq.val[0], vconst_0_f32), + wrapper::vcgt(vin_deq.val[1], vconst_0_f32), + wrapper::vcgt(vin_deq.val[2], vconst_0_f32), + wrapper::vcgt(vin_deq.val[3], vconst_0_f32), + }}; + + const float32x4x4_t tmp_dep = {{ + wrapper::vbsl(pos_mask.val[0], vin_deq.val[0], wrapper::vmul(va_f32, vin_deq.val[0])), + wrapper::vbsl(pos_mask.val[1], vin_deq.val[1], wrapper::vmul(va_f32, vin_deq.val[1])), + wrapper::vbsl(pos_mask.val[2], vin_deq.val[2], wrapper::vmul(va_f32, vin_deq.val[2])), + wrapper::vbsl(pos_mask.val[3], vin_deq.val[3], wrapper::vmul(va_f32, vin_deq.val[3])), + }}; + + tmp = vquantize(tmp_dep, qi_out); + } +#else // #ifndef __aarch64__ + else if (act == ActivationLayerInfo::ActivationFunction::GELU) + { + const auto vin_deq = vdequantize(vin, qi_in); + // Perform activation + const 
float32x4x4_t tmp_dep = {{ + wrapper::vmul(vin_deq.val[0], + wrapper::vmul(const_inv_2, + wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul( + vin_deq.val[0], const_inv_sqrt_2))))), + wrapper::vmul(vin_deq.val[1], + wrapper::vmul(const_inv_2, + wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul( + vin_deq.val[1], const_inv_sqrt_2))))), + wrapper::vmul(vin_deq.val[2], + wrapper::vmul(const_inv_2, + wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul( + vin_deq.val[2], const_inv_sqrt_2))))), + wrapper::vmul(vin_deq.val[3], + wrapper::vmul(const_inv_2, + wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul( + vin_deq.val[3], const_inv_sqrt_2))))), + }}; + // Re-quantize to new output space + tmp = vquantize(tmp_dep, qi_out); + } +#endif // __aarch64__ + else + { + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + wrapper::vstore(output_ptr + x, tmp); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + qasymm8_t in = *(reinterpret_cast<const qasymm8_t *>(input_ptr + x)); + qasymm8_t tmp = 0; + if (act == ActivationLayerInfo::ActivationFunction::RELU) + { + tmp = std::max(const_0, in); + tmp = utility::clamp<int32_t, qasymm8_t>(support::cpp11::lround(tmp * s + o)); + } + else if (act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) + { + tmp = std::min(a, std::max(const_0, in)); + tmp = utility::clamp<int32_t, qasymm8_t>(support::cpp11::lround(tmp * s + o)); + } + else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + { + tmp = std::min(a, std::max(b, in)); + tmp = utility::clamp<int32_t, qasymm8_t>(support::cpp11::lround(tmp * s + o)); + } +#ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead. + else if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC) + { + float tmp_f = dequantize_qasymm8(in, qi_in); + tmp_f = 1.f / (1.f + std::exp(-tmp_f)); + tmp = quantize_qasymm8(tmp_f, qi_out); + } +#endif // __aarch64__ + else if (act == ActivationLayerInfo::ActivationFunction::TANH) + { + float tmp_f = dequantize_qasymm8(in, qi_in); + tmp_f = a_f32 * std::tanh(b_f32 * tmp_f); + tmp = quantize_qasymm8(tmp_f, qi_out); + } +#ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead. + else if (act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) + { + float tmp_f = dequantize_qasymm8(in, qi_in); + tmp_f = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f); + tmp = quantize_qasymm8(tmp_f, qi_out); + } + else if (act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) + { + float tmp_f = dequantize_qasymm8(in, qi_in); + tmp_f = tmp_f > 0 ? tmp_f : tmp_f * a_f32; + tmp = quantize_qasymm8(tmp_f, qi_out); + } + else if (act == ActivationLayerInfo::ActivationFunction::GELU) + { + float tmp_f = dequantize_qasymm8(in, qi_in); + tmp = tmp_f * 0.5f * (1.0f + std::erff(in / 1.41421356237f)); + tmp = quantize_qasymm8(tmp_f, qi_out); + } +#endif // __aarch64__ + else + { + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + *(output_ptr + x) = tmp; + } + }, + input, output); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/activation/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/activation/generic/neon/qasymm8_signed.cpp new file mode 100644 index 0000000000..a2f588245a --- /dev/null +++ b/src/cpu/kernels/activation/generic/neon/qasymm8_signed.cpp @@ -0,0 +1,274 @@ +/* + * Copyright (c) 2020-2023 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Window.h" +#include "arm_compute/function_info/ActivationLayerInfo.h" + +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" + +#include <arm_neon.h> +#include <cmath> +#include <cstddef> + +namespace arm_compute +{ +namespace cpu +{ +void neon_qasymm8_signed_activation(const ITensor *src, + ITensor *dst, + const ActivationLayerInfo &act_info, + const Window &window) +{ + constexpr int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const ActivationLayerInfo::ActivationFunction act = act_info.activation(); + + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(src, win_collapsed); + Iterator output(dst, win_collapsed); + + const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform(); + const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform(); + const qasymm8x16_signed_t va = vdupq_n_s8(quantize_qasymm8_signed(act_info.a(), qi_in)); + const qasymm8x16_signed_t vb = vdupq_n_s8(quantize_qasymm8_signed(act_info.b(), qi_in)); + const qasymm8_signed_t a = quantize_qasymm8_signed(act_info.a(), qi_in); + const qasymm8_signed_t b = quantize_qasymm8_signed(act_info.b(), qi_in); + const qasymm8_signed_t const_0 = quantize_qasymm8_signed(0.f, qi_in); + const qasymm8x16_signed_t vconst_0 = vdupq_n_s8(const_0); +#ifndef __aarch64__ + const auto vconst_1 = vdupq_n_f32(1.f); + const auto vconst_0_f32 = vdupq_n_f32(0.f); +#endif // __aarch64__ + const float32x4_t va_f32 = vdupq_n_f32(act_info.a()); + const float32x4_t vb_f32 = vdupq_n_f32(act_info.b()); + const float a_f32 = act_info.a(); + const float b_f32 = act_info.b(); + const auto const_6_f32 = vdupq_n_f32(6.f); + const auto const_0_f32 = vdupq_n_f32(0.f); + const auto const_3_f32 = vdupq_n_f32(3.f); + const auto const_inv_6_f32 = vdupq_n_f32(0.166666667f); + + // Initialise scale/offset for re-quantization + float s = qi_in.scale / qi_out.scale; + float o = -qi_in.offset * s + qi_out.offset; + float32x4_t vs = vdupq_n_f32(s); + float32x4_t vo = vdupq_n_f32(o); + + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + const auto input_ptr = 
reinterpret_cast<const qasymm8_signed_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<qasymm8_signed_t *>(output.ptr()); + + wrapper::traits::neon_bitvector_t<qasymm8_signed_t, wrapper::traits::BitWidth::W128> tmp; + + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vin = wrapper::vloadq(input_ptr + x); + if (act == ActivationLayerInfo::ActivationFunction::RELU) + { + // Perform activation + tmp = vmaxq_s8(vconst_0, vin); + // Re-quantize to new output space + tmp = vmlaq_qasymm8_signed<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo); + } + else if (act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) + { + // Perform activation + tmp = vminq_s8(va, vmaxq_s8(vconst_0, vin)); + // Re-quantize to new output space + tmp = vmlaq_qasymm8_signed<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo); + } + else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + { + // Perform activation + tmp = vminq_s8(va, vmaxq_s8(vb, vin)); + // Re-quantize to new output space + tmp = vmlaq_qasymm8_signed<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo); + } +#ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead. + else if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC) + { + // De-quantize + const auto vin_deq = vdequantize(vin, qi_in); + // Perform activation + const float32x4x4_t tmp_dep = {{ + wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[0])))), + wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[1])))), + wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[2])))), + wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[3])))), + }}; + // Re-quantize to new output space + tmp = vquantize_signed(tmp_dep, qi_out); + } +#endif // __aarch64__ + else if (act == ActivationLayerInfo::ActivationFunction::TANH) + { + // De-quantize + const auto vin_deq = vdequantize(vin, qi_in); + // Perform activation + const float32x4x4_t tmp_dep = {{ + wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))), + wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))), + wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[2], vb_f32))), + wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[3], vb_f32))), + }}; + // Re-quantize to new output space + tmp = vquantize_signed(tmp_dep, qi_out); + } + else if (act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) + { + // De-quantize + const auto vin_deq = vdequantize(vin, qi_in); + // Perform activation + const float32x4x4_t tmp_dep = {{ + wrapper::vmul( + vin_deq.val[0], + wrapper::vmul( + const_inv_6_f32, + wrapper::vmin(const_6_f32, + wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[0], const_3_f32))))), + wrapper::vmul( + vin_deq.val[1], + wrapper::vmul( + const_inv_6_f32, + wrapper::vmin(const_6_f32, + wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[1], const_3_f32))))), + wrapper::vmul( + vin_deq.val[2], + wrapper::vmul( + const_inv_6_f32, + wrapper::vmin(const_6_f32, + wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[2], const_3_f32))))), + wrapper::vmul( + vin_deq.val[3], + wrapper::vmul( + const_inv_6_f32, + wrapper::vmin(const_6_f32, + wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[3], const_3_f32))))), + }}; + // Re-quantize to new output space + tmp = vquantize_signed(tmp_dep, qi_out); 
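The HARD_SWISH branch above vectorises x * relu6(x + 3) / 6 over the four dequantized float lanes, with 1/6 stored as the constant 0.166666667f. The same formula as a standalone scalar function, for reference only:

    #include <algorithm>

    // Hard swish: x * relu6(x + 3) / 6.
    static float hard_swish(float x)
    {
        const float relu6 = std::min(6.0f, std::max(0.0f, x + 3.0f));
        return x * relu6 * 0.166666667f;
    }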
+ } + else if (act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) + { + const auto vin_deq = vdequantize(vin, qi_in); + +#ifdef __aarch64__ + const uint32x4x4_t pos_mask = {{ + wrapper::vcgtz(vin_deq.val[0]), + wrapper::vcgtz(vin_deq.val[1]), + wrapper::vcgtz(vin_deq.val[2]), + wrapper::vcgtz(vin_deq.val[3]), + }}; +#else // __aarch64__ + const uint32x4x4_t pos_mask = {{ + wrapper::vcgt(vin_deq.val[0], vconst_0_f32), + wrapper::vcgt(vin_deq.val[1], vconst_0_f32), + wrapper::vcgt(vin_deq.val[2], vconst_0_f32), + wrapper::vcgt(vin_deq.val[3], vconst_0_f32), + }}; +#endif // __aarch64__ + + const float32x4x4_t tmp_dep = {{ + wrapper::vbsl(pos_mask.val[0], vin_deq.val[0], wrapper::vmul(va_f32, vin_deq.val[0])), + wrapper::vbsl(pos_mask.val[1], vin_deq.val[1], wrapper::vmul(va_f32, vin_deq.val[1])), + wrapper::vbsl(pos_mask.val[2], vin_deq.val[2], wrapper::vmul(va_f32, vin_deq.val[2])), + wrapper::vbsl(pos_mask.val[3], vin_deq.val[3], wrapper::vmul(va_f32, vin_deq.val[3])), + }}; + + tmp = vquantize_signed(tmp_dep, qi_out); + } + else + { + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + wrapper::vstore(output_ptr + x, tmp); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + qasymm8_signed_t in = *(reinterpret_cast<const qasymm8_signed_t *>(input_ptr + x)); + qasymm8_signed_t tmp = 0; + if (act == ActivationLayerInfo::ActivationFunction::RELU) + { + tmp = std::max(const_0, in); + tmp = utility::clamp<int32_t, qasymm8_signed_t>(support::cpp11::lround(tmp * s + o)); + } + else if (act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) + { + tmp = std::min(a, std::max(const_0, in)); + tmp = utility::clamp<int32_t, qasymm8_signed_t>(support::cpp11::lround(tmp * s + o)); + } + else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + { + tmp = std::min(a, std::max(b, in)); + tmp = utility::clamp<int32_t, qasymm8_signed_t>(support::cpp11::lround(tmp * s + o)); + } +#ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead. + else if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC) + { + float tmp_f = dequantize_qasymm8_signed(in, qi_in); + tmp_f = 1.f / (1.f + std::exp(-tmp_f)); + tmp = quantize_qasymm8_signed(tmp_f, qi_out); + } +#endif // __aarch64__ + else if (act == ActivationLayerInfo::ActivationFunction::TANH) + { + float tmp_f = dequantize_qasymm8_signed(in, qi_in); + tmp_f = a_f32 * std::tanh(b_f32 * tmp_f); + tmp = quantize_qasymm8_signed(tmp_f, qi_out); + } + else if (act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) + { + float tmp_f = dequantize_qasymm8_signed(in, qi_in); + tmp_f = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f); + tmp = quantize_qasymm8_signed(tmp_f, qi_out); + } + else if (act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) + { + float tmp_f = dequantize_qasymm8_signed(in, qi_in); + tmp_f = tmp_f > 0 ? tmp_f : tmp_f * a_f32; + tmp = quantize_qasymm8_signed(tmp_f, qi_out); + } + else + { + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + *(output_ptr + x) = tmp; + } + }, + input, output); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/activation/generic/neon/qsymm16.cpp b/src/cpu/kernels/activation/generic/neon/qsymm16.cpp new file mode 100644 index 0000000000..891646ea00 --- /dev/null +++ b/src/cpu/kernels/activation/generic/neon/qsymm16.cpp @@ -0,0 +1,156 @@ +/* + * Copyright (c) 2020-2023 Arm Limited. 
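The LEAKY_RELU branch above builds a per-lane mask of positive elements (vcgtz on AArch64, vcgt against zero elsewhere) and bit-selects between x and a * x before requantizing. A self-contained NEON float example of the same compare-and-select idiom, using raw intrinsics instead of the wrapper:: layer:

    #include <arm_neon.h>

    // Leaky ReLU on four floats: y = x > 0 ? x : alpha * x.
    static inline float32x4_t leaky_relu_f32x4(float32x4_t x, float alpha)
    {
        const uint32x4_t  pos_mask = vcgtq_f32(x, vdupq_n_f32(0.f)); // all-ones lanes where x > 0
        const float32x4_t scaled   = vmulq_n_f32(x, alpha);          // alpha * x, used for the negative lanes
        return vbslq_f32(pos_mask, x, scaled);                       // pick x where the mask is set, else alpha * x
    }

The bit-select keeps the code branch-free, since a data-dependent branch per lane is not available in SIMD code.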
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/experimental/Types.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensorPack.h" +#include "arm_compute/core/Window.h" +#include "arm_compute/function_info/ActivationLayerInfo.h" + +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/NESymm.h" +#include "src/core/NEON/wrapper/wrapper.h" + +#include <arm_neon.h> +#include <cmath> +#include <cstddef> + +namespace arm_compute +{ +namespace cpu +{ +void neon_qsymm16_activation(const ITensor *src, + ITensor *dst, + const ActivationLayerInfo &act_info, + const Window &window) +{ + constexpr int window_step_x = 8; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const ActivationLayerInfo::ActivationFunction act = act_info.activation(); + + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(src, win_collapsed); + Iterator output(dst, win_collapsed); + + const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform(); + const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform(); + const auto vconst_1 = vdupq_n_f32(1.f); + const float32x4_t va_f32 = vdupq_n_f32(act_info.a()); + const float32x4_t vb_f32 = vdupq_n_f32(act_info.b()); + const float a_f32 = act_info.a(); + const float b_f32 = act_info.b(); + + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast<const qsymm16_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<qsymm16_t *>(output.ptr()); + + wrapper::traits::neon_bitvector_t<qsymm16_t, wrapper::traits::BitWidth::W128> tmp; + ARM_COMPUTE_UNUSED(tmp); + + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vin = wrapper::vloadq(input_ptr + x); + if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC) + { + // De-quantize + const auto vin_deq = vdequantize_int16(vin, qi_in.scale); + // Perform activation + const float32x4x2_t tmp_dep = {{ + wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[0])))), + wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[1])))), + }}; + // Re-quantize to new output space + tmp 
= vquantize_int16(tmp_dep, qi_out.scale); + } + else if (act == ActivationLayerInfo::ActivationFunction::TANH) + { + // De-quantize + const auto vin_deq = vdequantize_int16(vin, qi_in.scale); + // Perform activation + const float32x4x2_t tmp_dep = {{ + wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))), + wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))), + }}; + // Re-quantize to new output space + tmp = vquantize_int16(tmp_dep, qi_out.scale); + } + + else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + { + // De-quantize + const auto vin_deq = vdequantize_int16(vin, qi_in.scale); + // Perform activation + const float32x4x2_t tmp_dep = {{wrapper::vmin(va_f32, wrapper::vmax(vb_f32, vin_deq.val[0])), + wrapper::vmin(va_f32, wrapper::vmax(vb_f32, vin_deq.val[1]))}}; + // Re-quantize to new output space + tmp = vquantize_int16(tmp_dep, qi_out.scale); + } + else + { + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + wrapper::vstore(output_ptr + x, tmp); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + qsymm16_t in = *(reinterpret_cast<const qsymm16_t *>(input_ptr + x)); + qsymm16_t tmp = 0; + if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC) + { + float tmp_f = dequantize_qsymm16(in, qi_in.scale); + tmp_f = 1.f / (1.f + std::exp(-tmp_f)); + tmp = quantize_qsymm16(tmp_f, qi_out); + } + else if (act == ActivationLayerInfo::ActivationFunction::TANH) + { + float tmp_f = dequantize_qsymm16(in, qi_in.scale); + tmp_f = a_f32 * std::tanh(b_f32 * tmp_f); + tmp = quantize_qsymm16(tmp_f, qi_out); + } + else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + { + float tmp_f = dequantize_qsymm16(in, qi_in.scale); + tmp_f = std::min<float>(a_f32, std::max<float>(b_f32, tmp_f)); + tmp = quantize_qsymm16(tmp_f, qi_out); + } + else + { + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + *(output_ptr + x) = tmp; + } + }, + input, output); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/activation/generic/sve/fp16.cpp b/src/cpu/kernels/activation/generic/sve/fp16.cpp new file mode 100644 index 0000000000..19d9126556 --- /dev/null +++ b/src/cpu/kernels/activation/generic/sve/fp16.cpp @@ -0,0 +1,173 @@ +/* + * Copyright (c) 2020-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
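The QSYMM16 kernel above only ever touches qi_in.scale and qi_out.scale because symmetric quantization has no zero point: dequantize is q * scale and quantize is round(x / scale) saturated to int16. A hedged scalar sketch with hypothetical helper names (the real helpers live in NESymm.h):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Symmetric 16-bit quantization: real = scale * q, no offset term.
    static float dequant_sym16(int16_t q, float scale)
    {
        return static_cast<float>(q) * scale;
    }

    static int16_t quant_sym16(float x, float scale)
    {
        const long q = std::lround(x / scale);
        return static_cast<int16_t>(std::min<long>(32767, std::max<long>(-32768, q)));
    }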
+ */ + +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensorPack.h" +#include "arm_compute/core/Window.h" +#include "arm_compute/function_info/ActivationLayerInfo.h" + +#include "src/core/NEON/SVEMath.h" +#include "src/cpu/kernels/lut/list.h" + +#include <arm_sve.h> +#include <cmath> +#include <cstddef> + +namespace arm_compute +{ +namespace cpu +{ +void sve_fp16_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) +{ + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const ActivationLayerInfo::ActivationFunction act = act_info.activation(); + + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(src, win_collapsed); + Iterator output(dst, win_collapsed); + + const auto const_1 = svdup_n_f16(1.f); + const auto const_0 = svdup_n_f16(0.f); + const auto const_6 = svdup_n_f16(6.f); + const auto const_3 = svdup_n_f16(3.f); + const auto const_inv_6 = svdup_n_f16(0.166666667f); + + const auto va = svdup_n_f16(act_info.a()); + const auto vb = svdup_n_f16(act_info.b()); + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast<const float16_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr()); + + svfloat16_t tmp; + + int x = window_start_x; + svbool_t pg = svwhilelt_b16(x, window_end_x); + do + { + const auto vin = svld1_f16(pg, input_ptr + x); + switch (act) + { + case ActivationLayerInfo::ActivationFunction::ABS: + tmp = svabs_f16_z(pg, vin); + break; + case ActivationLayerInfo::ActivationFunction::LINEAR: + tmp = svmla_f16_z(pg, vb, va, vin); + break; + case ActivationLayerInfo::ActivationFunction::LOGISTIC: + tmp = svinv_f16_z(pg, svadd_f16_z(pg, const_1, svexp_f16_z(pg, svneg_f16_z(pg, vin)))); + break; + case ActivationLayerInfo::ActivationFunction::RELU: + tmp = svmax_f16_z(pg, const_0, vin); + break; + case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: + tmp = svmin_f16_z(pg, va, svmax_f16_z(pg, const_0, vin)); + break; + case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: + tmp = svmin_f16_z(pg, va, svmax_f16_z(pg, vb, vin)); + break; + case ActivationLayerInfo::ActivationFunction::LEAKY_RELU: + tmp = svadd_f16_z(pg, svmul_f16_z(pg, svmin_f16_z(pg, vin, const_0), va), + svmax_f16_z(pg, vin, const_0)); + break; + case ActivationLayerInfo::ActivationFunction::SOFT_RELU: + tmp = svlog_f16_z(pg, svadd_f16_z(pg, const_1, svexp_f16_z(pg, vin))); + break; + case ActivationLayerInfo::ActivationFunction::ELU: + tmp = svsel_f16(svcmpgt_f16(pg, vin, const_0), vin, + svmul_f16_z(pg, va, svsub_f16_z(pg, svexp_f16_z(pg, vin), const_1))); + break; + case ActivationLayerInfo::ActivationFunction::SQRT: + tmp = svsqrt_f16_z(pg, vin); + break; + case ActivationLayerInfo::ActivationFunction::SQUARE: + tmp = svmul_f16_z(pg, vin, vin); + break; + case ActivationLayerInfo::ActivationFunction::TANH: + tmp = svmul_f16_z(pg, va, svtanh_f16_z(pg, svmul_f16_z(pg, vb, vin))); + break; + case ActivationLayerInfo::ActivationFunction::IDENTITY: + tmp = vin; + break; + case ActivationLayerInfo::ActivationFunction::HARD_SWISH: + tmp = svmul_f16_z( + pg, vin, + svmul_f16_z( + pg, const_inv_6, + svmin_f16_z(pg, const_6, svmax_f16_z(pg, const_0, svadd_f16_z(pg, vin, 
const_3))))); + break; + case ActivationLayerInfo::ActivationFunction::SWISH: + tmp = svmul_f16_z( + pg, vin, + svinv_f16_z(pg, svadd_f16_z(pg, const_1, + svexp_f16_z(pg, svneg_f16_z(pg, svmul_f16_z(pg, va, vin)))))); + break; + default: + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + svst1_f16(pg, output_ptr + x, tmp); + + x += svcnth(); + pg = svwhilelt_b16(x, window_end_x); + + } while (svptest_any(svptrue_b16(), pg)); + }, + input, output); +} + +void sve_fp16_activation_lut(const ITensor *src, + ITensor *dst, + const ActivationLayerInfo &act_info, + const Window &window) +{ + ARM_COMPUTE_ERROR_ON(src->info()->data_type() != DataType::F16); + const auto window_start_x = window.x().start(); + const auto window_end_x = window.x().end(); + const auto size = window_end_x - window_start_x; + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(src, win_collapsed); + Iterator output(dst, win_collapsed); + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast<const uint16_t *>(input.ptr()); + auto output_ptr = reinterpret_cast<uint16_t *>(output.ptr()); + lut_u16_sve(reinterpret_cast<const uint16_t *>(act_info.lut_fp16().data()), 1U /* num_strings (UNUSED) */, + size, input_ptr + window_start_x, output_ptr + window_start_x); + }, + input, output); +} +} // namespace cpu +} // namespace arm_compute +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/activation/generic/sve/fp32.cpp b/src/cpu/kernels/activation/generic/sve/fp32.cpp new file mode 100644 index 0000000000..d1b075d52c --- /dev/null +++ b/src/cpu/kernels/activation/generic/sve/fp32.cpp @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2020-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
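Both SVE kernels use the same predicated loop shape instead of a separate scalar tail: svwhilelt produces a predicate covering the elements that remain, the index advances by the hardware vector length (svcnth for 16-bit lanes, svcntw for 32-bit), and the loop ends once the predicate is empty. A minimal standalone example of that structure for float32 data, assuming an SVE-enabled target:

    #include <arm_sve.h>
    #include <cstdint>

    // Scale n floats in place; masked loads/stores make a scalar leftover loop unnecessary.
    void scale_array_sve(float *data, int64_t n, float factor)
    {
        int64_t  x  = 0;
        svbool_t pg = svwhilelt_b32(x, n);
        do
        {
            const svfloat32_t v = svld1_f32(pg, data + x);
            svst1_f32(pg, data + x, svmul_n_f32_z(pg, v, factor));
            x += svcntw();
            pg = svwhilelt_b32(x, n);
        } while (svptest_any(svptrue_b32(), pg));
    }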
+ */ + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensorPack.h" +#include "arm_compute/core/Window.h" +#include "arm_compute/function_info/ActivationLayerInfo.h" + +#include "src/core/NEON/SVEMath.h" + +#include <arm_sve.h> +#include <cmath> +#include <cstddef> + +namespace arm_compute +{ +namespace cpu +{ +void sve_fp32_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) +{ + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const ActivationLayerInfo::ActivationFunction act = act_info.activation(); + + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(src, win_collapsed); + Iterator output(dst, win_collapsed); + + const auto const_1 = svdup_n_f32(1.f); + const auto const_0 = svdup_n_f32(0.f); + const auto const_6 = svdup_n_f32(6.f); + const auto const_3 = svdup_n_f32(3.f); + const auto const_inv_6 = svdup_n_f32(0.166666667f); + const auto soft_relu_thresh = svdup_n_f32(16.63553047f); + + const auto va = svdup_n_f32(act_info.a()); + const auto vb = svdup_n_f32(act_info.b()); + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast<const float *>(input.ptr()); + const auto output_ptr = reinterpret_cast<float *>(output.ptr()); + + svfloat32_t tmp; + + // Compute S elements per iteration + int x = window_start_x; + svbool_t pg = svwhilelt_b32(x, window_end_x); + do + { + const auto vin = svld1_f32(pg, input_ptr + x); + switch (act) + { + case ActivationLayerInfo::ActivationFunction::ABS: + tmp = svabs_f32_z(pg, vin); + break; + case ActivationLayerInfo::ActivationFunction::LINEAR: + tmp = svmla_f32_z(pg, vb, va, vin); + break; + case ActivationLayerInfo::ActivationFunction::LOGISTIC: + tmp = svinv_f32_z(pg, svadd_f32_z(pg, const_1, svexp_f32_z(pg, svneg_f32_z(pg, vin)))); + break; + case ActivationLayerInfo::ActivationFunction::RELU: + tmp = svmax_f32_z(pg, const_0, vin); + break; + case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: + tmp = svmin_f32_z(pg, va, svmax_f32_z(pg, const_0, vin)); + break; + case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: + tmp = svmin_f32_z(pg, va, svmax_f32_z(pg, vb, vin)); + break; + case ActivationLayerInfo::ActivationFunction::LEAKY_RELU: + tmp = svadd_f32_z(pg, svmul_f32_z(pg, svmin_f32_z(pg, vin, const_0), va), + svmax_f32_z(pg, vin, const_0)); + break; + case ActivationLayerInfo::ActivationFunction::SOFT_RELU: + tmp = svsel_f32(svcmpgt_f32(pg, vin, soft_relu_thresh), vin, + svlog_f32_z(pg, svadd_f32_z(pg, const_1, svexp_f32_z(pg, vin)))); + break; + case ActivationLayerInfo::ActivationFunction::ELU: + tmp = svsel_f32(svcmpgt_f32(pg, vin, const_0), vin, + svmul_f32_z(pg, va, svsub_f32_z(pg, svexp_f32_z(pg, vin), const_1))); + break; + case ActivationLayerInfo::ActivationFunction::SQRT: + tmp = svsqrt_f32_z(pg, vin); + break; + case ActivationLayerInfo::ActivationFunction::SQUARE: + tmp = svmul_f32_z(pg, vin, vin); + break; + case ActivationLayerInfo::ActivationFunction::TANH: + tmp = svmul_f32_z(pg, va, svtanh_f32_z(pg, svmul_f32_z(pg, vb, vin))); + break; + case ActivationLayerInfo::ActivationFunction::IDENTITY: + tmp = vin; + break; + case ActivationLayerInfo::ActivationFunction::HARD_SWISH: + tmp = svmul_f32_z( + pg, vin, + svmul_f32_z( + pg, const_inv_6, + svmin_f32_z(pg, const_6, svmax_f32_z(pg, const_0, 
svadd_f32_z(pg, vin, const_3))))); + break; + case ActivationLayerInfo::ActivationFunction::SWISH: + tmp = svmul_f32_z( + pg, vin, + svinv_f32_z(pg, svadd_f32_z(pg, const_1, + svexp_f32_z(pg, svneg_f32_z(pg, svmul_f32_z(pg, va, vin)))))); + break; + default: + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + svst1_f32(pg, output_ptr + x, tmp); + + x += svcntw(); + pg = svwhilelt_b32(x, window_end_x); + + } while (svptest_any(svptrue_b32(), pg)); + }, + input, output); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/activation/generic/sve2/lut.cpp b/src/cpu/kernels/activation/generic/sve2/lut.cpp new file mode 100644 index 0000000000..5db8595a75 --- /dev/null +++ b/src/cpu/kernels/activation/generic/sve2/lut.cpp @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2022-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/function_info/ActivationLayerInfo.h" + +#include "src/cpu/kernels/lut/list.h" + +namespace arm_compute +{ +namespace cpu +{ +#ifdef __aarch64__ +void sve2_q8_activation_lut(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) +{ + ARM_COMPUTE_ERROR_ON( // LUT does not provide any performance benefit for ReLU as it's a single max() operation + (src->info()->data_type() != DataType::QASYMM8 && src->info()->data_type() != DataType::QASYMM8_SIGNED) || + act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU); + const auto window_end_x = window.x().end(); + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + Iterator input(src, win_collapsed); + Iterator output(dst, win_collapsed); + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + const auto input_ptr = input.ptr(); + auto output_ptr = output.ptr(); + lut_u8_sve2(act_info.lut().data(), 1u, window_end_x, &input_ptr, &output_ptr); + }, + input, output); +} +#endif // __aarch64__ +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/activation/generic/sve2/qasymm8.cpp b/src/cpu/kernels/activation/generic/sve2/qasymm8.cpp new file mode 100644 index 0000000000..7efa9e4b72 --- /dev/null +++ b/src/cpu/kernels/activation/generic/sve2/qasymm8.cpp @@ -0,0 +1,240 @@ +/* + * Copyright (c) 2020-2023 Arm Limited. 
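sve2_q8_activation_lut above can replace the whole computation with a 256-entry table because an 8-bit input has only 256 possible values: each one is dequantized, passed through the activation and requantized once when the table is built, and the kernel itself becomes a pure lookup (done here by lut_u8_sve2). A hedged scalar sketch of building and applying such a table, with hypothetical helper names:

    #include <algorithm>
    #include <array>
    #include <cmath>
    #include <cstddef>
    #include <cstdint>

    // Precompute f over every possible QASYMM8 value.
    template <typename F>
    std::array<uint8_t, 256> make_q8_lut(F &&f, float in_scale, int32_t in_offset, float out_scale, int32_t out_offset)
    {
        std::array<uint8_t, 256> lut{};
        for (int q = 0; q < 256; ++q)
        {
            const float x = in_scale * (q - in_offset);                 // dequantize
            const long  r = std::lround(f(x) / out_scale) + out_offset; // activate and requantize
            lut[q]        = static_cast<uint8_t>(std::min(255L, std::max(0L, r)));
        }
        return lut;
    }

    // Applying the table is one lookup per element.
    void apply_q8_lut(const std::array<uint8_t, 256> &lut, const uint8_t *src, uint8_t *dst, size_t n)
    {
        for (size_t i = 0; i < n; ++i)
        {
            dst[i] = lut[src[i]];
        }
    }

This is also why the kernel rejects RELU: a single max() is already cheaper than a table lookup, as the assertion's comment notes.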
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Window.h" +#include "arm_compute/function_info/ActivationLayerInfo.h" + +#include "src/core/NEON/SVEAsymm.h" +#include "src/core/NEON/SVEMath.h" + +#include <arm_sve.h> +#include <cmath> +#include <cstddef> + +namespace arm_compute +{ +namespace cpu +{ +void sve2_qasymm8_activation(const ITensor *src, + ITensor *dst, + const ActivationLayerInfo &act_info, + const Window &window) +{ + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const ActivationLayerInfo::ActivationFunction act = act_info.activation(); + + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(src, win_collapsed); + Iterator output(dst, win_collapsed); + + const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform(); + const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform(); + const auto va = svdup_n_u8(quantize_qasymm8(act_info.a(), qi_in)); + const auto vb = svdup_n_u8(quantize_qasymm8(act_info.b(), qi_in)); + const auto const_0 = quantize_qasymm8(0.f, qi_in); + const auto vconst_0 = svdup_n_u8(const_0); + const auto vconst_1 = svdup_n_f32(1.f); + const auto va_f32 = svdup_n_f32(act_info.a()); + const auto vb_f32 = svdup_n_f32(act_info.b()); + + // Initialise scale/offset for re-quantization + bool requant = true; + if (qi_in.scale == qi_out.scale && qi_in.offset == qi_out.offset) + { + requant = false; + } + float s = qi_in.scale / qi_out.scale; + float o = -qi_in.offset * s + qi_out.offset; + auto vs = svdup_n_f32(s); + auto vo = svdup_n_f32(o); + + // Initialise scale/offset for re-quantization with int32_t + const auto voffset_in = svdup_n_s32(qi_in.offset); + int32_t s_s32 = round(s * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN); + int32_t o_s32 = round(o * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN); + const auto vs_s32 = svdup_n_s32(s_s32); + const auto vo_s32 = svdup_n_s32(o_s32); + + // Initialise scale/offset for re-quantization for leaky relu + int32_t s_leaky_s32 = round(s * act_info.a() * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN); + int32_t o_leaky_s32 = round((-qi_in.offset * s * act_info.a() + qi_out.offset) * (1 << 8), + 
arm_compute::RoundingPolicy::TO_NEAREST_EVEN); + const auto vs_leaky_s32 = svdup_n_s32(s_leaky_s32); + const auto vo_leaky_s32 = svdup_n_s32(o_leaky_s32); + + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr()); + + svuint8_t tmp; + + int x = window_start_x; + svbool_t pg = svwhilelt_b8(x, window_end_x); + do + { + const auto vin = svld1_u8(pg, input_ptr + x); + if (act == ActivationLayerInfo::ActivationFunction::RELU) + { + // Perform activation + tmp = svmax_u8_z(pg, vconst_0, vin); + // Re-quantize to new output space + tmp = requant ? svmla_qasymm8_z(pg, tmp, vs, vo) : tmp; + } + else if (act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) + { + // Perform activation + tmp = svmin_u8_z(pg, va, svmax_u8_z(pg, vconst_0, vin)); + // Re-quantize to new output space + tmp = requant ? svmla_qasymm8_z(pg, tmp, vs, vo) : tmp; + } + else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + { + // Perform activation + tmp = svmin_u8_z(pg, va, svmax_u8_z(pg, vb, vin)); + // Re-quantize to new output space + tmp = svmla_qasymm8_z(pg, tmp, vs, vo); + } + else if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC) + { + // De-quantize + const auto vin_deq = svdequantize_z(pg, vin, qi_in); + // Perform activation + const svfloat32x4_t tmp_dep = svcreate4_f32( + svdiv_f32_z( + pg, vconst_1, + svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 0))))), + svdiv_f32_z( + pg, vconst_1, + svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 1))))), + svdiv_f32_z( + pg, vconst_1, + svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 2))))), + svdiv_f32_z( + pg, vconst_1, + svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 3)))))); + + // Re-quantize to new output space + tmp = svquantize_z(pg, tmp_dep, qi_out); + } + else if (act == ActivationLayerInfo::ActivationFunction::TANH) + { + // De-quantize + const auto vin_deq = svdequantize_z(pg, vin, qi_in); + // Perform activation + const svfloat32x4_t tmp_dep = svcreate4_f32( + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 0), vb_f32))), + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 1), vb_f32))), + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 2), vb_f32))), + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 3), vb_f32)))); + + // Re-quantize to new output space + tmp = svquantize_z(pg, tmp_dep, qi_out); + } + else if (act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) + { + svbool_t p0, p1, p2, p3; + svint32x4_t tmp_dep; + + // Expand to int32 + const svint32x4_t vin_s32 = svcreate4_s32(svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(vin))), + svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(vin))), + svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(vin))), + svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(vin)))); + + // Compare elements to input offset + if (qi_in.scale >= 0) + { + p0 = svcmplt_s32(pg, svget4_s32(vin_s32, 0), voffset_in); + p1 = svcmplt_s32(pg, svget4_s32(vin_s32, 1), voffset_in); + p2 = svcmplt_s32(pg, svget4_s32(vin_s32, 2), voffset_in); + p3 = svcmplt_s32(pg, svget4_s32(vin_s32, 3), voffset_in); + } + else + { + p0 = svcmpgt_s32(pg, svget4_s32(vin_s32, 0), voffset_in); + p1 = svcmpgt_s32(pg, svget4_s32(vin_s32, 1), 
voffset_in); + p2 = svcmpgt_s32(pg, svget4_s32(vin_s32, 2), voffset_in); + p3 = svcmpgt_s32(pg, svget4_s32(vin_s32, 3), voffset_in); + } + + // Multiply negative elements and requantize if necessary + if (requant) + { + tmp_dep = svcreate4_s32( + svasr_n_s32_m(pg, + svmla_s32_m(pg, svsel(p0, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 0), + svsel(p0, vs_leaky_s32, vs_s32)), + 8), + svasr_n_s32_m(pg, + svmla_s32_m(pg, svsel(p1, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 1), + svsel(p1, vs_leaky_s32, vs_s32)), + 8), + svasr_n_s32_m(pg, + svmla_s32_m(pg, svsel(p2, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 2), + svsel(p2, vs_leaky_s32, vs_s32)), + 8), + svasr_n_s32_m(pg, + svmla_s32_m(pg, svsel(p3, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 3), + svsel(p3, vs_leaky_s32, vs_s32)), + 8)); + } + else + { + tmp_dep = svcreate4_s32( + svasr_n_s32_m(p0, svmad_s32_m(p0, svget4_s32(vin_s32, 0), vs_leaky_s32, vo_leaky_s32), 8), + svasr_n_s32_m(p1, svmad_s32_m(p1, svget4_s32(vin_s32, 1), vs_leaky_s32, vo_leaky_s32), 8), + svasr_n_s32_m(p2, svmad_s32_m(p2, svget4_s32(vin_s32, 2), vs_leaky_s32, vo_leaky_s32), 8), + svasr_n_s32_m(p3, svmad_s32_m(p3, svget4_s32(vin_s32, 3), vs_leaky_s32, vo_leaky_s32), 8)); + } + + // Convert uint32 vectors to uint16 vectors (with saturation) + const auto v_low_u16 = svqxtunt_s32(svqxtunb_s32(svget4_s32(tmp_dep, 0)), svget4_s32(tmp_dep, 1)); + const auto v_high_u16 = svqxtunt_s32(svqxtunb_s32(svget4_s32(tmp_dep, 2)), svget4_s32(tmp_dep, 3)); + + // convert uint16 vectors to uint8 vectors (with saturation) + tmp = svqxtnt_u16(svqxtnb_u16(v_low_u16), v_high_u16); + } + else + { + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + + svst1_u8(pg, output_ptr + x, tmp); + + x += svcntb(); + pg = svwhilelt_b8(x, window_end_x); + + } while (svptest_any(svptrue_b8(), pg)); + }, + input, output); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/activation/generic/sve2/qasymm8_signed.cpp b/src/cpu/kernels/activation/generic/sve2/qasymm8_signed.cpp new file mode 100644 index 0000000000..e4667522dd --- /dev/null +++ b/src/cpu/kernels/activation/generic/sve2/qasymm8_signed.cpp @@ -0,0 +1,274 @@ +/* + * Copyright (c) 2020-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
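The LEAKY_RELU path above avoids the float dequantize/requantize round trip by folding scale and offset into fixed point with 8 fractional bits: s and o are pre-multiplied by 2^8 and rounded, a second pair pre-multiplied by act_info.a() handles the lanes below the input zero point, and each lane is finished with an int32 multiply-add, an arithmetic shift right by 8 and a saturating narrow. A scalar sketch of the same arithmetic, with hypothetical parameter names:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Requantizing leaky ReLU in fixed point: y_q = (x_q * s_q + o_q) >> 8,
    // where (s_q, o_q) is picked per element from the plain or the alpha-scaled pair.
    uint8_t leaky_relu_requant_u8(
        uint8_t x_q, float in_scale, int32_t in_offset, float out_scale, int32_t out_offset, float alpha)
    {
        const float s = in_scale / out_scale;
        const float o = -in_offset * s + out_offset;

        const int32_t s_q       = static_cast<int32_t>(std::lround(s * 256.f));
        const int32_t o_q       = static_cast<int32_t>(std::lround(o * 256.f));
        const int32_t s_leaky_q = static_cast<int32_t>(std::lround(s * alpha * 256.f));
        const int32_t o_leaky_q = static_cast<int32_t>(std::lround((-in_offset * s * alpha + out_offset) * 256.f));

        // Below the zero point means a negative real value (assuming a positive input scale).
        const bool    negative = static_cast<int32_t>(x_q) < in_offset;
        const int32_t y        = (static_cast<int32_t>(x_q) * (negative ? s_leaky_q : s_q) + (negative ? o_leaky_q : o_q)) >> 8;
        return static_cast<uint8_t>(std::min(255, std::max(0, y)));
    }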
+ */ +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Window.h" +#include "arm_compute/function_info/ActivationLayerInfo.h" + +#include "src/core/NEON/SVEAsymm.h" +#include "src/core/NEON/SVEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" + +#include <arm_sve.h> +#include <cmath> +#include <cstddef> + +namespace arm_compute +{ +namespace cpu +{ +void sve2_qasymm8_signed_activation(const ITensor *src, + ITensor *dst, + const ActivationLayerInfo &act_info, + const Window &window) +{ + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const ActivationLayerInfo::ActivationFunction act = act_info.activation(); + + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(src, win_collapsed); + Iterator output(dst, win_collapsed); + + const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform(); + const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform(); + const auto va = svdup_n_s8(quantize_qasymm8_signed(act_info.a(), qi_in)); + const auto vb = svdup_n_s8(quantize_qasymm8_signed(act_info.b(), qi_in)); + const auto const_0 = quantize_qasymm8_signed(0.f, qi_in); + const auto vconst_0 = svdup_n_s8(const_0); + const auto vconst_1 = svdup_n_f32(1.f); + const auto va_f32 = svdup_n_f32(act_info.a()); + const auto vb_f32 = svdup_n_f32(act_info.b()); + const auto const_6_f32 = svdup_n_f32(6.f); + const auto const_0_f32 = svdup_n_f32(0.f); + const auto const_3_f32 = svdup_n_f32(3.f); + const auto const_inv_6_f32 = svdup_n_f32(0.166666667f); + + // Initialise scale/offset for re-quantization + bool requant = true; + if (qi_in.scale == qi_out.scale && qi_in.offset == qi_out.offset) + { + requant = false; + } + float s = qi_in.scale / qi_out.scale; + float o = -qi_in.offset * s + qi_out.offset; + auto vs = svdup_n_f32(s); + auto vo = svdup_n_f32(o); + + // Initialise scale/offset for re-quantization with int32_t + const auto voffset_in = svdup_n_s32(qi_in.offset); + int32_t s_s32 = round(s * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN); + int32_t o_s32 = round(o * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN); + const auto vs_s32 = svdup_n_s32(s_s32); + const auto vo_s32 = svdup_n_s32(o_s32); + + // Initialise scale/offset for re-quantization for leaky relu + int32_t s_leaky_s32 = round(s * act_info.a() * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN); + int32_t o_leaky_s32 = round((-qi_in.offset * s * act_info.a() + qi_out.offset) * (1 << 8), + arm_compute::RoundingPolicy::TO_NEAREST_EVEN); + const auto vs_leaky_s32 = svdup_n_s32(s_leaky_s32); + const auto vo_leaky_s32 = svdup_n_s32(o_leaky_s32); + + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast<const int8_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr()); + + svint8_t tmp; + + int x = window_start_x; + svbool_t pg = svwhilelt_b8(x, window_end_x); + do + { + const auto vin = svld1_s8(pg, input_ptr + x); + if (act == ActivationLayerInfo::ActivationFunction::RELU) + { + // Perform activation + tmp = svmax_s8_z(pg, vconst_0, vin); + // Re-quantize to new output space + tmp = requant ? 
svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp; + } + else if (act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) + { + // Perform activation + tmp = svmin_s8_z(pg, va, svmax_s8_z(pg, vconst_0, vin)); + // Re-quantize to new output space + tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp; + } + else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + { + // Perform activation + tmp = svmin_s8_z(pg, va, svmax_s8_z(pg, vb, vin)); + // Re-quantize to new output space + tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp; + } + else if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC) + { + // De-quantize + const auto vin_deq = svdequantize_z(pg, vin, qi_in); + // Perform activation + const svfloat32x4_t tmp_dep = svcreate4_f32( + svdiv_f32_z( + pg, vconst_1, + svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 0))))), + svdiv_f32_z( + pg, vconst_1, + svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 1))))), + svdiv_f32_z( + pg, vconst_1, + svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 2))))), + svdiv_f32_z( + pg, vconst_1, + svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 3)))))); + // Re-quantize to new output space + tmp = svquantize_signed_z(pg, tmp_dep, qi_out); + } + else if (act == ActivationLayerInfo::ActivationFunction::TANH) + { + // De-quantize + const auto vin_deq = svdequantize_z(pg, vin, qi_in); + // Perform activation + const svfloat32x4_t tmp_dep = svcreate4_f32( + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 0), vb_f32))), + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 1), vb_f32))), + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 2), vb_f32))), + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 3), vb_f32)))); + // Re-quantize to new output space + tmp = svquantize_signed_z(pg, tmp_dep, qi_out); + } + else if (act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) + { + // De-quantize + const auto vin_deq = svdequantize_z(pg, vin, qi_in); + // Perform activation + const svfloat32x4_t tmp_dep = svcreate4_f32( + svmul_f32_z(pg, svget4_f32(vin_deq, 0), + svmul_f32_z(pg, const_inv_6_f32, + svmin_f32_z(pg, const_6_f32, + svmax_f32_z(pg, const_0_f32, + svadd_f32_z(pg, svget4_f32(vin_deq, 0), + const_3_f32))))), + svmul_f32_z(pg, svget4_f32(vin_deq, 1), + svmul_f32_z(pg, const_inv_6_f32, + svmin_f32_z(pg, const_6_f32, + svmax_f32_z(pg, const_0_f32, + svadd_f32_z(pg, svget4_f32(vin_deq, 1), + const_3_f32))))), + svmul_f32_z(pg, svget4_f32(vin_deq, 2), + svmul_f32_z(pg, const_inv_6_f32, + svmin_f32_z(pg, const_6_f32, + svmax_f32_z(pg, const_0_f32, + svadd_f32_z(pg, svget4_f32(vin_deq, 2), + const_3_f32))))), + svmul_f32_z(pg, svget4_f32(vin_deq, 3), + svmul_f32_z(pg, const_inv_6_f32, + svmin_f32_z(pg, const_6_f32, + svmax_f32_z(pg, const_0_f32, + svadd_f32_z(pg, svget4_f32(vin_deq, 3), + const_3_f32)))))); + // Re-quantize to new output space + tmp = svquantize_signed_z(pg, tmp_dep, qi_out); + } + else if (act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) + { + svbool_t p0, p1, p2, p3; + svint32x4_t tmp_dep; + + // Expand to int32 + const svint32x4_t vin_s32 = + svcreate4_s32(svmovlb_s32(svmovlb_s16(vin)), svmovlt_s32(svmovlb_s16(vin)), + svmovlb_s32(svmovlt_s16(vin)), svmovlt_s32(svmovlt_s16(vin))); + + // Compare elements to input offset + if (qi_in.scale >= 0) + { + p0 = 
svcmplt_s32(pg, svget4_s32(vin_s32, 0), voffset_in); + p1 = svcmplt_s32(pg, svget4_s32(vin_s32, 1), voffset_in); + p2 = svcmplt_s32(pg, svget4_s32(vin_s32, 2), voffset_in); + p3 = svcmplt_s32(pg, svget4_s32(vin_s32, 3), voffset_in); + } + else + { + p0 = svcmpgt_s32(pg, svget4_s32(vin_s32, 0), voffset_in); + p1 = svcmpgt_s32(pg, svget4_s32(vin_s32, 1), voffset_in); + p2 = svcmpgt_s32(pg, svget4_s32(vin_s32, 2), voffset_in); + p3 = svcmpgt_s32(pg, svget4_s32(vin_s32, 3), voffset_in); + } + + // Multiply negative elements and requantize if necessary + if (requant) + { + tmp_dep = svcreate4_s32( + svasr_n_s32_m(pg, + svmla_s32_m(pg, svsel(p0, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 0), + svsel(p0, vs_leaky_s32, vs_s32)), + 8), + svasr_n_s32_m(pg, + svmla_s32_m(pg, svsel(p1, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 1), + svsel(p1, vs_leaky_s32, vs_s32)), + 8), + svasr_n_s32_m(pg, + svmla_s32_m(pg, svsel(p2, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 2), + svsel(p2, vs_leaky_s32, vs_s32)), + 8), + svasr_n_s32_m(pg, + svmla_s32_m(pg, svsel(p3, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 3), + svsel(p3, vs_leaky_s32, vs_s32)), + 8)); + } + else + { + tmp_dep = svcreate4_s32( + svasr_n_s32_m(p0, svmad_s32_m(p0, svget4_s32(vin_s32, 0), vs_leaky_s32, vo_leaky_s32), 8), + svasr_n_s32_m(p1, svmad_s32_m(p1, svget4_s32(vin_s32, 1), vs_leaky_s32, vo_leaky_s32), 8), + svasr_n_s32_m(p2, svmad_s32_m(p2, svget4_s32(vin_s32, 2), vs_leaky_s32, vo_leaky_s32), 8), + svasr_n_s32_m(p3, svmad_s32_m(p3, svget4_s32(vin_s32, 3), vs_leaky_s32, vo_leaky_s32), 8)); + } + + // Convert uint32 vectors to uint16 vectors (with saturation) + const auto v_low_s16 = svqxtnt_s32(svqxtnb_s32(svget4_s32(tmp_dep, 0)), svget4_s32(tmp_dep, 1)); + const auto v_high_s16 = svqxtnt_s32(svqxtnb_s32(svget4_s32(tmp_dep, 2)), svget4_s32(tmp_dep, 3)); + + // convert uint16 vectors to uint8 vectors (with saturation) + tmp = svqxtnt_s16(svqxtnb_s16(v_low_s16), v_high_s16); + } + else + { + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + + svst1_s8(pg, output_ptr + x, tmp); + + x += svcntb(); + pg = svwhilelt_b8(x, window_end_x); + + } while (svptest_any(svptrue_b8(), pg)); + }, + input, output); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/activation/generic/sve2/qsymm16.cpp b/src/cpu/kernels/activation/generic/sve2/qsymm16.cpp new file mode 100644 index 0000000000..f955893307 --- /dev/null +++ b/src/cpu/kernels/activation/generic/sve2/qsymm16.cpp @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2020-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/experimental/Types.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensorPack.h" +#include "arm_compute/core/Window.h" +#include "arm_compute/function_info/ActivationLayerInfo.h" + +#include "src/core/NEON/SVEMath.h" +#include "src/core/NEON/SVESymm.h" + +#include <arm_sve.h> +#include <cmath> +#include <cstddef> + +namespace arm_compute +{ +namespace cpu +{ +void sve2_qsymm16_activation(const ITensor *src, + ITensor *dst, + const ActivationLayerInfo &act_info, + const Window &window) +{ + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const ActivationLayerInfo::ActivationFunction act = act_info.activation(); + + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(src, win_collapsed); + Iterator output(dst, win_collapsed); + + const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform(); + const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform(); + const auto vconst_1 = svdup_n_f32(1.f); + const auto va_f32 = svdup_n_f32(act_info.a()); + const auto vb_f32 = svdup_n_f32(act_info.b()); + + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast<const int16_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr()); + + svint16_t tmp; + + int x = window_start_x; + svbool_t pg = svwhilelt_b16(x, window_end_x); + do + { + const auto vin = svld1_s16(pg, input_ptr + x); + if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC) + { + // De-quantize + auto vin_deq = svdequantize_qsymm16_z(pg, vin, qi_in.scale); + // Perform activation + const svfloat32x2_t tmp_dep = svcreate2_f32( + svdiv_f32_z( + pg, vconst_1, + svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget2_f32(vin_deq, 0))))), + svdiv_f32_z( + pg, vconst_1, + svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget2_f32(vin_deq, 1)))))); + // Re-quantize to new output space + tmp = svquantize_qsymm16_z(pg, tmp_dep, qi_out.scale); + } + else if (act == ActivationLayerInfo::ActivationFunction::TANH) + { + // De-quantize + auto vin_deq = svdequantize_qsymm16_z(pg, vin, qi_in.scale); + // Perform activation + const svfloat32x2_t tmp_dep = svcreate2_f32( + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget2_f32(vin_deq, 0), vb_f32))), + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget2_f32(vin_deq, 1), vb_f32)))); + // Re-quantize to new output space + tmp = svquantize_qsymm16_z(pg, tmp_dep, qi_out.scale); + } + else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + { + // De-quantize + auto vin_deq = svdequantize_qsymm16_z(pg, vin, qi_in.scale); + // Perform activation + const svfloat32x2_t tmp_dep = + svcreate2_f32(svmin_f32_z(pg, va_f32, svmax_f32_z(pg, vb_f32, svget2_f32(vin_deq, 0))), + svmin_f32_z(pg, va_f32, svmax_f32_z(pg, vb_f32, svget2_f32(vin_deq, 1)))); + // Re-quantize to new output space + tmp = svquantize_qsymm16_z(pg, tmp_dep, qi_out.scale); + } + else + { + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + + 
svst1_s16(pg, output_ptr + x, tmp); + + x += svcnth(); + pg = svwhilelt_b16(x, window_end_x); + + } while (svptest_any(svptrue_b16(), pg)); + }, + input, output); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/activation/list.h b/src/cpu/kernels/activation/list.h new file mode 100644 index 0000000000..8c24adc3fe --- /dev/null +++ b/src/cpu/kernels/activation/list.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2020-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_CPU_KERNELS_ACTIVATION_LIST_H +#define ACL_SRC_CPU_KERNELS_ACTIVATION_LIST_H + +namespace arm_compute +{ +namespace cpu +{ +#define DECLARE_ACTIVATION_KERNEL(func_name) \ + void func_name(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) + +#ifdef __aarch64__ +DECLARE_ACTIVATION_KERNEL(neon_q8_activation_lut); +#endif // __aarch64__ +DECLARE_ACTIVATION_KERNEL(sve2_q8_activation_lut); +DECLARE_ACTIVATION_KERNEL(neon_qasymm8_activation); +DECLARE_ACTIVATION_KERNEL(sve2_qasymm8_activation); +DECLARE_ACTIVATION_KERNEL(neon_qasymm8_signed_activation); +DECLARE_ACTIVATION_KERNEL(sve2_qasymm8_signed_activation); +DECLARE_ACTIVATION_KERNEL(neon_qsymm16_activation); +DECLARE_ACTIVATION_KERNEL(sve2_qsymm16_activation); +DECLARE_ACTIVATION_KERNEL(sve_fp16_activation); +DECLARE_ACTIVATION_KERNEL(sve_fp16_activation_lut); +DECLARE_ACTIVATION_KERNEL(sve_fp32_activation); +DECLARE_ACTIVATION_KERNEL(neon_fp16_activation); +DECLARE_ACTIVATION_KERNEL(neon_fp32_activation); + +#undef DECLARE_ACTIVATION_KERNEL +} // namespace cpu +} // namespace arm_compute + +#endif // ACL_SRC_CPU_KERNELS_ACTIVATION_LIST_H diff --git a/src/cpu/kernels/add/generic/neon/fp16.cpp b/src/cpu/kernels/add/generic/neon/fp16.cpp new file mode 100644 index 0000000000..e7679c14e3 --- /dev/null +++ b/src/cpu/kernels/add/generic/neon/fp16.cpp @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2021-2023 Arm Limited. 
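list.h above declares every kernel entry point through a short-lived macro so that all of them share one signature definition; the macro is #undef'd right after use. The same pattern in isolation, with made-up kernel names:

    // One place defines the common signature; each kernel gets a one-line declaration.
    #define DECLARE_UNARY_KERNEL(func_name) \
        void func_name(const float *src, float *dst, int length)

    DECLARE_UNARY_KERNEL(neon_fp32_square);
    DECLARE_UNARY_KERNEL(sve_fp32_square);

    #undef DECLARE_UNARY_KERNEL

Changing the shared signature then touches a single line instead of every declaration.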
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + +#include "src/cpu/CpuTypes.h" +#include "src/cpu/kernels/add/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void add_fp16_neon( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +{ + return add_same_neon<float16_t>(src0, src1, dst, policy, window); +} +} // namespace cpu +} // namespace arm_compute +#endif /* (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/add/generic/neon/fp32.cpp b/src/cpu/kernels/add/generic/neon/fp32.cpp new file mode 100644 index 0000000000..11a970bef4 --- /dev/null +++ b/src/cpu/kernels/add/generic/neon/fp32.cpp @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2021-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
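add_fp16_neon above is compiled only when the toolchain reports FP16 vector arithmetic and the build enables FP16 kernels; on other targets the translation unit is empty and the symbol is simply absent. A minimal standalone illustration of the compile-time guard, with a made-up function:

    #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)

    #include <arm_neon.h>

    // Only built for targets with FP16 vector support (e.g. -march=armv8.2-a+fp16).
    void scale_fp16(const float16_t *src, float16_t *dst, int n, float16_t factor)
    {
        int i = 0;
        for (; i <= n - 8; i += 8)
        {
            vst1q_f16(dst + i, vmulq_n_f16(vld1q_f16(src + i), factor));
        }
        for (; i < n; ++i)
        {
            dst[i] = src[i] * factor; // scalar leftovers
        }
    }

    #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC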
+ */ + +#include "src/cpu/kernels/add/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void add_fp32_neon( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +{ + return add_same_neon<float>(src0, src1, dst, policy, window); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/add/generic/neon/impl.cpp b/src/cpu/kernels/add/generic/neon/impl.cpp new file mode 100644 index 0000000000..34938cc4c4 --- /dev/null +++ b/src/cpu/kernels/add/generic/neon/impl.cpp @@ -0,0 +1,723 @@ +/* + * Copyright (c) 2020-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/cpu/kernels/add/generic/neon/impl.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/utils/misc/Traits.h" + +#include "src/core/NEON/wrapper/wrapper.h" +namespace arm_compute +{ +namespace cpu +{ +bool sub_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst) +{ + return add_sub_q8_neon_fixedpoint_possible(src0, src1, dst, false); +} + +bool add_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst) +{ + return add_sub_q8_neon_fixedpoint_possible(src0, src1, dst, true); +} + +bool add_sub_q8_neon_fixedpoint_possible(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + bool is_addition) +{ + const auto iq0 = src0->quantization_info().uniform(); + const auto iq1 = src1->quantization_info().uniform(); + const auto oq = dst->quantization_info().uniform(); + + const auto scale0 = iq0.scale / oq.scale; + const auto scale1 = iq1.scale / oq.scale; + + if (scale0 < -15.f || scale0 > 15.f || scale1 < -15.f || scale1 > 15.f) + { + // The scale factor cannot be stored as 5.11 signed fixed-point number. + return false; + } + + const auto offset = float(oq.offset) - scale0 * float(iq0.offset) - scale1 * float(iq1.offset); + + const auto max_acc = is_addition ? ((std::abs(scale0) + std::abs(scale1)) * 256.f + std::abs(offset)) + : ((std::abs(scale0) - std::abs(scale1)) * 256.f + std::abs(offset)); + + if (max_acc > 1048575.f) // 2^20 - 1 + { + // It might not be possible to store the result as 21.11 signed fixed-point number. 
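+        // Reviewer note (illustrative, not part of the original patch): as a worked example of this
+        // bound, scale0 = scale1 = 4 with offset = 100 gives max_acc = (4 + 4) * 256 + 100 = 2148,
+        // far below 2^20 - 1 = 1048575, so such a case stays on the fixed-point path; the check only
+        // rejects combinations whose accumulator could not be held in 21.11 format.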
+ return false; + } + + return true; +} + +template <typename ScalarType> +void add_q8_neon_fixedpoint( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +{ + add_sub_q8_neon_fixedpoint<ScalarType>(src0, src1, dst, policy, window, true /*is_addition*/); +} + +template <typename ScalarType> +void add_sub_q8_neon_fixedpoint(const ITensor *src0, + const ITensor *src1, + ITensor *dst, + const ConvertPolicy &policy, + const Window &window, + bool is_addition) +{ + ARM_COMPUTE_UNUSED(policy); + + const auto in0_info = src0->info(); + const auto in1_info = src1->info(); + + const auto &in0_shape = in0_info->tensor_shape(); + const auto &in1_shape = in1_info->tensor_shape(); + + // Create input windows. + Window in0_win = window.broadcast_if_dimension_le_one(in0_shape); + Window in1_win = window.broadcast_if_dimension_le_one(in1_shape); + + // Clear the x dimension on the execution window as we process the whole row each iteration. + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + constexpr int window_step_x = 16; + const auto window_start_x = window.x().start(); + const auto window_end_x = window.x().end(); + const auto is_broadcast_across_x = in0_shape.x() != in1_shape.x(); + + const auto iq0_info = in0_info->quantization_info().uniform(); + const auto iq1_info = in1_info->quantization_info().uniform(); + const auto oq_info = dst->info()->quantization_info().uniform(); + const auto in0_scale = iq0_info.scale / oq_info.scale; + const auto in1_scale = is_addition ? (iq1_info.scale / oq_info.scale) : (-(iq1_info.scale / oq_info.scale)); + const auto offset = float(oq_info.offset) - in0_scale * float(iq0_info.offset) - in1_scale * float(iq1_info.offset); + + constexpr float _2pow11 = 2048; + const auto in0_scale_5p11 = static_cast<int16_t>(support::cpp11::lround(in0_scale * _2pow11)); + const auto in1_scale_5p11 = static_cast<int16_t>(support::cpp11::lround(in1_scale * _2pow11)); + const auto offset_21p11 = static_cast<int32_t>(support::cpp11::lround(offset * _2pow11)); + + constexpr uint8_t shift_amount_remainder = 3; + + if (is_broadcast_across_x) + { + // Prefix: a = non-broadcast, b = broadcast. + + const auto is_broadcast_input_1 = in1_win.x().step() == 0; + auto a_win = is_broadcast_input_1 ? in0_win : in1_win; + auto b_win = is_broadcast_input_1 ? in1_win : in0_win; + const auto a_tensor = is_broadcast_input_1 ? src0 : src1; + const auto b_tensor = is_broadcast_input_1 ? src1 : src0; + + const auto a_scale_5p11 = is_broadcast_input_1 ? in0_scale_5p11 : in1_scale_5p11; + const auto b_scale = is_broadcast_input_1 ? in1_scale : in0_scale; + const auto a_vscale_5p11 = wrapper::vdup_n(a_scale_5p11, wrapper::traits::vector_64_tag()); + +#ifndef __aarch64__ + const auto a_scale = is_broadcast_input_1 ? in0_scale : in1_scale; +#endif // __aarch64__ + + // Clear the x dimension on the execution window as we process the whole row each iteration. 
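+    // Reviewer note (illustrative, not part of the original patch): the *_5p11 values above are plain
+    // integers holding scale * 2^11, e.g. a scale of 1.25 is stored as lround(1.25 * 2048) = 2560, and
+    // the offset is widened to 21.11 the same way, so a single widening multiply-accumulate per element
+    // yields the result directly in 21.11 fixed point.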
+ a_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator a_input_it(a_tensor, a_win); + Iterator b_input_it(b_tensor, b_win); + Iterator out_it(dst, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto a_ptr = reinterpret_cast<const ScalarType *>(a_input_it.ptr()); + const auto b_ptr = reinterpret_cast<const ScalarType *>(b_input_it.ptr()); + const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr()); + + const auto b_val = *b_ptr; + const auto b_scaled = b_scale * b_val; + const auto b_scaled_21p11 = static_cast<int32_t>(support::cpp11::lround(b_scaled * _2pow11)); + const auto b_scaled_offseted_21p11 = b_scaled_21p11 + offset_21p11; + const auto b_vscaled_offseted_21p11 = + wrapper::vdup_n(b_scaled_offseted_21p11, wrapper::traits::vector_128_tag()); + +#ifndef __aarch64__ + const auto b_scaled_offseted = b_scaled + offset; +#endif // __aarch64__ + + int x = window_start_x; + + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + // Load the input. + const auto a_vin_8p0 = wrapper::vloadq(a_ptr + x); + + // Widen the non-broadcast elements to signed 16-bit regardless of the input signedness. + const auto a_vin_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(a_vin_8p0))); + const auto a_vin_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(a_vin_8p0))); + + // Multiply the non-broadcast elements by the scale factor, add the scaled broadcast elements and the offset. + // Widen and store the result in 32-bit integer. + const auto vout_21p11_00 = + wrapper::vmlal(b_vscaled_offseted_21p11, wrapper::vgetlow(a_vin_16p0_0), a_vscale_5p11); + const auto vout_21p11_01 = + wrapper::vmlal(b_vscaled_offseted_21p11, wrapper::vgethigh(a_vin_16p0_0), a_vscale_5p11); + const auto vout_21p11_10 = + wrapper::vmlal(b_vscaled_offseted_21p11, wrapper::vgetlow(a_vin_16p0_1), a_vscale_5p11); + const auto vout_21p11_11 = + wrapper::vmlal(b_vscaled_offseted_21p11, wrapper::vgethigh(a_vin_16p0_1), a_vscale_5p11); + + // Remove 3 bits of the fractional part, round, narrow to 16-bit and saturate the result. + const auto vout_8p8_0 = + wrapper::vcombine(wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_00), + wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_01)); + const auto vout_8p8_1 = + wrapper::vcombine(wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_10), + wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_11)); + + // Remove 8 bits of the fractional part, round, narrow to 8-bit and saturate the result. + const auto vout_8p0 = + wrapper::vcombine(wrapper::vqrshrn<8>(vout_8p8_0), wrapper::vqrshrn<8>(vout_8p8_1)); + + // Store the result. + wrapper::vstore(out_ptr + x, vout_8p0); + } + + // Process the left-over elements. 
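+                // Reviewer note (illustrative, not part of the original patch): this scalar tail covers
+                // the final (window_end_x - x) < 16 elements; on AArch64 it applies the same 21.11
+                // round-and-narrow steps to a single int32_t, while the 32-bit build below falls back to
+                // float maths with lround plus a clamp to the output type's range.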
+ for (; x < window_end_x; ++x) + { +#ifdef __aarch64__ + out_ptr[x] = wrapper::vqrshrn<8>(wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>( + int32_t(a_ptr[x]) * a_scale_5p11 + b_scaled_offseted_21p11)); +#else // __aarch64__ + out_ptr[x] = utility::clamp<int, ScalarType>( + support::cpp11::lround(float(a_ptr[x]) * a_scale + b_scaled_offseted)); +#endif // __aarch64__ + } + }, + b_input_it, a_input_it, out_it); + } + else + { + const auto vscale0_5p11 = wrapper::vdup_n(in0_scale_5p11, wrapper::traits::vector_64_tag()); + const auto vscale1_5p11 = wrapper::vdup_n(in1_scale_5p11, wrapper::traits::vector_64_tag()); + const auto voffset_21p11 = wrapper::vdup_n(offset_21p11, wrapper::traits::vector_128_tag()); + + // Clear the x dimension on the execution window as we process the whole row each iteration. + in0_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + in1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator in0_it(src0, in0_win); + Iterator in1_it(src1, in1_win); + Iterator out_it(dst, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto in0_ptr = reinterpret_cast<const ScalarType *>(in0_it.ptr()); + const auto in1_ptr = reinterpret_cast<const ScalarType *>(in1_it.ptr()); + const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr()); + + int x = window_start_x; + + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + // Load the inputs. + const auto vin0_8p0 = wrapper::vloadq(in0_ptr + x); + const auto vin1_8p0 = wrapper::vloadq(in1_ptr + x); + + // Widen the input elements to signed 16-bit regardless of the input signedness. + const auto vin0_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin0_8p0))); + const auto vin0_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin0_8p0))); + const auto vin1_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin1_8p0))); + const auto vin1_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin1_8p0))); + + // Multiply the input elements by the scale factor and add the offset. + // Widen and store the result in 32-bit integer. + const auto vscaled0_offseted_21p11_00 = + wrapper::vmlal(voffset_21p11, wrapper::vgetlow(vin0_16p0_0), vscale0_5p11); + const auto vscaled0_offseted_21p11_01 = + wrapper::vmlal(voffset_21p11, wrapper::vgethigh(vin0_16p0_0), vscale0_5p11); + const auto vscaled0_offseted_21p11_10 = + wrapper::vmlal(voffset_21p11, wrapper::vgetlow(vin0_16p0_1), vscale0_5p11); + const auto vscaled0_offseted_21p11_11 = + wrapper::vmlal(voffset_21p11, wrapper::vgethigh(vin0_16p0_1), vscale0_5p11); + + const auto vout_21p11_00 = + wrapper::vmlal(vscaled0_offseted_21p11_00, wrapper::vgetlow(vin1_16p0_0), vscale1_5p11); + const auto vout_21p11_01 = + wrapper::vmlal(vscaled0_offseted_21p11_01, wrapper::vgethigh(vin1_16p0_0), vscale1_5p11); + const auto vout_21p11_10 = + wrapper::vmlal(vscaled0_offseted_21p11_10, wrapper::vgetlow(vin1_16p0_1), vscale1_5p11); + const auto vout_21p11_11 = + wrapper::vmlal(vscaled0_offseted_21p11_11, wrapper::vgethigh(vin1_16p0_1), vscale1_5p11); + + // Remove 3 bits of the fractional part, round, narrow to 16-bit and saturate the result. 
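+                    // Reviewer note (illustrative, not part of the original patch): the 11 fractional
+                    // bits are dropped in two rounding, saturating narrows, 21.11 -> 8.8 (shift by 3)
+                    // and then 8.8 -> 8.0 (shift by 8). For instance 5.5 in 21.11 is 11264, which
+                    // becomes 1408 in 8.8 and rounds to 6 in 8.0.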
+ const auto vout_8p8_0 = + wrapper::vcombine(wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_00), + wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_01)); + const auto vout_8p8_1 = + wrapper::vcombine(wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_10), + wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_11)); + + // Remove 8 bits of the fractional part, round, narrow to 8-bit and saturate the result. + const auto vout_8p0 = + wrapper::vcombine(wrapper::vqrshrn<8>(vout_8p8_0), wrapper::vqrshrn<8>(vout_8p8_1)); + + // Store the result. + wrapper::vstore(out_ptr + x, vout_8p0); + } + + // Process the left-over elements. + for (; x < window_end_x; ++x) + { +#ifdef __aarch64__ + out_ptr[x] = wrapper::vqrshrn<8>(wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>( + int32_t(in0_ptr[x]) * in0_scale_5p11 + int32_t(in1_ptr[x]) * in1_scale_5p11 + offset_21p11)); +#else // __aarch64__ + out_ptr[x] = utility::clamp<int, ScalarType>( + support::cpp11::lround(float(in0_ptr[x]) * in0_scale + float(in1_ptr[x]) * in1_scale + offset)); +#endif // __aarch64__ + } + }, + in0_it, in1_it, out_it); + } +} + +void add_sub_qasymm8_neon(const ITensor *src0, + const ITensor *src1, + ITensor *dst, + const ConvertPolicy &policy, + const Window &window, + bool is_addition) +{ + ARM_COMPUTE_UNUSED(policy); + + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + constexpr int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); + + const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform(); + const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform(); + const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); + + const auto scale1 = iq1_info.scale / oq_info.scale; + const auto scale2 = is_addition ? (iq2_info.scale / oq_info.scale) : (-(iq2_info.scale / oq_info.scale)); + const auto offset = float(oq_info.offset) - scale1 * float(iq1_info.offset) - scale2 * float(iq2_info.offset); + + if (is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0; + + const auto af_scale = is_broadcast_input_2 ? scale1 : scale2; + const auto bf_scale = is_broadcast_input_2 ? 
scale2 : scale1; + const auto vscale1 = vdupq_n_f32(af_scale); + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(dst, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = non_broadcast_input.ptr(); + const auto output_ptr = output.ptr(); + + const auto broadcast_value = *broadcast_input.ptr(); + const auto bf = vdupq_n_f32(float(broadcast_value) * scale2 + offset); + const auto bfs = float(broadcast_value) * bf_scale + offset; + + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t a = vld1q_u8(non_broadcast_input_ptr + x); + + const auto a_u16_0 = vmovl_u8(vget_low_u8(a)); + const auto a_u16_1 = vmovl_u8(vget_high_u8(a)); + + const auto af_0 = vmlaq_f32(bf, vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_u16_0))), vscale1); + const auto af_1 = vmlaq_f32(bf, vcvtq_f32_u32(vmovl_u16(vget_high_u16(a_u16_0))), vscale1); + const auto af_2 = vmlaq_f32(bf, vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_u16_1))), vscale1); + const auto af_3 = vmlaq_f32(bf, vcvtq_f32_u32(vmovl_u16(vget_high_u16(a_u16_1))), vscale1); + + int32x4_t rf_0{}; + int32x4_t rf_1{}; + int32x4_t rf_2{}; + int32x4_t rf_3{}; + +#ifdef __aarch64__ + rf_0 = vcvtnq_s32_f32(af_0); + rf_1 = vcvtnq_s32_f32(af_1); + rf_2 = vcvtnq_s32_f32(af_2); + rf_3 = vcvtnq_s32_f32(af_3); +#else //__aarch64__ + rf_0 = vcvtq_s32_f32(af_0); + rf_1 = vcvtq_s32_f32(af_1); + rf_2 = vcvtq_s32_f32(af_2); + rf_3 = vcvtq_s32_f32(af_3); +#endif //__aarch64__ + + const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1))); + const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3))); + vst1q_u8(output_ptr + x, vcombine_u8(pa, pb)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto result = float(non_broadcast_input_ptr[x]) * af_scale + bfs; +#ifdef __aarch64__ + output_ptr[x] = utility::clamp<int, uint8_t>(support::cpp11::lround(result)); +#else // __aarch64__ + output_ptr[x] = utility::clamp<int, uint8_t>(support::cpp11::trunc(result)); +#endif // __aarch64__ + } + }, + broadcast_input, non_broadcast_input, output); + } + else + { + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(src0, input1_win); + Iterator input2(src1, input2_win); + Iterator output(dst, win); + + const auto vscale1 = vdupq_n_f32(scale1); + const auto vscale2 = vdupq_n_f32(scale2); + const auto voffset = vdupq_n_f32(offset); + + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto input1_ptr = input1.ptr(); + const auto input2_ptr = input2.ptr(); + const auto output_ptr = output.ptr(); + + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t a = vld1q_u8(input1_ptr + x); + const uint8x16_t b = vld1q_u8(input2_ptr + x); + + const auto a_u16_0 = vmovl_u8(vget_low_u8(a)); + const auto a_u16_1 = vmovl_u8(vget_high_u8(a)); + const auto b_u16_0 = vmovl_u8(vget_low_u8(b)); + const auto b_u16_1 = vmovl_u8(vget_high_u8(b)); + + const auto af_0 = vmlaq_f32(voffset, 
vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_u16_0))), vscale1); + const auto af_1 = vmlaq_f32(voffset, vcvtq_f32_u32(vmovl_u16(vget_high_u16(a_u16_0))), vscale1); + const auto af_2 = vmlaq_f32(voffset, vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_u16_1))), vscale1); + const auto af_3 = vmlaq_f32(voffset, vcvtq_f32_u32(vmovl_u16(vget_high_u16(a_u16_1))), vscale1); + + const auto bf_0 = vmlaq_f32(af_0, vcvtq_f32_u32(vmovl_u16(vget_low_u16(b_u16_0))), vscale2); + const auto bf_1 = vmlaq_f32(af_1, vcvtq_f32_u32(vmovl_u16(vget_high_u16(b_u16_0))), vscale2); + const auto bf_2 = vmlaq_f32(af_2, vcvtq_f32_u32(vmovl_u16(vget_low_u16(b_u16_1))), vscale2); + const auto bf_3 = vmlaq_f32(af_3, vcvtq_f32_u32(vmovl_u16(vget_high_u16(b_u16_1))), vscale2); + + int32x4_t rf_0{}; + int32x4_t rf_1{}; + int32x4_t rf_2{}; + int32x4_t rf_3{}; + +#ifdef __aarch64__ + rf_0 = vcvtnq_s32_f32(bf_0); + rf_1 = vcvtnq_s32_f32(bf_1); + rf_2 = vcvtnq_s32_f32(bf_2); + rf_3 = vcvtnq_s32_f32(bf_3); +#else //__aarch64__ + rf_0 = vcvtq_s32_f32(bf_0); + rf_1 = vcvtq_s32_f32(bf_1); + rf_2 = vcvtq_s32_f32(bf_2); + rf_3 = vcvtq_s32_f32(bf_3); +#endif //__aarch64__ + + const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1))); + const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3))); + vst1q_u8(output_ptr + x, vcombine_u8(pa, pb)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto result = float(input1_ptr[x]) * scale1 + float(input2_ptr[x]) * scale2 + offset; +#ifdef __aarch64__ + output_ptr[x] = utility::clamp<int, uint8_t>(support::cpp11::lround(result)); +#else // __aarch64__ + output_ptr[x] = utility::clamp<int, uint8_t>(support::cpp11::trunc(result)); +#endif // __aarch64__ + } + }, + input1, input2, output); + } +} + +void add_sub_qasymm8_signed_neon(const ITensor *src0, + const ITensor *src1, + ITensor *dst, + const ConvertPolicy &policy, + const Window &window, + bool is_addition) +{ + ARM_COMPUTE_UNUSED(policy); + + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + constexpr int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); + + const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform(); + const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform(); + const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); + + const auto scale1 = iq1_info.scale / oq_info.scale; + const auto scale2 = is_addition ? (iq2_info.scale / oq_info.scale) : (-(iq2_info.scale / oq_info.scale)); + const auto offset = float(oq_info.offset) - scale1 * float(iq1_info.offset) - scale2 * float(iq2_info.offset); + + if (is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? 
src1 : src0; + + const auto af_scale = is_broadcast_input_2 ? scale1 : scale2; + const auto bf_scale = is_broadcast_input_2 ? scale2 : scale1; + const auto vscale1 = vdupq_n_f32(af_scale); + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(dst, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr()); + + const auto broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr()); + const auto bf = vdupq_n_f32(float(broadcast_value) * scale2 + offset); + const auto bfs = float(broadcast_value) * bf_scale + offset; + + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int8x16_t a = vld1q_s8(non_broadcast_input_ptr + x); + + const auto a_s16_0 = vmovl_s8(vget_low_s8(a)); + const auto a_s16_1 = vmovl_s8(vget_high_s8(a)); + + const auto af_0 = vmlaq_f32(bf, vcvtq_f32_s32(vmovl_s16(vget_low_s16(a_s16_0))), vscale1); + const auto af_1 = vmlaq_f32(bf, vcvtq_f32_s32(vmovl_s16(vget_high_s16(a_s16_0))), vscale1); + const auto af_2 = vmlaq_f32(bf, vcvtq_f32_s32(vmovl_s16(vget_low_s16(a_s16_1))), vscale1); + const auto af_3 = vmlaq_f32(bf, vcvtq_f32_s32(vmovl_s16(vget_high_s16(a_s16_1))), vscale1); + + int32x4_t rf_0{}; + int32x4_t rf_1{}; + int32x4_t rf_2{}; + int32x4_t rf_3{}; + +#ifdef __aarch64__ + rf_0 = vcvtnq_s32_f32(af_0); + rf_1 = vcvtnq_s32_f32(af_1); + rf_2 = vcvtnq_s32_f32(af_2); + rf_3 = vcvtnq_s32_f32(af_3); +#else //__aarch64__ + rf_0 = vcvtq_s32_f32(af_0); + rf_1 = vcvtq_s32_f32(af_1); + rf_2 = vcvtq_s32_f32(af_2); + rf_3 = vcvtq_s32_f32(af_3); +#endif //__aarch64__ + + const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1))); + const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3))); + vst1q_s8(output_ptr + x, vcombine_s8(pa, pb)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto result = float(non_broadcast_input_ptr[x]) * af_scale + bfs; +#ifdef __aarch64__ + output_ptr[x] = utility::clamp<int, int8_t>(support::cpp11::lround(result)); +#else // __aarch64__ + output_ptr[x] = utility::clamp<int, int8_t>(support::cpp11::trunc(result)); +#endif // __aarch64__ + } + }, + broadcast_input, non_broadcast_input, output); + } + else + { + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(src0, input1_win); + Iterator input2(src1, input2_win); + Iterator output(dst, win); + + const auto vscale1 = vdupq_n_f32(scale1); + const auto vscale2 = vdupq_n_f32(scale2); + const auto voffset = vdupq_n_f32(offset); + + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr()); + + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int8x16_t a = vld1q_s8(input1_ptr 
+ x); + const int8x16_t b = vld1q_s8(input2_ptr + x); + + const auto a_s16_0 = vmovl_s8(vget_low_s8(a)); + const auto a_s16_1 = vmovl_s8(vget_high_s8(a)); + const auto b_s16_0 = vmovl_s8(vget_low_s8(b)); + const auto b_s16_1 = vmovl_s8(vget_high_s8(b)); + + const auto af_0 = vmlaq_f32(voffset, vcvtq_f32_s32(vmovl_s16(vget_low_s16(a_s16_0))), vscale1); + const auto af_1 = vmlaq_f32(voffset, vcvtq_f32_s32(vmovl_s16(vget_high_s16(a_s16_0))), vscale1); + const auto af_2 = vmlaq_f32(voffset, vcvtq_f32_s32(vmovl_s16(vget_low_s16(a_s16_1))), vscale1); + const auto af_3 = vmlaq_f32(voffset, vcvtq_f32_s32(vmovl_s16(vget_high_s16(a_s16_1))), vscale1); + + const auto bf_0 = vmlaq_f32(af_0, vcvtq_f32_s32(vmovl_s16(vget_low_s16(b_s16_0))), vscale2); + const auto bf_1 = vmlaq_f32(af_1, vcvtq_f32_s32(vmovl_s16(vget_high_s16(b_s16_0))), vscale2); + const auto bf_2 = vmlaq_f32(af_2, vcvtq_f32_s32(vmovl_s16(vget_low_s16(b_s16_1))), vscale2); + const auto bf_3 = vmlaq_f32(af_3, vcvtq_f32_s32(vmovl_s16(vget_high_s16(b_s16_1))), vscale2); + + int32x4_t rf_0{}; + int32x4_t rf_1{}; + int32x4_t rf_2{}; + int32x4_t rf_3{}; + +#ifdef __aarch64__ + rf_0 = vcvtnq_s32_f32(bf_0); + rf_1 = vcvtnq_s32_f32(bf_1); + rf_2 = vcvtnq_s32_f32(bf_2); + rf_3 = vcvtnq_s32_f32(bf_3); +#else //__aarch64__ + rf_0 = vcvtq_s32_f32(bf_0); + rf_1 = vcvtq_s32_f32(bf_1); + rf_2 = vcvtq_s32_f32(bf_2); + rf_3 = vcvtq_s32_f32(bf_3); +#endif //__aarch64__ + + const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1))); + const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3))); + vst1q_s8(output_ptr + x, vcombine_s8(pa, pb)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto result = float(input1_ptr[x]) * scale1 + float(input2_ptr[x]) * scale2 + offset; +#ifdef __aarch64__ + output_ptr[x] = utility::clamp<int, int8_t>(support::cpp11::lround(result)); +#else // __aarch64__ + output_ptr[x] = utility::clamp<int, int8_t>(support::cpp11::trunc(result)); +#endif // __aarch64__ + } + }, + input1, input2, output); + } +} + +template void add_q8_neon_fixedpoint<int8_t>( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); +template void add_q8_neon_fixedpoint<uint8_t>( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); + +template void add_sub_q8_neon_fixedpoint<int8_t>(const ITensor *src0, + const ITensor *src1, + ITensor *dst, + const ConvertPolicy &policy, + const Window &window, + bool is_addition); +template void add_sub_q8_neon_fixedpoint<uint8_t>(const ITensor *src0, + const ITensor *src1, + ITensor *dst, + const ConvertPolicy &policy, + const Window &window, + bool is_addition); + +void add_sub_qasymm8_neon(const ITensor *src0, + const ITensor *src1, + ITensor *dst, + const ConvertPolicy &policy, + const Window &window, + bool is_addition); +void add_sub_qasymm8_signed_neon(const ITensor *src0, + const ITensor *src1, + ITensor *dst, + const ConvertPolicy &policy, + const Window &window, + bool is_addition); + +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/add/generic/neon/impl.h b/src/cpu/kernels/add/generic/neon/impl.h new file mode 100644 index 0000000000..faa99baffe --- /dev/null +++ b/src/cpu/kernels/add/generic/neon/impl.h @@ -0,0 +1,183 @@ +/* + * Copyright (c) 2021-2023 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CORE_NEON_KERNELS_ADD_IMPL_H +#define SRC_CORE_NEON_KERNELS_ADD_IMPL_H +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/Traits.h" +#include "arm_compute/core/Window.h" + +#include "src/core/NEON/wrapper/wrapper.h" + +namespace arm_compute +{ +namespace cpu +{ +template <typename ScalarType> +void add_same_neon( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +{ + /** SIMD vector tag type. */ + using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<ScalarType, wrapper::traits::BitWidth::W128>; + + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + constexpr int window_step_x = 16 / sizeof(ScalarType); + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); + + if (is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? 
src1 : src0; + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(dst, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast<const ScalarType *>(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr()); + + const ScalarType broadcast_value = *reinterpret_cast<const ScalarType *>(broadcast_input.ptr()); + const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{}); + + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x); + const auto res = (policy == ConvertPolicy::SATURATE) + ? wrapper::vqadd(broadcast_value_vec, non_broadcast_v) + : wrapper::vadd(broadcast_value_vec, non_broadcast_v); + wrapper::vstore(output_ptr + x, res); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto non_broadcast_v = *(non_broadcast_input_ptr + x); + *(output_ptr + x) = (policy == ConvertPolicy::SATURATE) + ? wrapper::add_sat(broadcast_value, non_broadcast_v) + : broadcast_value + non_broadcast_v; + } + }, + broadcast_input, non_broadcast_input, output); + } + else + { + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(src0, input1_win); + Iterator input2(src1, input2_win); + Iterator output(dst, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr()); + + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto val1 = wrapper::vloadq(input1_ptr + x); + const auto val2 = wrapper::vloadq(input2_ptr + x); + const auto res = + (policy == ConvertPolicy::SATURATE) ? wrapper::vqadd(val1, val2) : wrapper::vadd(val1, val2); + wrapper::vstore(output_ptr + x, res); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto val1 = *(input1_ptr + x); + const auto val2 = *(input2_ptr + x); + *(output_ptr + x) = + (policy == ConvertPolicy::SATURATE) ? 
wrapper::add_sat(val1, val2) : val1 + val2; + } + }, + input1, input2, output); + } +} + +bool add_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst); + +bool sub_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst); + +bool add_sub_q8_neon_fixedpoint_possible(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + bool is_addition); + +void add_sub_qasymm8_neon(const ITensor *src0, + const ITensor *src1, + ITensor *dst, + const ConvertPolicy &policy, + const Window &window, + bool is_addition); + +void add_sub_qasymm8_signed_neon(const ITensor *src0, + const ITensor *src1, + ITensor *dst, + const ConvertPolicy &policy, + const Window &window, + bool is_addition); + +template <typename ScalarType> +void add_q8_neon_fixedpoint( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); + +template <typename ScalarType> +void add_sub_q8_neon_fixedpoint(const ITensor *src0, + const ITensor *src1, + ITensor *dst, + const ConvertPolicy &policy, + const Window &window, + bool is_addition); +} // namespace cpu +} // namespace arm_compute +#endif // SRC_CORE_NEON_KERNELS_ADD_IMPL_H diff --git a/src/cpu/kernels/add/generic/neon/integer.cpp b/src/cpu/kernels/add/generic/neon/integer.cpp new file mode 100644 index 0000000000..f0bcebc9d2 --- /dev/null +++ b/src/cpu/kernels/add/generic/neon/integer.cpp @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2021-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "src/cpu/kernels/add/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void add_u8_neon( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +{ + return add_same_neon<uint8_t>(src0, src1, dst, policy, window); +} + +void add_s16_neon( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +{ + return add_same_neon<int16_t>(src0, src1, dst, policy, window); +} + +void add_s32_neon( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +{ + return add_same_neon<int32_t>(src0, src1, dst, policy, window); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/add/generic/neon/qasymm8.cpp b/src/cpu/kernels/add/generic/neon/qasymm8.cpp new file mode 100644 index 0000000000..8195d229d9 --- /dev/null +++ b/src/cpu/kernels/add/generic/neon/qasymm8.cpp @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2020-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" + +#include "src/cpu/kernels/add/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void add_qasymm8_neon( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +{ + add_sub_qasymm8_neon(src0, src1, dst, policy, window, true /*is_addition*/); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/add/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/add/generic/neon/qasymm8_signed.cpp new file mode 100644 index 0000000000..7e23096239 --- /dev/null +++ b/src/cpu/kernels/add/generic/neon/qasymm8_signed.cpp @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2020-2022 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" + +#include "src/cpu/kernels/add/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void add_qasymm8_signed_neon( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +{ + add_sub_qasymm8_signed_neon(src0, src1, dst, policy, window, true /*is_addition*/); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/add/generic/neon/qsymm16.cpp b/src/cpu/kernels/add/generic/neon/qsymm16.cpp new file mode 100644 index 0000000000..ac2de0557a --- /dev/null +++ b/src/cpu/kernels/add/generic/neon/qsymm16.cpp @@ -0,0 +1,180 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/Traits.h" + +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" + +namespace arm_compute +{ +namespace cpu +{ +void add_qsymm16_neon( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +{ + ARM_COMPUTE_UNUSED(policy); + + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const int window_step_x = 8; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); + + const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform(); + const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform(); + const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); + + const float32x4_t vscale1 = vdupq_n_f32(iq1_info.scale); + const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale); + const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale); + + if (is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? 
src1 : src0; + const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform(); + const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(dst, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast<const int16_t *>(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr()); + + const int16_t broadcast_value = *reinterpret_cast<const int16_t *>(broadcast_input.ptr()); + const int16x8_t broadcast_value_vec = vdupq_n_s16(broadcast_value); + + const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(broadcast_value_vec))), vscale2); + const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(broadcast_value_vec))), vscale2); + const float bfs = static_cast<int32_t>(broadcast_value) * broadcast_qinfo.scale; + + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int16x8_t a = vld1q_s16(non_broadcast_input_ptr + x); + const auto af_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1); + const auto af_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1); + + int32x4_t rf_0{}; + int32x4_t rf_1{}; +#ifdef __aarch64__ + rf_0 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo)); + rf_1 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo)); +#else //__aarch64__ + rf_0 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo)); + rf_1 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo)); +#endif //__aarch64__ + + const int16x8_t pa = vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)); + vst1q_s16(output_ptr + x, pa); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const float afs = static_cast<int32_t>(*(non_broadcast_input_ptr + x)) * non_broadcast_qinfo.scale; + *(output_ptr + x) = quantize_qsymm16((afs + bfs), oq_info); + } + }, + broadcast_input, non_broadcast_input, output); + } + else + { + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(src0, input1_win); + Iterator input2(src1, input2_win); + Iterator output(dst, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr()); + + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int16x8_t a = vld1q_s16(input1_ptr + x); + const int16x8_t b = vld1q_s16(input2_ptr + x); + + const auto af_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1); + const auto af_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1); + const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(b))), vscale2); + const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(b))), vscale2); + + int32x4_t rf_0{}; 
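+                    // Reviewer note (illustrative, not part of the original patch): rf_0/rf_1 hold the
+                    // requantised sums round((a * iq1.scale + b * iq2.scale) / oq.scale); QSYMM16 has no
+                    // zero point, so no offset term appears, and vcvtnq (round to nearest) only exists on
+                    // AArch64, hence the truncating vcvtq fallback in the other branch.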
+ int32x4_t rf_1{}; +#ifdef __aarch64__ + rf_0 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo)); + rf_1 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo)); +#else //__aarch64__ + rf_0 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo)); + rf_1 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo)); +#endif //__aarch64__ + + const int16x8_t pa = vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)); + vst1q_s16(output_ptr + x, pa); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const float afs = static_cast<int32_t>((*(input1_ptr + x))) * iq1_info.scale; + const float bfs = static_cast<int32_t>((*(input2_ptr + x))) * iq2_info.scale; + *(output_ptr + x) = quantize_qsymm16((afs + bfs), dst->info()->quantization_info()); + } + }, + input1, input2, output); + } +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/add/generic/sve/fp16.cpp b/src/cpu/kernels/add/generic/sve/fp16.cpp new file mode 100644 index 0000000000..01dfe6c44b --- /dev/null +++ b/src/cpu/kernels/add/generic/sve/fp16.cpp @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2021-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + +#include "src/cpu/CpuTypes.h" +#include "src/cpu/kernels/add/generic/sve/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void add_fp16_sve( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +{ + return add_same_sve<float16_t>(src0, src1, dst, policy, window); +} +} // namespace cpu +} // namespace arm_compute +#endif /* (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/add/generic/sve/fp32.cpp b/src/cpu/kernels/add/generic/sve/fp32.cpp new file mode 100644 index 0000000000..56771a5411 --- /dev/null +++ b/src/cpu/kernels/add/generic/sve/fp32.cpp @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2021-2022 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" + +#include "src/cpu/kernels/add/generic/sve/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void add_fp32_sve( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +{ + return add_same_sve<float>(src0, src1, dst, policy, window); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/add/generic/sve/impl.cpp b/src/cpu/kernels/add/generic/sve/impl.cpp new file mode 100644 index 0000000000..ca850fcef4 --- /dev/null +++ b/src/cpu/kernels/add/generic/sve/impl.cpp @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2021-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "src/cpu/kernels/add/generic/sve/impl.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/utils/misc/Traits.h" + +#include "src/core/NEON/SVEMath.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" + +#include <arm_sve.h> +namespace arm_compute +{ +namespace cpu +{ +template <typename ScalarType> +void add_same_sve( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +{ + const auto all_true_pg = wrapper::svptrue<ScalarType>(); + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); + const bool is_sat = (policy == ConvertPolicy::SATURATE); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + + Iterator input1(src0, window.broadcast_if_dimension_le_one(src0->info()->tensor_shape())); + Iterator input2(src1, window.broadcast_if_dimension_le_one(src1->info()->tensor_shape())); + Iterator output(dst, window); + + if (is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0; + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(dst, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast<const ScalarType *>(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr()); + + const ScalarType broadcast_value = *reinterpret_cast<const ScalarType *>(broadcast_input.ptr()); + const auto broadcast_value_vec = wrapper::svdup_n(broadcast_value); + + int x = window_start_x; + svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x); + do + { + const auto non_broadcast_v = svld1(pg, non_broadcast_input_ptr + x); + auto res = is_sat ? 
wrapper::svqadd(broadcast_value_vec, non_broadcast_v) + : svadd_z(pg, broadcast_value_vec, non_broadcast_v); + svst1(pg, output_ptr + x, res); + + x += wrapper::svcnt<ScalarType>(); + pg = wrapper::svwhilelt<ScalarType>(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); + }, + broadcast_input, non_broadcast_input, output); + } + else + { + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(src0, input1_win); + Iterator input2(src1, input2_win); + Iterator output(dst, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr()); + + int x = window_start_x; + svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x); + do + { + const auto val1 = svld1(pg, input1_ptr + x); + const auto val2 = svld1(pg, input2_ptr + x); + const auto res = is_sat ? wrapper::svqadd(val1, val2) : svadd_z(pg, val1, val2); + svst1(pg, output_ptr + x, res); + + x += wrapper::svcnt<ScalarType>(); + pg = wrapper::svwhilelt<ScalarType>(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); + }, + input1, input2, output); + } +} +template void add_same_sve<float>( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); +template void add_same_sve<uint8_t>( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); +template void add_same_sve<int16_t>( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); +template void add_same_sve<int32_t>( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) +template void add_same_sve<float16_t>( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); +#endif /* (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/add/generic/sve/impl.h b/src/cpu/kernels/add/generic/sve/impl.h new file mode 100644 index 0000000000..6a95d66826 --- /dev/null +++ b/src/cpu/kernels/add/generic/sve/impl.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2021-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef SRC_CORE_SVE_KERNELS_ADD_IMPL_H +#define SRC_CORE_SVE_KERNELS_ADD_IMPL_H +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Window.h" + +namespace arm_compute +{ +namespace cpu +{ +template <typename ScalarType> +void add_same_sve( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); +} // namespace cpu +} // namespace arm_compute +#endif // SRC_CORE_SVE_KERNELS_ADD_IMPL_H diff --git a/src/cpu/kernels/add/generic/sve/integer.cpp b/src/cpu/kernels/add/generic/sve/integer.cpp new file mode 100644 index 0000000000..4d17f2adbd --- /dev/null +++ b/src/cpu/kernels/add/generic/sve/integer.cpp @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2021-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" + +#include "src/cpu/kernels/add/generic/sve/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void add_u8_sve( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +{ + return add_same_sve<uint8_t>(src0, src1, dst, policy, window); +} + +void add_s16_sve( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +{ + return add_same_sve<int16_t>(src0, src1, dst, policy, window); +} + +void add_s32_sve( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +{ + return add_same_sve<int32_t>(src0, src1, dst, policy, window); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/add/generic/sve2/qasymm8.cpp b/src/cpu/kernels/add/generic/sve2/qasymm8.cpp new file mode 100644 index 0000000000..40add9d51b --- /dev/null +++ b/src/cpu/kernels/add/generic/sve2/qasymm8.cpp @@ -0,0 +1,262 @@ +/* + * Copyright (c) 2020-2022 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/Traits.h" + +#include "src/core/NEON/SVEMath.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" + +#include <arm_sve.h> + +namespace arm_compute +{ +namespace cpu +{ +void add_qasymm8_sve2( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +{ + ARM_COMPUTE_UNUSED(policy); + + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); + const auto all_true_pg = svptrue_b8(); + + const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform(); + const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform(); + const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); + + const auto invvscaleo = svdup_n_f32(1.f / oq_info.scale); + const auto voffseto = svdup_n_f32(oq_info.offset); + + if (is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0; + + const svfloat32_t vscale1 = is_broadcast_input_2 ? svdup_n_f32(iq1_info.scale) : svdup_n_f32(iq2_info.scale); + const svfloat32_t vscale2 = is_broadcast_input_2 ? svdup_n_f32(iq2_info.scale) : svdup_n_f32(iq1_info.scale); + const svint32_t voffset1 = is_broadcast_input_2 ? svdup_n_s32(iq1_info.offset) : svdup_n_s32(iq2_info.offset); + const svint32_t voffset2 = is_broadcast_input_2 ? 
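// Note: vscale1/voffset1 always hold the quantization parameters of the non-broadcast input and vscale2/voffset2 those of the broadcast scalar, hence the swap when input 1 is the broadcast operand: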
svdup_n_s32(iq2_info.offset) : svdup_n_s32(iq1_info.offset); + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(dst, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr()); + + const uint8_t broadcast_value = *reinterpret_cast<const uint8_t *>(broadcast_input.ptr()); + const svuint8_t broadcast_value_vec = svdup_n_u8(broadcast_value); + + int x = window_start_x; + svbool_t pg = svwhilelt_b8(x, window_end_x); + + const auto bf_0 = svmul_f32_z( + pg, + svcvt_f32_s32_z( + pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(broadcast_value_vec))), + voffset2)), + vscale2); + const auto bf_1 = svmul_f32_z( + pg, + svcvt_f32_s32_z( + pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(broadcast_value_vec))), + voffset2)), + vscale2); + const auto bf_2 = svmul_f32_z( + pg, + svcvt_f32_s32_z( + pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(broadcast_value_vec))), + voffset2)), + vscale2); + const auto bf_3 = svmul_f32_z( + pg, + svcvt_f32_s32_z( + pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(broadcast_value_vec))), + voffset2)), + vscale2); + + do + { + const svuint8_t a = svld1_u8(pg, non_broadcast_input_ptr + x); + + const auto af_0 = svmul_f32_z( + pg, + svcvt_f32_s32_z(pg, + svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(a))), voffset1)), + vscale1); + const auto af_1 = svmul_f32_z( + pg, + svcvt_f32_s32_z(pg, + svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(a))), voffset1)), + vscale1); + const auto af_2 = svmul_f32_z( + pg, + svcvt_f32_s32_z(pg, + svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(a))), voffset1)), + vscale1); + const auto af_3 = svmul_f32_z( + pg, + svcvt_f32_s32_z(pg, + svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(a))), voffset1)), + vscale1); + + const auto rf_0 = + svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo)); + const auto rf_1 = + svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo)); + const auto rf_2 = + svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo)); + const auto rf_3 = + svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo)); + + const auto pa = svqxtnt_u32(svqxtnb_u32(rf_0), rf_1); + const auto pb = svqxtnt_u32(svqxtnb_u32(rf_2), rf_3); + + const auto res = svqxtnt_u16(svqxtnb_u16(pa), pb); + svst1_u8(pg, output_ptr + x, res); + + x += svcntb(); + pg = svwhilelt_b8(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); + }, + broadcast_input, non_broadcast_input, output); + } + else + { + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(src0, input1_win); + Iterator input2(src1, input2_win); + Iterator output(dst, win); + + const auto vscale1 = svdup_n_f32(iq1_info.scale); + const auto vscale2 = svdup_n_f32(iq2_info.scale); + const auto voffset1 = svdup_n_s32(iq1_info.offset); + const auto voffset2 = 
svdup_n_s32(iq2_info.offset); + + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr()); + + int x = window_start_x; + svbool_t pg = svwhilelt_b8(x, window_end_x); + do + { + const auto a = svld1_u8(pg, input1_ptr + x); + const auto b = svld1_u8(pg, input2_ptr + x); + const auto af_0 = svmul_f32_z( + pg, + svcvt_f32_s32_z(pg, + svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(a))), voffset1)), + vscale1); + const auto af_1 = svmul_f32_z( + pg, + svcvt_f32_s32_z(pg, + svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(a))), voffset1)), + vscale1); + const auto af_2 = svmul_f32_z( + pg, + svcvt_f32_s32_z(pg, + svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(a))), voffset1)), + vscale1); + const auto af_3 = svmul_f32_z( + pg, + svcvt_f32_s32_z(pg, + svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(a))), voffset1)), + vscale1); + + const auto bf_0 = svmul_f32_z( + pg, + svcvt_f32_s32_z(pg, + svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(b))), voffset2)), + vscale2); + const auto bf_1 = svmul_f32_z( + pg, + svcvt_f32_s32_z(pg, + svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(b))), voffset2)), + vscale2); + const auto bf_2 = svmul_f32_z( + pg, + svcvt_f32_s32_z(pg, + svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(b))), voffset2)), + vscale2); + const auto bf_3 = svmul_f32_z( + pg, + svcvt_f32_s32_z(pg, + svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(b))), voffset2)), + vscale2); + + const auto rf_0 = + svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo)); + const auto rf_1 = + svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo)); + const auto rf_2 = + svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo)); + const auto rf_3 = + svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo)); + + const auto pa = svqxtnt_u32(svqxtnb_u32(rf_0), rf_1); + const auto pb = svqxtnt_u32(svqxtnb_u32(rf_2), rf_3); + const auto res = svqxtnt_u16(svqxtnb_u16(pa), pb); + + svst1_u8(pg, output_ptr + x, res); + + x += svcntb(); + pg = svwhilelt_b8(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); + }, + input1, input2, output); + } +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/add/generic/sve2/qasymm8_signed.cpp b/src/cpu/kernels/add/generic/sve2/qasymm8_signed.cpp new file mode 100644 index 0000000000..2e585115e1 --- /dev/null +++ b/src/cpu/kernels/add/generic/sve2/qasymm8_signed.cpp @@ -0,0 +1,213 @@ +/* + * Copyright (c) 2020-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/Traits.h" + +#include "src/core/NEON/SVEMath.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" + +#include <arm_sve.h> + +namespace arm_compute +{ +namespace cpu +{ +void add_qasymm8_signed_sve2( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +{ + ARM_COMPUTE_UNUSED(policy); + + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); + + const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform(); + const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform(); + const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); + + const auto invvscaleo = svdup_n_f32(1.f / oq_info.scale); + const auto voffseto = svdup_n_f32(oq_info.offset); + + if (is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0; + const auto all_true_pg = svptrue_b8(); + + const auto vscale1 = is_broadcast_input_2 ? svdup_n_f32(iq1_info.scale) : svdup_n_f32(iq2_info.scale); + const auto vscale2 = is_broadcast_input_2 ? svdup_n_f32(iq2_info.scale) : svdup_n_f32(iq1_info.scale); + const auto voffset1 = is_broadcast_input_2 ? svdup_n_s32(iq1_info.offset) : svdup_n_s32(iq2_info.offset); + const auto voffset2 = is_broadcast_input_2 ? 
svdup_n_s32(iq2_info.offset) : svdup_n_s32(iq1_info.offset); + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(dst, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr()); + + const int8_t broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr()); + const auto broadcast_value_vec = svdup_n_s8(broadcast_value); + + int x = window_start_x; + svbool_t pg = svwhilelt_b8(x, window_end_x); + const auto bf_0 = svmul_f32_z( + pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(broadcast_value_vec)), voffset2)), + vscale2); + const auto bf_1 = svmul_f32_z( + pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(broadcast_value_vec)), voffset2)), + vscale2); + const auto bf_2 = svmul_f32_z( + pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(broadcast_value_vec)), voffset2)), + vscale2); + const auto bf_3 = svmul_f32_z( + pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(broadcast_value_vec)), voffset2)), + vscale2); + + do + { + const auto a = svld1_s8(pg, non_broadcast_input_ptr + x); + const auto af_0 = svmul_f32_z( + pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(a)), voffset1)), vscale1); + const auto af_1 = svmul_f32_z( + pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(a)), voffset1)), vscale1); + const auto af_2 = svmul_f32_z( + pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(a)), voffset1)), vscale1); + const auto af_3 = svmul_f32_z( + pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(a)), voffset1)), vscale1); + + const auto rf_0 = + svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo)); + const auto rf_1 = + svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo)); + const auto rf_2 = + svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo)); + const auto rf_3 = + svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo)); + + const auto pa = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1); + const auto pb = svqxtnt_s32(svqxtnb_s32(rf_2), rf_3); + const auto res = svqxtnt_s16(svqxtnb_s16(pa), pb); + + svst1_s8(pg, output_ptr + x, res); + + x += svcntb(); + pg = svwhilelt_b8(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); + }, + broadcast_input, non_broadcast_input, output); + } + else + { + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(src0, input1_win); + Iterator input2(src1, input2_win); + Iterator output(dst, win); + + const auto vscale1 = svdup_n_f32(iq1_info.scale); + const auto vscale2 = svdup_n_f32(iq2_info.scale); + const auto voffset1 = svdup_n_s32(iq1_info.offset); + const auto voffset2 = svdup_n_s32(iq2_info.offset); + + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr()); + const auto output_ptr = 
reinterpret_cast<int8_t *>(output.ptr()); + + int x = window_start_x; + svbool_t pg = svwhilelt_b8(x, window_end_x); + do + { + const auto a = svld1_s8(pg, input1_ptr + x); + const auto b = svld1_s8(pg, input2_ptr + x); + + const auto af_0 = svmul_f32_z( + pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(a)), voffset1)), vscale1); + const auto af_1 = svmul_f32_z( + pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(a)), voffset1)), vscale1); + const auto af_2 = svmul_f32_z( + pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(a)), voffset1)), vscale1); + const auto af_3 = svmul_f32_z( + pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(a)), voffset1)), vscale1); + + const auto bf_0 = svmul_f32_z( + pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(b)), voffset2)), vscale2); + const auto bf_1 = svmul_f32_z( + pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(b)), voffset2)), vscale2); + const auto bf_2 = svmul_f32_z( + pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(b)), voffset2)), vscale2); + const auto bf_3 = svmul_f32_z( + pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(b)), voffset2)), vscale2); + + const auto rf_0 = + svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo)); + const auto rf_1 = + svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo)); + const auto rf_2 = + svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo)); + const auto rf_3 = + svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo)); + + const auto pa = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1); + const auto pb = svqxtnt_s32(svqxtnb_s32(rf_2), rf_3); + const auto res = svqxtnt_s16(svqxtnb_s16(pa), pb); + + svst1_s8(pg, output_ptr + x, res); + + x += svcntb(); + pg = svwhilelt_b8(x, window_end_x); + } while (svptest_any(svptrue_b8(), pg)); + }, + input1, input2, output); + } +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/add/generic/sve2/qsymm16.cpp b/src/cpu/kernels/add/generic/sve2/qsymm16.cpp new file mode 100644 index 0000000000..17a42c2138 --- /dev/null +++ b/src/cpu/kernels/add/generic/sve2/qsymm16.cpp @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2020-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/Traits.h" + +#include "src/core/NEON/SVEMath.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" + +#include <arm_sve.h> + +namespace arm_compute +{ +namespace cpu +{ +void add_qsymm16_sve2( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +{ + ARM_COMPUTE_UNUSED(policy); + + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); + + const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform(); + const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform(); + const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); + + const auto vscale1 = svdup_n_f32(iq1_info.scale); + const auto vscale2 = svdup_n_f32(iq2_info.scale); + const auto invvscaleo = svdup_n_f32(1.f / oq_info.scale); + const auto all_true_pg = svptrue_b16(); + + if (is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? 
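// QSYMM16 is symmetric (zero offset), so only the per-tensor scales are applied: inputs are dequantized with q * scale, added in f32, rescaled by 1 / scale_out and narrowed back to s16 with saturation: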
src1 : src0; + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(dst, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast<const int16_t *>(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr()); + + const int16_t broadcast_value = *reinterpret_cast<const int16_t *>(broadcast_input.ptr()); + const auto broadcast_value_vec = svdup_n_s16(broadcast_value); + + int x = window_start_x; + svbool_t pg = svwhilelt_b16(x, window_end_x); + + const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(broadcast_value_vec)), vscale2); + const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(broadcast_value_vec)), vscale2); + + do + { + const auto a = svld1_s16(pg, non_broadcast_input_ptr + x); + const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(a)), vscale1); + const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(a)), vscale1); + + const auto rf_0 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_0, bf_0), invvscaleo)); + const auto rf_1 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_1, bf_1), invvscaleo)); + + const auto res = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1); + + svst1_s16(pg, output_ptr + x, res); + + x += svcnth(); + pg = svwhilelt_b16(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); + }, + broadcast_input, non_broadcast_input, output); + } + else + { + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(src0, input1_win); + Iterator input2(src1, input2_win); + Iterator output(dst, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr()); + + int x = window_start_x; + svbool_t pg = svwhilelt_b16(x, window_end_x); + do + { + auto a = svld1_s16(pg, input1_ptr + x); + auto b = svld1_s16(pg, input2_ptr + x); + + const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(a)), vscale1); + const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(a)), vscale1); + + const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(b)), vscale2); + const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(b)), vscale2); + + const auto rf_0 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_0, bf_0), invvscaleo)); + const auto rf_1 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_1, bf_1), invvscaleo)); + + const auto res = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1); + svst1_s16(pg, output_ptr + x, res); + + x += svcnth(); + pg = svwhilelt_b16(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); + }, + input1, input2, output); + } +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/add/list.h b/src/cpu/kernels/add/list.h new file mode 100644 index 0000000000..1040c39a41 --- /dev/null +++ b/src/cpu/kernels/add/list.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2020-2022 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CORE_KERNELS_ADD_LIST_H +#define SRC_CORE_KERNELS_ADD_LIST_H + +#include "src/cpu/kernels/add/generic/neon/impl.h" +#include "src/cpu/kernels/add/generic/sve/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +#define DECLARE_ADD_KERNEL(func_name) \ + void func_name(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, \ + const Window &window) + +DECLARE_ADD_KERNEL(add_qasymm8_neon); +DECLARE_ADD_KERNEL(add_qasymm8_signed_neon); +DECLARE_ADD_KERNEL(add_qsymm16_neon); +DECLARE_ADD_KERNEL(add_fp32_neon); +DECLARE_ADD_KERNEL(add_fp16_neon); +DECLARE_ADD_KERNEL(add_u8_neon); +DECLARE_ADD_KERNEL(add_s16_neon); +DECLARE_ADD_KERNEL(add_s32_neon); +DECLARE_ADD_KERNEL(add_fp32_sve); +DECLARE_ADD_KERNEL(add_fp16_sve); +DECLARE_ADD_KERNEL(add_u8_sve); +DECLARE_ADD_KERNEL(add_s16_sve); +DECLARE_ADD_KERNEL(add_s32_sve); +DECLARE_ADD_KERNEL(add_qasymm8_sve2); +DECLARE_ADD_KERNEL(add_qasymm8_signed_sve2); +DECLARE_ADD_KERNEL(add_qsymm16_sve2); + +#undef DECLARE_ADD_KERNEL + +} // namespace cpu +} // namespace arm_compute +#endif // SRC_CORE_KERNELS_ADD_LIST_H diff --git a/src/cpu/kernels/addmuladd/generic/neon/fp16.cpp b/src/cpu/kernels/addmuladd/generic/neon/fp16.cpp new file mode 100644 index 0000000000..b4b81aa78b --- /dev/null +++ b/src/cpu/kernels/addmuladd/generic/neon/fp16.cpp @@ -0,0 +1,965 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Window.h" +#include "arm_compute/function_info/ActivationLayerInfo.h" + +#include "src/cpu/CpuTypes.h" + +#include <cstddef> +#include <cstdint> +#include <limits> + +#if defined(__aarch64__) && defined(ENABLE_FP16_KERNELS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +namespace +{ +using arm_compute::float16_t; + +void a64_add_bn_clamp_direct_fp16_2x32(float16_t *out, + size_t out_stride, + float16_t *out_direct, + size_t out_direct_stride, + const float16_t *in0, + size_t in0_stride, + const float16_t *in1, + size_t in1_stride, + const float16_t *bn_mul, + const float16_t *bn_add, + const float16_t minval, + const float16_t maxval, + size_t width, + size_t height) +{ + struct KernelArgs + { + float16_t minval; + float16_t maxval; + } ka; + ka.minval = minval; + ka.maxval = maxval; + + __asm__ __volatile__( + "ldr w21, [%x[args_ptr], %[offsetof_minval]]\n" + "ldr w20, [%x[args_ptr], %[offsetof_maxval]]\n" + "cmp %x[width], #0x20\n" + "dup v13.8h, w21\n" + "dup v12.8h, w20\n" + "blt 7f\n" + "1:" // Column loop + "ldr q24, [%x[bn_mul], #0x0]\n" + "ldr q25, [%x[bn_mul], #0x10]\n" + "mov x12, %x[in0]\n" + "mov x11, %x[in1]\n" + "ldr q26, [%x[bn_mul], #0x20]\n" + "ldr q27, [%x[bn_mul], #0x30]\n" + "mov x10, %x[out]\n" + "mov x9, %x[out_direct]\n" + "ldr q28, [%x[bn_add], #0x0]\n" + "ldr q29, [%x[bn_add], #0x10]\n" + "mov x20, %x[height]\n" + "mov x28, x12\n" + "ldr q30, [%x[bn_add], #0x20]\n" + "ldr q31, [%x[bn_add], #0x30]\n" + "mov x27, x11\n" + "mov x26, x10\n" + "ldr q11, [x28, #0x0]\n" + "ldr q10, [x27, #0x0]\n" + "mov x25, x9\n" + "add x24, x28, %x[in0_stride]\n" + "ldr q9, [x28, #0x10]\n" + "ldr q8, [x27, #0x10]\n" + "add x23, x27, %x[in1_stride]\n" + "add x22, x26, %x[out_stride]\n" + "ldr q7, [x28, #0x20]\n" + "ldr q6, [x27, #0x20]\n" + "add x21, x25, %x[out_direct_stride]\n" + "cmp x20, #0x2\n" + "ldr q5, [x28, #0x30]\n" + "ldr q4, [x27, #0x30]\n" + "add x12, x24, %x[in0_stride]\n" + "add x11, x23, %x[in1_stride]\n" + "add x10, x22, %x[out_stride]\n" + "add x9, x21, %x[out_direct_stride]\n" + "csel x24, x24, x28, GE\n" + "csel x23, x23, x27, GE\n" + "csel x22, x22, x26, GE\n" + "csel x21, x21, x25, GE\n" + "subs x20, x20, #0x2\n" + "add %x[bn_mul], %x[bn_mul], #0x40\n" + "add %x[bn_add], %x[bn_add], #0x40\n" + "add x28, x28, #0x40\n" + "add x27, x27, #0x40\n" + "ble 4f\n" + "2:" // Row loop + "ldr q3, [x24, #0x0]\n" + "ldr q22, [x23, #0x0]\n" + "fadd v2.8h, v11.8h, v10.8h\n" + "fadd v1.8h, v9.8h, v8.8h\n" + "ldr q21, [x24, #0x10]\n" + "ldr q20, [x23, #0x10]\n" + "fadd v0.8h, v7.8h, v6.8h\n" + "fadd v23.8h, v5.8h, v4.8h\n" + "ldr q19, [x24, #0x20]\n" + "ldr q18, [x23, #0x20]\n" + "fadd v22.8h, v3.8h, v22.8h\n" + "fadd v21.8h, v21.8h, v20.8h\n" + "ldr q17, [x24, #0x30]\n" + "ldr q16, [x23, #0x30]\n" + "fadd v20.8h, v19.8h, v18.8h\n" + "fadd v19.8h, v17.8h, v16.8h\n" + "add x24, x24, #0x40\n" + "add x23, x23, #0x40\n" + "cbz %x[out_direct], 3f\n" + "str q2, [x25, #0x0]\n" + "str q1, [x25, #0x10]\n" + "str q0, [x25, #0x20]\n" + "str q23, [x25, #0x30]\n" + "add x25, x25, #0x40\n" + "str q22, [x21, #0x0]\n" + "str q21, [x21, #0x10]\n" + "str q20, [x21, #0x20]\n" + "str 
q19, [x21, #0x30]\n" + "add x21, x21, #0x40\n" + "3:" // Main loop: No direct output + "mov v16.16b, v2.16b\n" + "mov v2.16b, v28.16b\n" + "fmla v2.8h, v16.8h, v24.8h\n" + "mov x28, x12\n" + "ldr q11, [x28, #0x0]\n" + "ldr q9, [x28, #0x10]\n" + "mov v18.16b, v1.16b\n" + "mov v1.16b, v29.16b\n" + "ldr q7, [x28, #0x20]\n" + "ldr q5, [x28, #0x30]\n" + "mov v17.16b, v0.16b\n" + "mov v0.16b, v30.16b\n" + "mov v16.16b, v23.16b\n" + "mov v23.16b, v31.16b\n" + "fmla v1.8h, v18.8h, v25.8h\n" + "mov x27, x11\n" + "ldr q10, [x27, #0x0]\n" + "ldr q8, [x27, #0x10]\n" + "fmla v0.8h, v17.8h, v26.8h\n" + "fmla v23.8h, v16.8h, v27.8h\n" + "ldr q6, [x27, #0x20]\n" + "ldr q4, [x27, #0x30]\n" + "mov v17.16b, v22.16b\n" + "mov v22.16b, v28.16b\n" + "mov v16.16b, v21.16b\n" + "mov v21.16b, v29.16b\n" + "fmla v22.8h, v17.8h, v24.8h\n" + "mov x25, x9\n" + "mov v17.16b, v20.16b\n" + "mov v20.16b, v30.16b\n" + "fmla v21.8h, v16.8h, v25.8h\n" + "add x24, x28, %x[in0_stride]\n" + "mov v16.16b, v19.16b\n" + "mov v19.16b, v31.16b\n" + "fmla v20.8h, v17.8h, v26.8h\n" + "add x23, x27, %x[in1_stride]\n" + "fmla v19.8h, v16.8h, v27.8h\n" + "fmin v2.8h, v2.8h, v12.8h\n" + "add x21, x25, %x[out_direct_stride]\n" + "cmp x20, #0x2\n" + "fmin v1.8h, v1.8h, v12.8h\n" + "fmin v0.8h, v0.8h, v12.8h\n" + "add x12, x24, %x[in0_stride]\n" + "add x11, x23, %x[in1_stride]\n" + "fmin v23.8h, v23.8h, v12.8h\n" + "fmax v2.8h, v2.8h, v13.8h\n" + "str q2, [x26, #0x0]\n" + "add x9, x21, %x[out_direct_stride]\n" + "fmax v1.8h, v1.8h, v13.8h\n" + "fmax v0.8h, v0.8h, v13.8h\n" + "str q1, [x26, #0x10]\n" + "csel x24, x24, x28, GE\n" + "fmax v23.8h, v23.8h, v13.8h\n" + "fmin v22.8h, v22.8h, v12.8h\n" + "str q0, [x26, #0x20]\n" + "csel x23, x23, x27, GE\n" + "fmin v21.8h, v21.8h, v12.8h\n" + "fmin v20.8h, v20.8h, v12.8h\n" + "str q23, [x26, #0x30]\n" + "mov x26, x10\n" + "fmin v19.8h, v19.8h, v12.8h\n" + "fmax v22.8h, v22.8h, v13.8h\n" + "str q22, [x22, #0x0]\n" + "csel x21, x21, x25, GE\n" + "fmax v21.8h, v21.8h, v13.8h\n" + "fmax v20.8h, v20.8h, v13.8h\n" + "str q21, [x22, #0x10]\n" + "add x28, x28, #0x40\n" + "fmax v19.8h, v19.8h, v13.8h\n" + "str q20, [x22, #0x20]\n" + "add x27, x27, #0x40\n" + "str q19, [x22, #0x30]\n" + "add x22, x26, %x[out_stride]\n" + "add x10, x22, %x[out_stride]\n" + "csel x22, x22, x26, GE\n" + "subs x20, x20, #0x2\n" + "bgt 2b\n" + "4:" // Row loop skip + "ldr q3, [x24, #0x0]\n" + "ldr q22, [x23, #0x0]\n" + "fadd v2.8h, v11.8h, v10.8h\n" + "fadd v1.8h, v9.8h, v8.8h\n" + "ldr q21, [x24, #0x10]\n" + "ldr q20, [x23, #0x10]\n" + "fadd v0.8h, v7.8h, v6.8h\n" + "fadd v23.8h, v5.8h, v4.8h\n" + "ldr q19, [x24, #0x20]\n" + "ldr q18, [x23, #0x20]\n" + "fadd v22.8h, v3.8h, v22.8h\n" + "fadd v21.8h, v21.8h, v20.8h\n" + "ldr q17, [x24, #0x30]\n" + "ldr q16, [x23, #0x30]\n" + "fadd v20.8h, v19.8h, v18.8h\n" + "fadd v19.8h, v17.8h, v16.8h\n" + "add x24, x24, #0x40\n" + "add x23, x23, #0x40\n" + "cbz %x[out_direct], 5f\n" + "str q2, [x25, #0x0]\n" + "str q1, [x25, #0x10]\n" + "str q0, [x25, #0x20]\n" + "str q23, [x25, #0x30]\n" + "add x25, x25, #0x40\n" + "str q22, [x21, #0x0]\n" + "str q21, [x21, #0x10]\n" + "str q20, [x21, #0x20]\n" + "str q19, [x21, #0x30]\n" + "add x21, x21, #0x40\n" + "5:" // Tail loop: No direct output + "mov v16.16b, v2.16b\n" + "mov v2.16b, v28.16b\n" + "fmla v2.8h, v16.8h, v24.8h\n" + "add %x[in0], %x[in0], #0x40\n" + "mov v16.16b, v1.16b\n" + "mov v1.16b, v29.16b\n" + "fmla v1.8h, v16.8h, v25.8h\n" + "add %x[in1], %x[in1], #0x40\n" + "mov v16.16b, v0.16b\n" + "mov v0.16b, v30.16b\n" + "fmla v0.8h, v16.8h, 
v26.8h\n" + "add %x[out], %x[out], #0x40\n" + "mov v16.16b, v23.16b\n" + "mov v23.16b, v31.16b\n" + "fmla v23.8h, v16.8h, v27.8h\n" + "mov v16.16b, v22.16b\n" + "mov v22.16b, v28.16b\n" + "fmla v22.8h, v16.8h, v24.8h\n" + "mov v16.16b, v21.16b\n" + "mov v21.16b, v29.16b\n" + "fmla v21.8h, v16.8h, v25.8h\n" + "mov v16.16b, v20.16b\n" + "mov v20.16b, v30.16b\n" + "fmla v20.8h, v16.8h, v26.8h\n" + "mov v16.16b, v19.16b\n" + "mov v19.16b, v31.16b\n" + "fmla v19.8h, v16.8h, v27.8h\n" + "fmin v2.8h, v2.8h, v12.8h\n" + "fmin v1.8h, v1.8h, v12.8h\n" + "fmin v0.8h, v0.8h, v12.8h\n" + "fmin v23.8h, v23.8h, v12.8h\n" + "fmin v22.8h, v22.8h, v12.8h\n" + "fmin v21.8h, v21.8h, v12.8h\n" + "fmin v20.8h, v20.8h, v12.8h\n" + "fmin v19.8h, v19.8h, v12.8h\n" + "fmax v2.8h, v2.8h, v13.8h\n" + "fmax v1.8h, v1.8h, v13.8h\n" + "str q2, [x26, #0x0]\n" + "fmax v0.8h, v0.8h, v13.8h\n" + "fmax v23.8h, v23.8h, v13.8h\n" + "str q1, [x26, #0x10]\n" + "fmax v22.8h, v22.8h, v13.8h\n" + "fmax v21.8h, v21.8h, v13.8h\n" + "str q0, [x26, #0x20]\n" + "fmax v20.8h, v20.8h, v13.8h\n" + "fmax v19.8h, v19.8h, v13.8h\n" + "str q23, [x26, #0x30]\n" + "add x26, x26, #0x40\n" + "str q22, [x22, #0x0]\n" + "str q21, [x22, #0x10]\n" + "str q20, [x22, #0x20]\n" + "str q19, [x22, #0x30]\n" + "add x22, x22, #0x40\n" + "cbz %x[out_direct], 6f\n" + "add %x[out_direct], %x[out_direct], #0x40\n" + "6:" // No direct pointer update + "sub %x[width], %x[width], #0x20\n" + "cmp %x[width], #0x20\n" + "bge 1b\n" + "cbz %x[width], 58f\n" + "7:" // main loop skip + "ldr q24, [%x[bn_mul], #0x0]\n" + "ldr q25, [%x[bn_mul], #0x10]\n" + "mov x20, %x[height]\n" + "mov x12, %x[in0]\n" + "ldr q26, [%x[bn_mul], #0x20]\n" + "ldr q27, [%x[bn_mul], #0x30]\n" + "mov x11, %x[in1]\n" + "mov x10, %x[out]\n" + "ldr q28, [%x[bn_add], #0x0]\n" + "ldr q29, [%x[bn_add], #0x10]\n" + "mov x9, %x[out_direct]\n" + "add %x[bn_mul], %x[bn_mul], #0x40\n" + "ldr q30, [%x[bn_add], #0x20]\n" + "ldr q31, [%x[bn_add], #0x30]\n" + "add %x[bn_add], %x[bn_add], #0x40\n" + "8:" // tail loop: Row loop + "mov x28, x12\n" + "mov x27, x11\n" + "mov x26, x10\n" + "mov x25, x9\n" + "add x24, x28, %x[in0_stride]\n" + "add x23, x27, %x[in1_stride]\n" + "add x22, x26, %x[out_stride]\n" + "add x21, x25, %x[out_direct_stride]\n" + "cmp x20, #0x2\n" + "add x12, x24, %x[in0_stride]\n" + "add x11, x23, %x[in1_stride]\n" + "add x10, x22, %x[out_stride]\n" + "add x9, x21, %x[out_direct_stride]\n" + "csel x24, x24, x28, GE\n" + "csel x23, x23, x27, GE\n" + "csel x22, x22, x26, GE\n" + "csel x21, x21, x25, GE\n" + "tbz %x[width], #4, 16f\n" + "ldr q11, [x28, #0x0]\n" + "ldr q10, [x27, #0x0]\n" + "ldr q9, [x28, #0x10]\n" + "ldr q8, [x27, #0x10]\n" + "add x28, x28, #0x20\n" + "add x27, x27, #0x20\n" + "ldr q3, [x24, #0x0]\n" + "ldr q22, [x23, #0x0]\n" + "ldr q21, [x24, #0x10]\n" + "ldr q20, [x23, #0x10]\n" + "add x24, x24, #0x20\n" + "add x23, x23, #0x20\n" + "tbz %x[width], #3, 12f\n" + "ldr q7, [x28, #0x0]\n" + "ldr q6, [x27, #0x0]\n" + "add x28, x28, #0x10\n" + "add x27, x27, #0x10\n" + "ldr q19, [x24, #0x0]\n" + "ldr q18, [x23, #0x0]\n" + "add x24, x24, #0x10\n" + "add x23, x23, #0x10\n" + "tbz %x[width], #2, 10f\n" + "ldr d5, [x28], #0x8\n" + "ldr d4, [x27], #0x8\n" + "ldr d17, [x24], #0x8\n" + "ldr d16, [x23], #0x8\n" + "tbz %x[width], #1, 9f\n" + "ld1 { v5.s }[2], [x28], #0x4\n" + "ld1 { v4.s }[2], [x27], #0x4\n" + "ld1 { v17.s }[2], [x24], #0x4\n" + "ld1 { v16.s }[2], [x23], #0x4\n" + "tbz %x[width], #0, 24f\n" + "ld1 { v5.h }[6], [x28], #0x2\n" + "ld1 { v4.h }[6], [x27], #0x2\n" + "ld1 { v17.h 
}[6], [x24], #0x2\n" + "ld1 { v16.h }[6], [x23], #0x2\n" + "b 24f\n" + "9:" // tail loop: unique 1: partial_0_28 + "tbz %x[width], #0, 24f\n" + "ld1 { v5.h }[4], [x28], #0x2\n" + "ld1 { v4.h }[4], [x27], #0x2\n" + "ld1 { v17.h }[4], [x24], #0x2\n" + "ld1 { v16.h }[4], [x23], #0x2\n" + "b 24f\n" + "10:" // tail loop: unique 1: partial_1_24 + "tbz %x[width], #1, 11f\n" + "ldr s5, [x28], #0x4\n" + "ldr s4, [x27], #0x4\n" + "ldr s17, [x24], #0x4\n" + "ldr s16, [x23], #0x4\n" + "tbz %x[width], #0, 24f\n" + "ld1 { v5.h }[2], [x28], #0x2\n" + "ld1 { v4.h }[2], [x27], #0x2\n" + "ld1 { v17.h }[2], [x24], #0x2\n" + "ld1 { v16.h }[2], [x23], #0x2\n" + "b 24f\n" + "11:" // tail loop: unique 1: partial_0_24 + "tbz %x[width], #0, 24f\n" + "ldr h5, [x28], #0x2\n" + "ldr h4, [x27], #0x2\n" + "ldr h17, [x24], #0x2\n" + "ldr h16, [x23], #0x2\n" + "b 24f\n" + "12:" // tail loop: unique 1: partial_2_16 + "tbz %x[width], #2, 14f\n" + "ldr d7, [x28], #0x8\n" + "ldr d6, [x27], #0x8\n" + "ldr d19, [x24], #0x8\n" + "ldr d18, [x23], #0x8\n" + "tbz %x[width], #1, 13f\n" + "ld1 { v7.s }[2], [x28], #0x4\n" + "ld1 { v6.s }[2], [x27], #0x4\n" + "ld1 { v19.s }[2], [x24], #0x4\n" + "ld1 { v18.s }[2], [x23], #0x4\n" + "tbz %x[width], #0, 24f\n" + "ld1 { v7.h }[6], [x28], #0x2\n" + "ld1 { v6.h }[6], [x27], #0x2\n" + "ld1 { v19.h }[6], [x24], #0x2\n" + "ld1 { v18.h }[6], [x23], #0x2\n" + "b 24f\n" + "13:" // tail loop: unique 1: partial_0_20 + "tbz %x[width], #0, 24f\n" + "ld1 { v7.h }[4], [x28], #0x2\n" + "ld1 { v6.h }[4], [x27], #0x2\n" + "ld1 { v19.h }[4], [x24], #0x2\n" + "ld1 { v18.h }[4], [x23], #0x2\n" + "b 24f\n" + "14:" // tail loop: unique 1: partial_1_16 + "tbz %x[width], #1, 15f\n" + "ldr s7, [x28], #0x4\n" + "ldr s6, [x27], #0x4\n" + "ldr s19, [x24], #0x4\n" + "ldr s18, [x23], #0x4\n" + "tbz %x[width], #0, 24f\n" + "ld1 { v7.h }[2], [x28], #0x2\n" + "ld1 { v6.h }[2], [x27], #0x2\n" + "ld1 { v19.h }[2], [x24], #0x2\n" + "ld1 { v18.h }[2], [x23], #0x2\n" + "b 24f\n" + "15:" // tail loop: unique 1: partial_0_16 + "tbz %x[width], #0, 24f\n" + "ldr h7, [x28], #0x2\n" + "ldr h6, [x27], #0x2\n" + "ldr h19, [x24], #0x2\n" + "ldr h18, [x23], #0x2\n" + "b 24f\n" + "16:" // tail loop: unique 1: partial_3_0 + "tbz %x[width], #3, 20f\n" + "ldr q11, [x28, #0x0]\n" + "ldr q10, [x27, #0x0]\n" + "add x28, x28, #0x10\n" + "add x27, x27, #0x10\n" + "ldr q3, [x24, #0x0]\n" + "ldr q22, [x23, #0x0]\n" + "add x24, x24, #0x10\n" + "add x23, x23, #0x10\n" + "tbz %x[width], #2, 18f\n" + "ldr d9, [x28], #0x8\n" + "ldr d8, [x27], #0x8\n" + "ldr d21, [x24], #0x8\n" + "ldr d20, [x23], #0x8\n" + "tbz %x[width], #1, 17f\n" + "ld1 { v9.s }[2], [x28], #0x4\n" + "ld1 { v8.s }[2], [x27], #0x4\n" + "ld1 { v21.s }[2], [x24], #0x4\n" + "ld1 { v20.s }[2], [x23], #0x4\n" + "tbz %x[width], #0, 24f\n" + "ld1 { v9.h }[6], [x28], #0x2\n" + "ld1 { v8.h }[6], [x27], #0x2\n" + "ld1 { v21.h }[6], [x24], #0x2\n" + "ld1 { v20.h }[6], [x23], #0x2\n" + "b 24f\n" + "17:" // tail loop: unique 1: partial_0_12 + "tbz %x[width], #0, 24f\n" + "ld1 { v9.h }[4], [x28], #0x2\n" + "ld1 { v8.h }[4], [x27], #0x2\n" + "ld1 { v21.h }[4], [x24], #0x2\n" + "ld1 { v20.h }[4], [x23], #0x2\n" + "b 24f\n" + "18:" // tail loop: unique 1: partial_1_8 + "tbz %x[width], #1, 19f\n" + "ldr s9, [x28], #0x4\n" + "ldr s8, [x27], #0x4\n" + "ldr s21, [x24], #0x4\n" + "ldr s20, [x23], #0x4\n" + "tbz %x[width], #0, 24f\n" + "ld1 { v9.h }[2], [x28], #0x2\n" + "ld1 { v8.h }[2], [x27], #0x2\n" + "ld1 { v21.h }[2], [x24], #0x2\n" + "ld1 { v20.h }[2], [x23], #0x2\n" + "b 24f\n" + "19:" // tail loop: 
unique 1: partial_0_8 + "tbz %x[width], #0, 24f\n" + "ldr h9, [x28], #0x2\n" + "ldr h8, [x27], #0x2\n" + "ldr h21, [x24], #0x2\n" + "ldr h20, [x23], #0x2\n" + "b 24f\n" + "20:" // tail loop: unique 1: partial_2_0 + "tbz %x[width], #2, 22f\n" + "ldr d11, [x28], #0x8\n" + "ldr d10, [x27], #0x8\n" + "ldr d3, [x24], #0x8\n" + "ldr d22, [x23], #0x8\n" + "tbz %x[width], #1, 21f\n" + "ld1 { v11.s }[2], [x28], #0x4\n" + "ld1 { v10.s }[2], [x27], #0x4\n" + "ld1 { v3.s }[2], [x24], #0x4\n" + "ld1 { v22.s }[2], [x23], #0x4\n" + "tbz %x[width], #0, 24f\n" + "ld1 { v11.h }[6], [x28], #0x2\n" + "ld1 { v10.h }[6], [x27], #0x2\n" + "ld1 { v3.h }[6], [x24], #0x2\n" + "ld1 { v22.h }[6], [x23], #0x2\n" + "b 24f\n" + "21:" // tail loop: unique 1: partial_0_4 + "tbz %x[width], #0, 24f\n" + "ld1 { v11.h }[4], [x28], #0x2\n" + "ld1 { v10.h }[4], [x27], #0x2\n" + "ld1 { v3.h }[4], [x24], #0x2\n" + "ld1 { v22.h }[4], [x23], #0x2\n" + "b 24f\n" + "22:" // tail loop: unique 1: partial_1_0 + "tbz %x[width], #1, 23f\n" + "ldr s11, [x28], #0x4\n" + "ldr s10, [x27], #0x4\n" + "ldr s3, [x24], #0x4\n" + "ldr s22, [x23], #0x4\n" + "tbz %x[width], #0, 24f\n" + "ld1 { v11.h }[2], [x28], #0x2\n" + "ld1 { v10.h }[2], [x27], #0x2\n" + "ld1 { v3.h }[2], [x24], #0x2\n" + "ld1 { v22.h }[2], [x23], #0x2\n" + "b 24f\n" + "23:" // tail loop: unique 1: partial_0_0 + "ldr h11, [x28], #0x2\n" + "ldr h10, [x27], #0x2\n" + "ldr h3, [x24], #0x2\n" + "ldr h22, [x23], #0x2\n" + "24:" // tail loop: unique 1: Done + "fadd v2.8h, v11.8h, v10.8h\n" + "fadd v1.8h, v9.8h, v8.8h\n" + "fadd v0.8h, v7.8h, v6.8h\n" + "fadd v23.8h, v5.8h, v4.8h\n" + "fadd v22.8h, v3.8h, v22.8h\n" + "fadd v21.8h, v21.8h, v20.8h\n" + "fadd v20.8h, v19.8h, v18.8h\n" + "fadd v19.8h, v17.8h, v16.8h\n" + "cbz %x[out_direct], 41f\n" + "tbz %x[width], #4, 32f\n" + "str q2, [x25, #0x0]\n" + "str q1, [x25, #0x10]\n" + "add x25, x25, #0x20\n" + "str q22, [x21, #0x0]\n" + "str q21, [x21, #0x10]\n" + "add x21, x21, #0x20\n" + "tbz %x[width], #3, 28f\n" + "str q0, [x25, #0x0]\n" + "add x25, x25, #0x10\n" + "str q20, [x21, #0x0]\n" + "add x21, x21, #0x10\n" + "tbz %x[width], #2, 26f\n" + "str d23, [x25], #0x8\n" + "str d19, [x21], #0x8\n" + "tbz %x[width], #1, 25f\n" + "st1 { v23.s }[2], [x25], #0x4\n" + "st1 { v19.s }[2], [x21], #0x4\n" + "tbz %x[width], #0, 40f\n" + "st1 { v23.h }[6], [x25], #0x2\n" + "st1 { v19.h }[6], [x21], #0x2\n" + "b 40f\n" + "25:" // tail loop: Main loop: unique 2: partial_0_28 + "tbz %x[width], #0, 40f\n" + "st1 { v23.h }[4], [x25], #0x2\n" + "st1 { v19.h }[4], [x21], #0x2\n" + "b 40f\n" + "26:" // tail loop: Main loop: unique 2: partial_1_24 + "tbz %x[width], #1, 27f\n" + "str s23, [x25], #0x4\n" + "str s19, [x21], #0x4\n" + "tbz %x[width], #0, 40f\n" + "st1 { v23.h }[2], [x25], #0x2\n" + "st1 { v19.h }[2], [x21], #0x2\n" + "b 40f\n" + "27:" // tail loop: Main loop: unique 2: partial_0_24 + "tbz %x[width], #0, 40f\n" + "str h23, [x25], #0x2\n" + "str h19, [x21], #0x2\n" + "b 40f\n" + "28:" // tail loop: Main loop: unique 2: partial_2_16 + "tbz %x[width], #2, 30f\n" + "str d0, [x25], #0x8\n" + "str d20, [x21], #0x8\n" + "tbz %x[width], #1, 29f\n" + "st1 { v0.s }[2], [x25], #0x4\n" + "st1 { v20.s }[2], [x21], #0x4\n" + "tbz %x[width], #0, 40f\n" + "st1 { v0.h }[6], [x25], #0x2\n" + "st1 { v20.h }[6], [x21], #0x2\n" + "b 40f\n" + "29:" // tail loop: Main loop: unique 2: partial_0_20 + "tbz %x[width], #0, 40f\n" + "st1 { v0.h }[4], [x25], #0x2\n" + "st1 { v20.h }[4], [x21], #0x2\n" + "b 40f\n" + "30:" // tail loop: Main loop: unique 2: partial_1_16 + "tbz 
%x[width], #1, 31f\n" + "str s0, [x25], #0x4\n" + "str s20, [x21], #0x4\n" + "tbz %x[width], #0, 40f\n" + "st1 { v0.h }[2], [x25], #0x2\n" + "st1 { v20.h }[2], [x21], #0x2\n" + "b 40f\n" + "31:" // tail loop: Main loop: unique 2: partial_0_16 + "tbz %x[width], #0, 40f\n" + "str h0, [x25], #0x2\n" + "str h20, [x21], #0x2\n" + "b 40f\n" + "32:" // tail loop: Main loop: unique 2: partial_3_0 + "tbz %x[width], #3, 36f\n" + "str q2, [x25, #0x0]\n" + "add x25, x25, #0x10\n" + "str q22, [x21, #0x0]\n" + "add x21, x21, #0x10\n" + "tbz %x[width], #2, 34f\n" + "str d1, [x25], #0x8\n" + "str d21, [x21], #0x8\n" + "tbz %x[width], #1, 33f\n" + "st1 { v1.s }[2], [x25], #0x4\n" + "st1 { v21.s }[2], [x21], #0x4\n" + "tbz %x[width], #0, 40f\n" + "st1 { v1.h }[6], [x25], #0x2\n" + "st1 { v21.h }[6], [x21], #0x2\n" + "b 40f\n" + "33:" // tail loop: Main loop: unique 2: partial_0_12 + "tbz %x[width], #0, 40f\n" + "st1 { v1.h }[4], [x25], #0x2\n" + "st1 { v21.h }[4], [x21], #0x2\n" + "b 40f\n" + "34:" // tail loop: Main loop: unique 2: partial_1_8 + "tbz %x[width], #1, 35f\n" + "str s1, [x25], #0x4\n" + "str s21, [x21], #0x4\n" + "tbz %x[width], #0, 40f\n" + "st1 { v1.h }[2], [x25], #0x2\n" + "st1 { v21.h }[2], [x21], #0x2\n" + "b 40f\n" + "35:" // tail loop: Main loop: unique 2: partial_0_8 + "tbz %x[width], #0, 40f\n" + "str h1, [x25], #0x2\n" + "str h21, [x21], #0x2\n" + "b 40f\n" + "36:" // tail loop: Main loop: unique 2: partial_2_0 + "tbz %x[width], #2, 38f\n" + "str d2, [x25], #0x8\n" + "str d22, [x21], #0x8\n" + "tbz %x[width], #1, 37f\n" + "st1 { v2.s }[2], [x25], #0x4\n" + "st1 { v22.s }[2], [x21], #0x4\n" + "tbz %x[width], #0, 40f\n" + "st1 { v2.h }[6], [x25], #0x2\n" + "st1 { v22.h }[6], [x21], #0x2\n" + "b 40f\n" + "37:" // tail loop: Main loop: unique 2: partial_0_4 + "tbz %x[width], #0, 40f\n" + "st1 { v2.h }[4], [x25], #0x2\n" + "st1 { v22.h }[4], [x21], #0x2\n" + "b 40f\n" + "38:" // tail loop: Main loop: unique 2: partial_1_0 + "tbz %x[width], #1, 39f\n" + "str s2, [x25], #0x4\n" + "str s22, [x21], #0x4\n" + "tbz %x[width], #0, 40f\n" + "st1 { v2.h }[2], [x25], #0x2\n" + "st1 { v22.h }[2], [x21], #0x2\n" + "b 40f\n" + "39:" // tail loop: Main loop: unique 2: partial_0_0 + "str h2, [x25], #0x2\n" + "str h22, [x21], #0x2\n" + "40:" // tail loop: Main loop: unique 2: Done + "41:" // tail loop: Main loop: No direct output + "mov v16.16b, v2.16b\n" + "mov v2.16b, v28.16b\n" + "fmla v2.8h, v16.8h, v24.8h\n" + "mov v16.16b, v1.16b\n" + "mov v1.16b, v29.16b\n" + "fmla v1.8h, v16.8h, v25.8h\n" + "mov v16.16b, v0.16b\n" + "mov v0.16b, v30.16b\n" + "fmla v0.8h, v16.8h, v26.8h\n" + "mov v16.16b, v23.16b\n" + "mov v23.16b, v31.16b\n" + "fmla v23.8h, v16.8h, v27.8h\n" + "mov v16.16b, v22.16b\n" + "mov v22.16b, v28.16b\n" + "fmla v22.8h, v16.8h, v24.8h\n" + "mov v16.16b, v21.16b\n" + "mov v21.16b, v29.16b\n" + "fmla v21.8h, v16.8h, v25.8h\n" + "mov v16.16b, v20.16b\n" + "mov v20.16b, v30.16b\n" + "fmla v20.8h, v16.8h, v26.8h\n" + "mov v16.16b, v19.16b\n" + "mov v19.16b, v31.16b\n" + "fmla v19.8h, v16.8h, v27.8h\n" + "fmin v2.8h, v2.8h, v12.8h\n" + "fmin v1.8h, v1.8h, v12.8h\n" + "fmin v0.8h, v0.8h, v12.8h\n" + "fmin v23.8h, v23.8h, v12.8h\n" + "fmin v22.8h, v22.8h, v12.8h\n" + "fmin v21.8h, v21.8h, v12.8h\n" + "fmin v20.8h, v20.8h, v12.8h\n" + "fmin v19.8h, v19.8h, v12.8h\n" + "fmax v2.8h, v2.8h, v13.8h\n" + "fmax v1.8h, v1.8h, v13.8h\n" + "fmax v0.8h, v0.8h, v13.8h\n" + "fmax v23.8h, v23.8h, v13.8h\n" + "fmax v22.8h, v22.8h, v13.8h\n" + "fmax v21.8h, v21.8h, v13.8h\n" + "fmax v20.8h, v20.8h, v13.8h\n" + 
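// v12 holds maxval and v13 holds minval, so each fmin/fmax pair clamps the fused add + bn_mul/bn_add result to [minval, maxval] before it is stored: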
"fmax v19.8h, v19.8h, v13.8h\n" + "tbz %x[width], #4, 49f\n" + "str q2, [x26, #0x0]\n" + "str q1, [x26, #0x10]\n" + "add x26, x26, #0x20\n" + "str q22, [x22, #0x0]\n" + "str q21, [x22, #0x10]\n" + "add x22, x22, #0x20\n" + "tbz %x[width], #3, 45f\n" + "str q0, [x26, #0x0]\n" + "add x26, x26, #0x10\n" + "str q20, [x22, #0x0]\n" + "add x22, x22, #0x10\n" + "tbz %x[width], #2, 43f\n" + "str d23, [x26], #0x8\n" + "str d19, [x22], #0x8\n" + "tbz %x[width], #1, 42f\n" + "st1 { v23.s }[2], [x26], #0x4\n" + "st1 { v19.s }[2], [x22], #0x4\n" + "tbz %x[width], #0, 57f\n" + "st1 { v23.h }[6], [x26], #0x2\n" + "st1 { v19.h }[6], [x22], #0x2\n" + "b 57f\n" + "42:" // tail loop: unique 3: partial_0_28 + "tbz %x[width], #0, 57f\n" + "st1 { v23.h }[4], [x26], #0x2\n" + "st1 { v19.h }[4], [x22], #0x2\n" + "b 57f\n" + "43:" // tail loop: unique 3: partial_1_24 + "tbz %x[width], #1, 44f\n" + "str s23, [x26], #0x4\n" + "str s19, [x22], #0x4\n" + "tbz %x[width], #0, 57f\n" + "st1 { v23.h }[2], [x26], #0x2\n" + "st1 { v19.h }[2], [x22], #0x2\n" + "b 57f\n" + "44:" // tail loop: unique 3: partial_0_24 + "tbz %x[width], #0, 57f\n" + "str h23, [x26], #0x2\n" + "str h19, [x22], #0x2\n" + "b 57f\n" + "45:" // tail loop: unique 3: partial_2_16 + "tbz %x[width], #2, 47f\n" + "str d0, [x26], #0x8\n" + "str d20, [x22], #0x8\n" + "tbz %x[width], #1, 46f\n" + "st1 { v0.s }[2], [x26], #0x4\n" + "st1 { v20.s }[2], [x22], #0x4\n" + "tbz %x[width], #0, 57f\n" + "st1 { v0.h }[6], [x26], #0x2\n" + "st1 { v20.h }[6], [x22], #0x2\n" + "b 57f\n" + "46:" // tail loop: unique 3: partial_0_20 + "tbz %x[width], #0, 57f\n" + "st1 { v0.h }[4], [x26], #0x2\n" + "st1 { v20.h }[4], [x22], #0x2\n" + "b 57f\n" + "47:" // tail loop: unique 3: partial_1_16 + "tbz %x[width], #1, 48f\n" + "str s0, [x26], #0x4\n" + "str s20, [x22], #0x4\n" + "tbz %x[width], #0, 57f\n" + "st1 { v0.h }[2], [x26], #0x2\n" + "st1 { v20.h }[2], [x22], #0x2\n" + "b 57f\n" + "48:" // tail loop: unique 3: partial_0_16 + "tbz %x[width], #0, 57f\n" + "str h0, [x26], #0x2\n" + "str h20, [x22], #0x2\n" + "b 57f\n" + "49:" // tail loop: unique 3: partial_3_0 + "tbz %x[width], #3, 53f\n" + "str q2, [x26, #0x0]\n" + "add x26, x26, #0x10\n" + "str q22, [x22, #0x0]\n" + "add x22, x22, #0x10\n" + "tbz %x[width], #2, 51f\n" + "str d1, [x26], #0x8\n" + "str d21, [x22], #0x8\n" + "tbz %x[width], #1, 50f\n" + "st1 { v1.s }[2], [x26], #0x4\n" + "st1 { v21.s }[2], [x22], #0x4\n" + "tbz %x[width], #0, 57f\n" + "st1 { v1.h }[6], [x26], #0x2\n" + "st1 { v21.h }[6], [x22], #0x2\n" + "b 57f\n" + "50:" // tail loop: unique 3: partial_0_12 + "tbz %x[width], #0, 57f\n" + "st1 { v1.h }[4], [x26], #0x2\n" + "st1 { v21.h }[4], [x22], #0x2\n" + "b 57f\n" + "51:" // tail loop: unique 3: partial_1_8 + "tbz %x[width], #1, 52f\n" + "str s1, [x26], #0x4\n" + "str s21, [x22], #0x4\n" + "tbz %x[width], #0, 57f\n" + "st1 { v1.h }[2], [x26], #0x2\n" + "st1 { v21.h }[2], [x22], #0x2\n" + "b 57f\n" + "52:" // tail loop: unique 3: partial_0_8 + "tbz %x[width], #0, 57f\n" + "str h1, [x26], #0x2\n" + "str h21, [x22], #0x2\n" + "b 57f\n" + "53:" // tail loop: unique 3: partial_2_0 + "tbz %x[width], #2, 55f\n" + "str d2, [x26], #0x8\n" + "str d22, [x22], #0x8\n" + "tbz %x[width], #1, 54f\n" + "st1 { v2.s }[2], [x26], #0x4\n" + "st1 { v22.s }[2], [x22], #0x4\n" + "tbz %x[width], #0, 57f\n" + "st1 { v2.h }[6], [x26], #0x2\n" + "st1 { v22.h }[6], [x22], #0x2\n" + "b 57f\n" + "54:" // tail loop: unique 3: partial_0_4 + "tbz %x[width], #0, 57f\n" + "st1 { v2.h }[4], [x26], #0x2\n" + "st1 { v22.h }[4], [x22], #0x2\n" + 
"b 57f\n" + "55:" // tail loop: unique 3: partial_1_0 + "tbz %x[width], #1, 56f\n" + "str s2, [x26], #0x4\n" + "str s22, [x22], #0x4\n" + "tbz %x[width], #0, 57f\n" + "st1 { v2.h }[2], [x26], #0x2\n" + "st1 { v22.h }[2], [x22], #0x2\n" + "b 57f\n" + "56:" // tail loop: unique 3: partial_0_0 + "str h2, [x26], #0x2\n" + "str h22, [x22], #0x2\n" + "57:" // tail loop: unique 3: Done + "subs x20, x20, #0x2\n" + "bgt 8b\n" + "58:" // odd columns skip + : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out), + [out_direct] "+&r"(out_direct), [width] "+&r"(width) + : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride), + [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)), + [out_direct_stride] "r"(out_direct_stride), [out_stride] "r"(out_stride) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", + "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"); +} + +} // namespace + +namespace arm_compute +{ +namespace cpu +{ +void add_mul_add_fp16_neon(const ITensor *input1, + const ITensor *input2, + const ITensor *bn_mul, + const ITensor *bn_add, + ITensor *add_output, + ITensor *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info, + const Window &window) +{ + ARM_COMPUTE_UNUSED(policy); + + const size_t out_stride = final_output->info()->strides_in_bytes()[1]; + const size_t out_direct_stride = (add_output != nullptr) ? add_output->info()->strides_in_bytes()[1] : 0; + const size_t in0_stride = input1->info()->strides_in_bytes()[1]; + const size_t in1_stride = input2->info()->strides_in_bytes()[1]; + + float16_t minval = std::numeric_limits<half>::lowest(); + float16_t maxval = std::numeric_limits<half>::max(); + + if (act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU) + { + minval = static_cast<float16_t>(0.f); + } + else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) + { + minval = static_cast<float16_t>(0.f); + maxval = static_cast<float16_t>(act_info.a()); + } + else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + { + minval = static_cast<float16_t>(act_info.b()); + maxval = static_cast<float16_t>(act_info.a()); + } + + // Clear X & Y dimensions on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + win.set(Window::DimY, Window::Dimension(0, 1, 1)); + + Iterator in1_it(input1, window); + Iterator in2_it(input2, window); + Iterator out_it(final_output, window); + + const size_t width = window.num_iterations(0); + const size_t height = window.num_iterations(1); + + if (add_output != nullptr) + { + Iterator add_out_it(add_output, window); + execute_window_loop( + win, + [&](const Coordinates &) + { + a64_add_bn_clamp_direct_fp16_2x32(reinterpret_cast<float16_t *>(out_it.ptr()), out_stride, + reinterpret_cast<float16_t *>(add_out_it.ptr()), out_direct_stride, + reinterpret_cast<float16_t *>(in1_it.ptr()), in0_stride, + reinterpret_cast<float16_t *>(in2_it.ptr()), in1_stride, + reinterpret_cast<float16_t *>(bn_mul->buffer()), + reinterpret_cast<float16_t *>(bn_add->buffer()), minval, maxval, + width, height); + }, + in1_it, in2_it, add_out_it, out_it); + } + else + { 
+ execute_window_loop( + win, + [&](const Coordinates &) + { + a64_add_bn_clamp_direct_fp16_2x32(reinterpret_cast<float16_t *>(out_it.ptr()), out_stride, nullptr, + out_direct_stride, reinterpret_cast<float16_t *>(in1_it.ptr()), + in0_stride, reinterpret_cast<float16_t *>(in2_it.ptr()), in1_stride, + reinterpret_cast<float16_t *>(bn_mul->buffer()), + reinterpret_cast<float16_t *>(bn_add->buffer()), minval, maxval, + width, height); + }, + in1_it, in2_it, out_it); + } +} +} // namespace cpu +} // namespace arm_compute + +#endif // defined(__aarch64__) && defined(ENABLE_FP16_KERNELS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) diff --git a/src/cpu/kernels/addmuladd/generic/neon/fp32.cpp b/src/cpu/kernels/addmuladd/generic/neon/fp32.cpp new file mode 100644 index 0000000000..f0444b6acd --- /dev/null +++ b/src/cpu/kernels/addmuladd/generic/neon/fp32.cpp @@ -0,0 +1,733 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Window.h" +#include "arm_compute/function_info/ActivationLayerInfo.h" + +#include <cstddef> +#include <cstdint> +#include <limits> + +#ifdef __aarch64__ +namespace +{ +void a64_add_bn_clamp_direct_fp32_2x16(float *out, + size_t out_stride, + float *out_direct, + size_t out_direct_stride, + const float *in0, + size_t in0_stride, + const float *in1, + size_t in1_stride, + const float *bn_mul, + const float *bn_add, + const float minval, + const float maxval, + size_t width, + size_t height) +{ + struct KernelArgs + { + float minval; + float maxval; + } ka; + ka.minval = minval; + ka.maxval = maxval; + + __asm__ __volatile__( + "ldr w21, [%x[args_ptr], %[offsetof_minval]]\n" + "ldr w20, [%x[args_ptr], %[offsetof_maxval]]\n" + "cmp %x[width], #0x10\n" + "dup v13.4s, w21\n" + "dup v12.4s, w20\n" + "blt 7f\n" + "1:" // Column loop + "ldr q24, [%x[bn_mul], #0x0]\n" + "ldr q25, [%x[bn_mul], #0x10]\n" + "mov x12, %x[in0]\n" + "mov x11, %x[in1]\n" + "ldr q26, [%x[bn_mul], #0x20]\n" + "ldr q27, [%x[bn_mul], #0x30]\n" + "mov x10, %x[out]\n" + "mov x9, %x[out_direct]\n" + "ldr q28, [%x[bn_add], #0x0]\n" + "ldr q29, [%x[bn_add], #0x10]\n" + "mov x20, %x[height]\n" + "mov x28, x12\n" + "ldr q30, [%x[bn_add], #0x20]\n" + "ldr q31, [%x[bn_add], #0x30]\n" + "mov x27, x11\n" + "mov x26, x10\n" + "ldr q11, [x28, #0x0]\n" + "ldr q10, [x27, #0x0]\n" + "mov x25, x9\n" + "add x24, x28, %x[in0_stride]\n" + "ldr q9, [x28, #0x10]\n" + "ldr q8, [x27, #0x10]\n" + "add x23, x27, %x[in1_stride]\n" + "add x22, x26, %x[out_stride]\n" + "ldr q7, [x28, #0x20]\n" + "ldr q6, [x27, #0x20]\n" + "add x21, x25, %x[out_direct_stride]\n" + "cmp x20, #0x2\n" + "ldr q5, [x28, #0x30]\n" + "ldr q4, [x27, #0x30]\n" + "add x12, x24, %x[in0_stride]\n" + "add x11, x23, %x[in1_stride]\n" + "add x10, x22, %x[out_stride]\n" + "add x9, x21, %x[out_direct_stride]\n" + "csel x24, x24, x28, GE\n" + "csel x23, x23, x27, GE\n" + "csel x22, x22, x26, GE\n" + "csel x21, x21, x25, GE\n" + "subs x20, x20, #0x2\n" + "add %x[bn_mul], %x[bn_mul], #0x40\n" + "add %x[bn_add], %x[bn_add], #0x40\n" + "add x28, x28, #0x40\n" + "add x27, x27, #0x40\n" + "ble 4f\n" + "2:" // Row loop + "ldr q3, [x24, #0x0]\n" + "ldr q22, [x23, #0x0]\n" + "fadd v2.4s, v11.4s, v10.4s\n" + "fadd v1.4s, v9.4s, v8.4s\n" + "ldr q21, [x24, #0x10]\n" + "ldr q20, [x23, #0x10]\n" + "fadd v0.4s, v7.4s, v6.4s\n" + "fadd v23.4s, v5.4s, v4.4s\n" + "ldr q19, [x24, #0x20]\n" + "ldr q18, [x23, #0x20]\n" + "fadd v22.4s, v3.4s, v22.4s\n" + "fadd v21.4s, v21.4s, v20.4s\n" + "ldr q17, [x24, #0x30]\n" + "ldr q16, [x23, #0x30]\n" + "fadd v20.4s, v19.4s, v18.4s\n" + "fadd v19.4s, v17.4s, v16.4s\n" + "add x24, x24, #0x40\n" + "add x23, x23, #0x40\n" + "cbz %x[out_direct], 3f\n" + "str q2, [x25, #0x0]\n" + "str q1, [x25, #0x10]\n" + "str q0, [x25, #0x20]\n" + "str q23, [x25, #0x30]\n" + "add x25, x25, #0x40\n" + "str q22, [x21, #0x0]\n" + "str q21, [x21, #0x10]\n" + "str q20, [x21, #0x20]\n" + "str q19, [x21, #0x30]\n" + "add x21, x21, #0x40\n" + "3:" // Main loop: No direct output + "mov v16.16b, v2.16b\n" + "mov v2.16b, v28.16b\n" + "fmla v2.4s, v16.4s, v24.4s\n" + "mov x28, x12\n" + "ldr q11, [x28, #0x0]\n" + "ldr q9, [x28, #0x10]\n" + "mov v18.16b, v1.16b\n" + "mov v1.16b, v29.16b\n" + "ldr q7, [x28, #0x20]\n" + "ldr q5, [x28, #0x30]\n" + "mov v17.16b, v0.16b\n" + "mov v0.16b, v30.16b\n" + "mov v16.16b, v23.16b\n" + "mov v23.16b, v31.16b\n" + "fmla 
v1.4s, v18.4s, v25.4s\n" + "mov x27, x11\n" + "ldr q10, [x27, #0x0]\n" + "ldr q8, [x27, #0x10]\n" + "fmla v0.4s, v17.4s, v26.4s\n" + "fmla v23.4s, v16.4s, v27.4s\n" + "ldr q6, [x27, #0x20]\n" + "ldr q4, [x27, #0x30]\n" + "mov v17.16b, v22.16b\n" + "mov v22.16b, v28.16b\n" + "mov v16.16b, v21.16b\n" + "mov v21.16b, v29.16b\n" + "fmla v22.4s, v17.4s, v24.4s\n" + "mov x25, x9\n" + "mov v17.16b, v20.16b\n" + "mov v20.16b, v30.16b\n" + "fmla v21.4s, v16.4s, v25.4s\n" + "add x24, x28, %x[in0_stride]\n" + "mov v16.16b, v19.16b\n" + "mov v19.16b, v31.16b\n" + "fmla v20.4s, v17.4s, v26.4s\n" + "add x23, x27, %x[in1_stride]\n" + "fmla v19.4s, v16.4s, v27.4s\n" + "fmin v2.4s, v2.4s, v12.4s\n" + "add x21, x25, %x[out_direct_stride]\n" + "cmp x20, #0x2\n" + "fmin v1.4s, v1.4s, v12.4s\n" + "fmin v0.4s, v0.4s, v12.4s\n" + "add x12, x24, %x[in0_stride]\n" + "add x11, x23, %x[in1_stride]\n" + "fmin v23.4s, v23.4s, v12.4s\n" + "fmax v2.4s, v2.4s, v13.4s\n" + "str q2, [x26, #0x0]\n" + "add x9, x21, %x[out_direct_stride]\n" + "fmax v1.4s, v1.4s, v13.4s\n" + "fmax v0.4s, v0.4s, v13.4s\n" + "str q1, [x26, #0x10]\n" + "csel x24, x24, x28, GE\n" + "fmax v23.4s, v23.4s, v13.4s\n" + "fmin v22.4s, v22.4s, v12.4s\n" + "str q0, [x26, #0x20]\n" + "csel x23, x23, x27, GE\n" + "fmin v21.4s, v21.4s, v12.4s\n" + "fmin v20.4s, v20.4s, v12.4s\n" + "str q23, [x26, #0x30]\n" + "mov x26, x10\n" + "fmin v19.4s, v19.4s, v12.4s\n" + "fmax v22.4s, v22.4s, v13.4s\n" + "str q22, [x22, #0x0]\n" + "csel x21, x21, x25, GE\n" + "fmax v21.4s, v21.4s, v13.4s\n" + "fmax v20.4s, v20.4s, v13.4s\n" + "str q21, [x22, #0x10]\n" + "add x28, x28, #0x40\n" + "fmax v19.4s, v19.4s, v13.4s\n" + "str q20, [x22, #0x20]\n" + "add x27, x27, #0x40\n" + "str q19, [x22, #0x30]\n" + "add x22, x26, %x[out_stride]\n" + "add x10, x22, %x[out_stride]\n" + "csel x22, x22, x26, GE\n" + "subs x20, x20, #0x2\n" + "bgt 2b\n" + "4:" // Row loop skip + "ldr q3, [x24, #0x0]\n" + "ldr q22, [x23, #0x0]\n" + "fadd v2.4s, v11.4s, v10.4s\n" + "fadd v1.4s, v9.4s, v8.4s\n" + "ldr q21, [x24, #0x10]\n" + "ldr q20, [x23, #0x10]\n" + "fadd v0.4s, v7.4s, v6.4s\n" + "fadd v23.4s, v5.4s, v4.4s\n" + "ldr q19, [x24, #0x20]\n" + "ldr q18, [x23, #0x20]\n" + "fadd v22.4s, v3.4s, v22.4s\n" + "fadd v21.4s, v21.4s, v20.4s\n" + "ldr q17, [x24, #0x30]\n" + "ldr q16, [x23, #0x30]\n" + "fadd v20.4s, v19.4s, v18.4s\n" + "fadd v19.4s, v17.4s, v16.4s\n" + "add x24, x24, #0x40\n" + "add x23, x23, #0x40\n" + "cbz %x[out_direct], 5f\n" + "str q2, [x25, #0x0]\n" + "str q1, [x25, #0x10]\n" + "str q0, [x25, #0x20]\n" + "str q23, [x25, #0x30]\n" + "add x25, x25, #0x40\n" + "str q22, [x21, #0x0]\n" + "str q21, [x21, #0x10]\n" + "str q20, [x21, #0x20]\n" + "str q19, [x21, #0x30]\n" + "add x21, x21, #0x40\n" + "5:" // Tail loop: No direct output + "mov v16.16b, v2.16b\n" + "mov v2.16b, v28.16b\n" + "fmla v2.4s, v16.4s, v24.4s\n" + "add %x[in0], %x[in0], #0x40\n" + "mov v16.16b, v1.16b\n" + "mov v1.16b, v29.16b\n" + "fmla v1.4s, v16.4s, v25.4s\n" + "add %x[in1], %x[in1], #0x40\n" + "mov v16.16b, v0.16b\n" + "mov v0.16b, v30.16b\n" + "fmla v0.4s, v16.4s, v26.4s\n" + "add %x[out], %x[out], #0x40\n" + "mov v16.16b, v23.16b\n" + "mov v23.16b, v31.16b\n" + "fmla v23.4s, v16.4s, v27.4s\n" + "mov v16.16b, v22.16b\n" + "mov v22.16b, v28.16b\n" + "fmla v22.4s, v16.4s, v24.4s\n" + "mov v16.16b, v21.16b\n" + "mov v21.16b, v29.16b\n" + "fmla v21.4s, v16.4s, v25.4s\n" + "mov v16.16b, v20.16b\n" + "mov v20.16b, v30.16b\n" + "fmla v20.4s, v16.4s, v26.4s\n" + "mov v16.16b, v19.16b\n" + "mov v19.16b, v31.16b\n" + "fmla v19.4s, 
v16.4s, v27.4s\n" + "fmin v2.4s, v2.4s, v12.4s\n" + "fmin v1.4s, v1.4s, v12.4s\n" + "fmin v0.4s, v0.4s, v12.4s\n" + "fmin v23.4s, v23.4s, v12.4s\n" + "fmin v22.4s, v22.4s, v12.4s\n" + "fmin v21.4s, v21.4s, v12.4s\n" + "fmin v20.4s, v20.4s, v12.4s\n" + "fmin v19.4s, v19.4s, v12.4s\n" + "fmax v2.4s, v2.4s, v13.4s\n" + "fmax v1.4s, v1.4s, v13.4s\n" + "str q2, [x26, #0x0]\n" + "fmax v0.4s, v0.4s, v13.4s\n" + "fmax v23.4s, v23.4s, v13.4s\n" + "str q1, [x26, #0x10]\n" + "fmax v22.4s, v22.4s, v13.4s\n" + "fmax v21.4s, v21.4s, v13.4s\n" + "str q0, [x26, #0x20]\n" + "fmax v20.4s, v20.4s, v13.4s\n" + "fmax v19.4s, v19.4s, v13.4s\n" + "str q23, [x26, #0x30]\n" + "add x26, x26, #0x40\n" + "str q22, [x22, #0x0]\n" + "str q21, [x22, #0x10]\n" + "str q20, [x22, #0x20]\n" + "str q19, [x22, #0x30]\n" + "add x22, x22, #0x40\n" + "cbz %x[out_direct], 6f\n" + "add %x[out_direct], %x[out_direct], #0x40\n" + "6:" // No direct pointer update + "sub %x[width], %x[width], #0x10\n" + "cmp %x[width], #0x10\n" + "bge 1b\n" + "cbz %x[width], 34f\n" + "7:" // main loop skip + "ldr q24, [%x[bn_mul], #0x0]\n" + "ldr q25, [%x[bn_mul], #0x10]\n" + "mov x20, %x[height]\n" + "mov x12, %x[in0]\n" + "ldr q26, [%x[bn_mul], #0x20]\n" + "ldr q27, [%x[bn_mul], #0x30]\n" + "mov x11, %x[in1]\n" + "mov x10, %x[out]\n" + "ldr q28, [%x[bn_add], #0x0]\n" + "ldr q29, [%x[bn_add], #0x10]\n" + "mov x9, %x[out_direct]\n" + "add %x[bn_mul], %x[bn_mul], #0x40\n" + "ldr q30, [%x[bn_add], #0x20]\n" + "ldr q31, [%x[bn_add], #0x30]\n" + "add %x[bn_add], %x[bn_add], #0x40\n" + "8:" // tail loop: Row loop + "mov x28, x12\n" + "mov x27, x11\n" + "mov x26, x10\n" + "mov x25, x9\n" + "add x24, x28, %x[in0_stride]\n" + "add x23, x27, %x[in1_stride]\n" + "add x22, x26, %x[out_stride]\n" + "add x21, x25, %x[out_direct_stride]\n" + "cmp x20, #0x2\n" + "add x12, x24, %x[in0_stride]\n" + "add x11, x23, %x[in1_stride]\n" + "add x10, x22, %x[out_stride]\n" + "add x9, x21, %x[out_direct_stride]\n" + "csel x24, x24, x28, GE\n" + "csel x23, x23, x27, GE\n" + "csel x22, x22, x26, GE\n" + "csel x21, x21, x25, GE\n" + "tbz %x[width], #3, 12f\n" + "ldr q11, [x28, #0x0]\n" + "ldr q10, [x27, #0x0]\n" + "ldr q9, [x28, #0x10]\n" + "ldr q8, [x27, #0x10]\n" + "add x28, x28, #0x20\n" + "add x27, x27, #0x20\n" + "ldr q3, [x24, #0x0]\n" + "ldr q22, [x23, #0x0]\n" + "ldr q21, [x24, #0x10]\n" + "ldr q20, [x23, #0x10]\n" + "add x24, x24, #0x20\n" + "add x23, x23, #0x20\n" + "tbz %x[width], #2, 10f\n" + "ldr q7, [x28, #0x0]\n" + "ldr q6, [x27, #0x0]\n" + "add x28, x28, #0x10\n" + "add x27, x27, #0x10\n" + "ldr q19, [x24, #0x0]\n" + "ldr q18, [x23, #0x0]\n" + "add x24, x24, #0x10\n" + "add x23, x23, #0x10\n" + "tbz %x[width], #1, 9f\n" + "ldr d5, [x28], #0x8\n" + "ldr d4, [x27], #0x8\n" + "ldr d17, [x24], #0x8\n" + "ldr d16, [x23], #0x8\n" + "tbz %x[width], #0, 16f\n" + "ld1 { v5.s }[2], [x28], #0x4\n" + "ld1 { v4.s }[2], [x27], #0x4\n" + "ld1 { v17.s }[2], [x24], #0x4\n" + "ld1 { v16.s }[2], [x23], #0x4\n" + "b 16f\n" + "9:" // tail loop: unique 1: partial_0_12 + "tbz %x[width], #0, 16f\n" + "ldr s5, [x28], #0x4\n" + "ldr s4, [x27], #0x4\n" + "ldr s17, [x24], #0x4\n" + "ldr s16, [x23], #0x4\n" + "b 16f\n" + "10:" // tail loop: unique 1: partial_1_8 + "tbz %x[width], #1, 11f\n" + "ldr d7, [x28], #0x8\n" + "ldr d6, [x27], #0x8\n" + "ldr d19, [x24], #0x8\n" + "ldr d18, [x23], #0x8\n" + "tbz %x[width], #0, 16f\n" + "ld1 { v7.s }[2], [x28], #0x4\n" + "ld1 { v6.s }[2], [x27], #0x4\n" + "ld1 { v19.s }[2], [x24], #0x4\n" + "ld1 { v18.s }[2], [x23], #0x4\n" + "b 16f\n" + "11:" // tail 
loop: unique 1: partial_0_8 + "tbz %x[width], #0, 16f\n" + "ldr s7, [x28], #0x4\n" + "ldr s6, [x27], #0x4\n" + "ldr s19, [x24], #0x4\n" + "ldr s18, [x23], #0x4\n" + "b 16f\n" + "12:" // tail loop: unique 1: partial_2_0 + "tbz %x[width], #2, 14f\n" + "ldr q11, [x28, #0x0]\n" + "ldr q10, [x27, #0x0]\n" + "add x28, x28, #0x10\n" + "add x27, x27, #0x10\n" + "ldr q3, [x24, #0x0]\n" + "ldr q22, [x23, #0x0]\n" + "add x24, x24, #0x10\n" + "add x23, x23, #0x10\n" + "tbz %x[width], #1, 13f\n" + "ldr d9, [x28], #0x8\n" + "ldr d8, [x27], #0x8\n" + "ldr d21, [x24], #0x8\n" + "ldr d20, [x23], #0x8\n" + "tbz %x[width], #0, 16f\n" + "ld1 { v9.s }[2], [x28], #0x4\n" + "ld1 { v8.s }[2], [x27], #0x4\n" + "ld1 { v21.s }[2], [x24], #0x4\n" + "ld1 { v20.s }[2], [x23], #0x4\n" + "b 16f\n" + "13:" // tail loop: unique 1: partial_0_4 + "tbz %x[width], #0, 16f\n" + "ldr s9, [x28], #0x4\n" + "ldr s8, [x27], #0x4\n" + "ldr s21, [x24], #0x4\n" + "ldr s20, [x23], #0x4\n" + "b 16f\n" + "14:" // tail loop: unique 1: partial_1_0 + "tbz %x[width], #1, 15f\n" + "ldr d11, [x28], #0x8\n" + "ldr d10, [x27], #0x8\n" + "ldr d3, [x24], #0x8\n" + "ldr d22, [x23], #0x8\n" + "tbz %x[width], #0, 16f\n" + "ld1 { v11.s }[2], [x28], #0x4\n" + "ld1 { v10.s }[2], [x27], #0x4\n" + "ld1 { v3.s }[2], [x24], #0x4\n" + "ld1 { v22.s }[2], [x23], #0x4\n" + "b 16f\n" + "15:" // tail loop: unique 1: partial_0_0 + "ldr s11, [x28], #0x4\n" + "ldr s10, [x27], #0x4\n" + "ldr s3, [x24], #0x4\n" + "ldr s22, [x23], #0x4\n" + "16:" // tail loop: unique 1: Done + "fadd v2.4s, v11.4s, v10.4s\n" + "fadd v1.4s, v9.4s, v8.4s\n" + "fadd v0.4s, v7.4s, v6.4s\n" + "fadd v23.4s, v5.4s, v4.4s\n" + "fadd v22.4s, v3.4s, v22.4s\n" + "fadd v21.4s, v21.4s, v20.4s\n" + "fadd v20.4s, v19.4s, v18.4s\n" + "fadd v19.4s, v17.4s, v16.4s\n" + "cbz %x[out_direct], 25f\n" + "tbz %x[width], #3, 20f\n" + "str q2, [x25, #0x0]\n" + "str q1, [x25, #0x10]\n" + "add x25, x25, #0x20\n" + "str q22, [x21, #0x0]\n" + "str q21, [x21, #0x10]\n" + "add x21, x21, #0x20\n" + "tbz %x[width], #2, 18f\n" + "str q0, [x25, #0x0]\n" + "add x25, x25, #0x10\n" + "str q20, [x21, #0x0]\n" + "add x21, x21, #0x10\n" + "tbz %x[width], #1, 17f\n" + "str d23, [x25], #0x8\n" + "str d19, [x21], #0x8\n" + "tbz %x[width], #0, 24f\n" + "st1 { v23.s }[2], [x25], #0x4\n" + "st1 { v19.s }[2], [x21], #0x4\n" + "b 24f\n" + "17:" // tail loop: Main loop: unique 2: partial_0_12 + "tbz %x[width], #0, 24f\n" + "str s23, [x25], #0x4\n" + "str s19, [x21], #0x4\n" + "b 24f\n" + "18:" // tail loop: Main loop: unique 2: partial_1_8 + "tbz %x[width], #1, 19f\n" + "str d0, [x25], #0x8\n" + "str d20, [x21], #0x8\n" + "tbz %x[width], #0, 24f\n" + "st1 { v0.s }[2], [x25], #0x4\n" + "st1 { v20.s }[2], [x21], #0x4\n" + "b 24f\n" + "19:" // tail loop: Main loop: unique 2: partial_0_8 + "tbz %x[width], #0, 24f\n" + "str s0, [x25], #0x4\n" + "str s20, [x21], #0x4\n" + "b 24f\n" + "20:" // tail loop: Main loop: unique 2: partial_2_0 + "tbz %x[width], #2, 22f\n" + "str q2, [x25, #0x0]\n" + "add x25, x25, #0x10\n" + "str q22, [x21, #0x0]\n" + "add x21, x21, #0x10\n" + "tbz %x[width], #1, 21f\n" + "str d1, [x25], #0x8\n" + "str d21, [x21], #0x8\n" + "tbz %x[width], #0, 24f\n" + "st1 { v1.s }[2], [x25], #0x4\n" + "st1 { v21.s }[2], [x21], #0x4\n" + "b 24f\n" + "21:" // tail loop: Main loop: unique 2: partial_0_4 + "tbz %x[width], #0, 24f\n" + "str s1, [x25], #0x4\n" + "str s21, [x21], #0x4\n" + "b 24f\n" + "22:" // tail loop: Main loop: unique 2: partial_1_0 + "tbz %x[width], #1, 23f\n" + "str d2, [x25], #0x8\n" + "str d22, [x21], #0x8\n" + 
"tbz %x[width], #0, 24f\n" + "st1 { v2.s }[2], [x25], #0x4\n" + "st1 { v22.s }[2], [x21], #0x4\n" + "b 24f\n" + "23:" // tail loop: Main loop: unique 2: partial_0_0 + "str s2, [x25], #0x4\n" + "str s22, [x21], #0x4\n" + "24:" // tail loop: Main loop: unique 2: Done + "25:" // tail loop: Main loop: No direct output + "mov v16.16b, v2.16b\n" + "mov v2.16b, v28.16b\n" + "fmla v2.4s, v16.4s, v24.4s\n" + "mov v16.16b, v1.16b\n" + "mov v1.16b, v29.16b\n" + "fmla v1.4s, v16.4s, v25.4s\n" + "mov v16.16b, v0.16b\n" + "mov v0.16b, v30.16b\n" + "fmla v0.4s, v16.4s, v26.4s\n" + "mov v16.16b, v23.16b\n" + "mov v23.16b, v31.16b\n" + "fmla v23.4s, v16.4s, v27.4s\n" + "mov v16.16b, v22.16b\n" + "mov v22.16b, v28.16b\n" + "fmla v22.4s, v16.4s, v24.4s\n" + "mov v16.16b, v21.16b\n" + "mov v21.16b, v29.16b\n" + "fmla v21.4s, v16.4s, v25.4s\n" + "mov v16.16b, v20.16b\n" + "mov v20.16b, v30.16b\n" + "fmla v20.4s, v16.4s, v26.4s\n" + "mov v16.16b, v19.16b\n" + "mov v19.16b, v31.16b\n" + "fmla v19.4s, v16.4s, v27.4s\n" + "fmin v2.4s, v2.4s, v12.4s\n" + "fmin v1.4s, v1.4s, v12.4s\n" + "fmin v0.4s, v0.4s, v12.4s\n" + "fmin v23.4s, v23.4s, v12.4s\n" + "fmin v22.4s, v22.4s, v12.4s\n" + "fmin v21.4s, v21.4s, v12.4s\n" + "fmin v20.4s, v20.4s, v12.4s\n" + "fmin v19.4s, v19.4s, v12.4s\n" + "fmax v2.4s, v2.4s, v13.4s\n" + "fmax v1.4s, v1.4s, v13.4s\n" + "fmax v0.4s, v0.4s, v13.4s\n" + "fmax v23.4s, v23.4s, v13.4s\n" + "fmax v22.4s, v22.4s, v13.4s\n" + "fmax v21.4s, v21.4s, v13.4s\n" + "fmax v20.4s, v20.4s, v13.4s\n" + "fmax v19.4s, v19.4s, v13.4s\n" + "tbz %x[width], #3, 29f\n" + "str q2, [x26, #0x0]\n" + "str q1, [x26, #0x10]\n" + "add x26, x26, #0x20\n" + "str q22, [x22, #0x0]\n" + "str q21, [x22, #0x10]\n" + "add x22, x22, #0x20\n" + "tbz %x[width], #2, 27f\n" + "str q0, [x26, #0x0]\n" + "add x26, x26, #0x10\n" + "str q20, [x22, #0x0]\n" + "add x22, x22, #0x10\n" + "tbz %x[width], #1, 26f\n" + "str d23, [x26], #0x8\n" + "str d19, [x22], #0x8\n" + "tbz %x[width], #0, 33f\n" + "st1 { v23.s }[2], [x26], #0x4\n" + "st1 { v19.s }[2], [x22], #0x4\n" + "b 33f\n" + "26:" // tail loop: unique 3: partial_0_12 + "tbz %x[width], #0, 33f\n" + "str s23, [x26], #0x4\n" + "str s19, [x22], #0x4\n" + "b 33f\n" + "27:" // tail loop: unique 3: partial_1_8 + "tbz %x[width], #1, 28f\n" + "str d0, [x26], #0x8\n" + "str d20, [x22], #0x8\n" + "tbz %x[width], #0, 33f\n" + "st1 { v0.s }[2], [x26], #0x4\n" + "st1 { v20.s }[2], [x22], #0x4\n" + "b 33f\n" + "28:" // tail loop: unique 3: partial_0_8 + "tbz %x[width], #0, 33f\n" + "str s0, [x26], #0x4\n" + "str s20, [x22], #0x4\n" + "b 33f\n" + "29:" // tail loop: unique 3: partial_2_0 + "tbz %x[width], #2, 31f\n" + "str q2, [x26, #0x0]\n" + "add x26, x26, #0x10\n" + "str q22, [x22, #0x0]\n" + "add x22, x22, #0x10\n" + "tbz %x[width], #1, 30f\n" + "str d1, [x26], #0x8\n" + "str d21, [x22], #0x8\n" + "tbz %x[width], #0, 33f\n" + "st1 { v1.s }[2], [x26], #0x4\n" + "st1 { v21.s }[2], [x22], #0x4\n" + "b 33f\n" + "30:" // tail loop: unique 3: partial_0_4 + "tbz %x[width], #0, 33f\n" + "str s1, [x26], #0x4\n" + "str s21, [x22], #0x4\n" + "b 33f\n" + "31:" // tail loop: unique 3: partial_1_0 + "tbz %x[width], #1, 32f\n" + "str d2, [x26], #0x8\n" + "str d22, [x22], #0x8\n" + "tbz %x[width], #0, 33f\n" + "st1 { v2.s }[2], [x26], #0x4\n" + "st1 { v22.s }[2], [x22], #0x4\n" + "b 33f\n" + "32:" // tail loop: unique 3: partial_0_0 + "str s2, [x26], #0x4\n" + "str s22, [x22], #0x4\n" + "33:" // tail loop: unique 3: Done + "subs x20, x20, #0x2\n" + "bgt 8b\n" + "34:" // odd columns skip + : [bn_add] "+&r"(bn_add), 
[bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out), + [out_direct] "+&r"(out_direct), [width] "+&r"(width) + : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride), + [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)), + [out_direct_stride] "r"(out_direct_stride), [out_stride] "r"(out_stride) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", + "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"); +} +} // namespace + +namespace arm_compute +{ +namespace cpu +{ +void add_mul_add_fp32_neon(const ITensor *input1, + const ITensor *input2, + const ITensor *bn_mul, + const ITensor *bn_add, + ITensor *add_output, + ITensor *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info, + const Window &window) +{ + ARM_COMPUTE_UNUSED(policy); + + const size_t out_stride = final_output->info()->strides_in_bytes()[1]; + const size_t out_direct_stride = (add_output != nullptr) ? add_output->info()->strides_in_bytes()[1] : 0; + const size_t in0_stride = input1->info()->strides_in_bytes()[1]; + const size_t in1_stride = input2->info()->strides_in_bytes()[1]; + + float minval = std::numeric_limits<float>::lowest(); + float maxval = std::numeric_limits<float>::max(); + + if (act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU) + { + minval = 0.f; + } + else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) + { + minval = 0.f; + maxval = act_info.a(); + } + else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + { + minval = act_info.b(); + maxval = act_info.a(); + } + + // Clear X & Y dimensions on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + win.set(Window::DimY, Window::Dimension(0, 1, 1)); + + Iterator in1_it(input1, window); + Iterator in2_it(input2, window); + Iterator out_it(final_output, window); + + const size_t width = window.num_iterations(0); + const size_t height = window.num_iterations(1); + + if (add_output != nullptr) + { + Iterator add_out_it(add_output, window); + execute_window_loop( + win, + [&](const Coordinates &) + { + a64_add_bn_clamp_direct_fp32_2x16( + reinterpret_cast<float *>(out_it.ptr()), out_stride, reinterpret_cast<float *>(add_out_it.ptr()), + out_direct_stride, reinterpret_cast<float *>(in1_it.ptr()), in0_stride, + reinterpret_cast<float *>(in2_it.ptr()), in1_stride, reinterpret_cast<float *>(bn_mul->buffer()), + reinterpret_cast<float *>(bn_add->buffer()), minval, maxval, width, height); + }, + in1_it, in2_it, add_out_it, out_it); + } + else + { + execute_window_loop( + win, + [&](const Coordinates &) + { + a64_add_bn_clamp_direct_fp32_2x16( + reinterpret_cast<float *>(out_it.ptr()), out_stride, nullptr, out_direct_stride, + reinterpret_cast<float *>(in1_it.ptr()), in0_stride, reinterpret_cast<float *>(in2_it.ptr()), + in1_stride, reinterpret_cast<float *>(bn_mul->buffer()), + reinterpret_cast<float *>(bn_add->buffer()), minval, maxval, width, height); + }, + in1_it, in2_it, out_it); + } +} +} // namespace cpu +} // namespace arm_compute + +#endif // __aarch64__ diff --git a/src/cpu/kernels/addmuladd/generic/neon/qasymm8.cpp 
b/src/cpu/kernels/addmuladd/generic/neon/qasymm8.cpp new file mode 100644 index 0000000000..035805c944 --- /dev/null +++ b/src/cpu/kernels/addmuladd/generic/neon/qasymm8.cpp @@ -0,0 +1,846 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/QuantizationInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Window.h" +#include "arm_compute/function_info/ActivationLayerInfo.h" + +#include <cstddef> +#include <cstdint> +#include <limits> + +#ifdef __aarch64__ +namespace +{ +void a64_add_bn_clamp_direct_u8_fp32_2x16(uint8_t *out, + size_t out_stride, + uint8_t *out_direct, + size_t out_direct_stride, + const uint8_t *in0, + size_t in0_stride, + const uint8_t *in1, + size_t in1_stride, + const float *bn_mul, + const float *bn_add, + const uint8_t minval, + const uint8_t maxval, + int32_t out_zeropt, + float out_scale, + int32_t out_direct_zeropt, + float out_direct_scale, + int32_t in0_zeropt, + float in0_scale, + int32_t in1_zeropt, + float in1_scale, + size_t width, + size_t height) +{ + float scales[4] = {in0_scale, in1_scale, 1.0f / out_scale, 1.0f / out_direct_scale}; + struct KernelArgs + { + const float *scales; + int32_t in0_zeropt; + int32_t in1_zeropt; + int32_t out_zeropt; + int32_t out_direct_zeropt; + int32_t minval; + int32_t maxval; + } ka; + ka.scales = scales; + ka.in0_zeropt = in0_zeropt; + ka.in1_zeropt = in1_zeropt; + ka.out_zeropt = out_zeropt; + ka.out_direct_zeropt = out_direct_zeropt; + ka.minval = minval; + ka.maxval = maxval; + + __asm__ __volatile__( + "ldr x20, [%x[args_ptr], %[offsetof_scales]]\n" + "ld1 { v0.4s }, [x20]\n" + "cmp %x[width], #0x10\n" + "blt 5f\n" + "1:" // Column loop + "ldr q24, [%x[bn_mul], #0x0]\n" + "ldr q25, [%x[bn_mul], #0x10]\n" + "mov x23, %x[height]\n" + "mov x12, %x[in0]\n" + "ldr q26, [%x[bn_mul], #0x20]\n" + "ldr q27, [%x[bn_mul], #0x30]\n" + "mov x11, %x[in1]\n" + "mov x10, %x[out]\n" + "ldr q28, [%x[bn_add], #0x0]\n" + "ldr q29, [%x[bn_add], #0x10]\n" + "mov x9, %x[out_direct]\n" + "add %x[bn_mul], %x[bn_mul], #0x40\n" + "ldr q30, [%x[bn_add], #0x20]\n" + "ldr q31, [%x[bn_add], #0x30]\n" + "add %x[bn_add], %x[bn_add], #0x40\n" + "2:" // Row loop + "mov x28, x12\n" + "ldr d4, [x28, #0x0]\n" + "ldr d3, [x28, #0x8]\n" + "add x21, x28, %x[in0_stride]\n" + "mov x27, x11\n" + "ldr d13, [x27, #0x0]\n" + "ldr d12, 
[x27, #0x8]\n" + "cmp x23, #0x2\n" + "add x12, x21, %x[in0_stride]\n" + "csel x21, x21, x28, GE\n" + "ldr d2, [x21, #0x0]\n" + "ldr d11, [x21, #0x8]\n" + "add x20, x27, %x[in1_stride]\n" + "add x11, x20, %x[in1_stride]\n" + "ldr w21, [%x[args_ptr], %[offsetof_in0_zeropt]]\n" + "ushll v4.8h, v4.8b, #0x0\n" + "csel x20, x20, x27, GE\n" + "ldr d10, [x20, #0x0]\n" + "ldr d9, [x20, #0x8]\n" + "ushll v3.8h, v3.8b, #0x0\n" + "ushll v2.8h, v2.8b, #0x0\n" + "ushll v11.8h, v11.8b, #0x0\n" + "ldr w20, [%x[args_ptr], %[offsetof_in1_zeropt]]\n" + "mov x26, x10\n" + "dup v16.8h, w21\n" + "ushll v13.8h, v13.8b, #0x0\n" + "mov x25, x9\n" + "add x24, x26, %x[out_stride]\n" + "ushll v12.8h, v12.8b, #0x0\n" + "ushll v10.8h, v10.8b, #0x0\n" + "add x22, x25, %x[out_direct_stride]\n" + "add x10, x24, %x[out_stride]\n" + "ushll v9.8h, v9.8b, #0x0\n" + "ssubl v1.4s, v4.4h, v16.4h\n" + "add x9, x22, %x[out_direct_stride]\n" + "csel x24, x24, x26, GE\n" + "ssubl2 v4.4s, v4.8h, v16.8h\n" + "ssubl v23.4s, v3.4h, v16.4h\n" + "csel x22, x22, x25, GE\n" + "ssubl2 v3.4s, v3.8h, v16.8h\n" + "ssubl v22.4s, v2.4h, v16.4h\n" + "ssubl2 v2.4s, v2.8h, v16.8h\n" + "ssubl v21.4s, v11.4h, v16.4h\n" + "ssubl2 v11.4s, v11.8h, v16.8h\n" + "dup v20.8h, w20\n" + "ssubl v19.4s, v13.4h, v20.4h\n" + "ssubl2 v13.4s, v13.8h, v20.8h\n" + "ssubl v18.4s, v12.4h, v20.4h\n" + "ssubl2 v12.4s, v12.8h, v20.8h\n" + "ssubl v17.4s, v10.4h, v20.4h\n" + "ssubl2 v10.4s, v10.8h, v20.8h\n" + "ssubl v16.4s, v9.4h, v20.4h\n" + "ssubl2 v9.4s, v9.8h, v20.8h\n" + "scvtf v8.4s, v1.4s\n" + "scvtf v7.4s, v4.4s\n" + "scvtf v6.4s, v23.4s\n" + "scvtf v5.4s, v3.4s\n" + "scvtf v4.4s, v22.4s\n" + "scvtf v3.4s, v2.4s\n" + "scvtf v2.4s, v21.4s\n" + "scvtf v1.4s, v11.4s\n" + "scvtf v19.4s, v19.4s\n" + "fmul v8.4s, v8.4s, v0.s[0]\n" + "fmla v8.4s, v19.4s, v0.s[1]\n" + "scvtf v13.4s, v13.4s\n" + "fmul v7.4s, v7.4s, v0.s[0]\n" + "fmla v7.4s, v13.4s, v0.s[1]\n" + "scvtf v18.4s, v18.4s\n" + "fmul v6.4s, v6.4s, v0.s[0]\n" + "fmla v6.4s, v18.4s, v0.s[1]\n" + "scvtf v12.4s, v12.4s\n" + "fmul v5.4s, v5.4s, v0.s[0]\n" + "fmla v5.4s, v12.4s, v0.s[1]\n" + "scvtf v17.4s, v17.4s\n" + "fmul v4.4s, v4.4s, v0.s[0]\n" + "fmla v4.4s, v17.4s, v0.s[1]\n" + "scvtf v10.4s, v10.4s\n" + "fmul v3.4s, v3.4s, v0.s[0]\n" + "fmla v3.4s, v10.4s, v0.s[1]\n" + "scvtf v16.4s, v16.4s\n" + "fmul v2.4s, v2.4s, v0.s[0]\n" + "fmla v2.4s, v16.4s, v0.s[1]\n" + "scvtf v9.4s, v9.4s\n" + "fmul v1.4s, v1.4s, v0.s[0]\n" + "fmla v1.4s, v9.4s, v0.s[1]\n" + "cbz %x[out_direct], 3f\n" + "fmul v23.4s, v8.4s, v0.s[3]\n" + "fmul v22.4s, v7.4s, v0.s[3]\n" + "ldr w20, [%x[args_ptr], %[offsetof_out_direct_zeropt]]\n" + "fmul v21.4s, v6.4s, v0.s[3]\n" + "fmul v20.4s, v5.4s, v0.s[3]\n" + "fmul v19.4s, v4.4s, v0.s[3]\n" + "fmul v18.4s, v3.4s, v0.s[3]\n" + "fmul v16.4s, v2.4s, v0.s[3]\n" + "fmul v17.4s, v1.4s, v0.s[3]\n" + "fcvtas v23.4s, v23.4s\n" + "fcvtas v22.4s, v22.4s\n" + "fcvtas v21.4s, v21.4s\n" + "fcvtas v20.4s, v20.4s\n" + "fcvtas v19.4s, v19.4s\n" + "fcvtas v18.4s, v18.4s\n" + "fcvtas v16.4s, v16.4s\n" + "fcvtas v17.4s, v17.4s\n" + "uzp1 v22.8h, v23.8h, v22.8h\n" + "uzp1 v20.8h, v21.8h, v20.8h\n" + "uzp1 v18.8h, v19.8h, v18.8h\n" + "uzp1 v17.8h, v16.8h, v17.8h\n" + "dup v16.8h, w20\n" + "add v22.8h, v22.8h, v16.8h\n" + "add v20.8h, v20.8h, v16.8h\n" + "add v18.8h, v18.8h, v16.8h\n" + "add v17.8h, v17.8h, v16.8h\n" + "movi v16.8h, #0xff\n" + "smin v22.8h, v22.8h, v16.8h\n" + "smin v20.8h, v20.8h, v16.8h\n" + "smin v18.8h, v18.8h, v16.8h\n" + "smin v17.8h, v17.8h, v16.8h\n" + "movi v16.8h, #0x0\n" + "smax v22.8h, v22.8h, 
v16.8h\n" + "smax v20.8h, v20.8h, v16.8h\n" + "smax v18.8h, v18.8h, v16.8h\n" + "smax v17.8h, v17.8h, v16.8h\n" + "xtn v22.8b, v22.8h\n" + "str d22, [x25, #0x0]\n" + "xtn v20.8b, v20.8h\n" + "xtn v18.8b, v18.8h\n" + "str d20, [x25, #0x8]\n" + "xtn v17.8b, v17.8h\n" + "str d18, [x22, #0x0]\n" + "str d17, [x22, #0x8]\n" + "3:" // Main loop: No direct output + "mov v19.16b, v28.16b\n" + "mov v13.16b, v29.16b\n" + "fmla v19.4s, v8.4s, v24.4s\n" + "ldr w22, [%x[args_ptr], %[offsetof_out_zeropt]]\n" + "mov v18.16b, v30.16b\n" + "mov v12.16b, v31.16b\n" + "fmla v13.4s, v7.4s, v25.4s\n" + "ldr w21, [%x[args_ptr], %[offsetof_maxval]]\n" + "mov v17.16b, v28.16b\n" + "mov v10.16b, v29.16b\n" + "fmla v18.4s, v6.4s, v26.4s\n" + "ldr w20, [%x[args_ptr], %[offsetof_minval]]\n" + "mov v16.16b, v30.16b\n" + "mov v9.16b, v31.16b\n" + "fmla v12.4s, v5.4s, v27.4s\n" + "subs x23, x23, #0x2\n" + "fmla v17.4s, v4.4s, v24.4s\n" + "fmla v10.4s, v3.4s, v25.4s\n" + "fmul v8.4s, v19.4s, v0.s[2]\n" + "fmla v16.4s, v2.4s, v26.4s\n" + "fmla v9.4s, v1.4s, v27.4s\n" + "fmul v7.4s, v13.4s, v0.s[2]\n" + "fmul v6.4s, v18.4s, v0.s[2]\n" + "fmul v5.4s, v12.4s, v0.s[2]\n" + "fmul v4.4s, v17.4s, v0.s[2]\n" + "fmul v3.4s, v10.4s, v0.s[2]\n" + "fmul v2.4s, v16.4s, v0.s[2]\n" + "fmul v1.4s, v9.4s, v0.s[2]\n" + "fcvtas v8.4s, v8.4s\n" + "fcvtas v7.4s, v7.4s\n" + "fcvtas v6.4s, v6.4s\n" + "fcvtas v5.4s, v5.4s\n" + "fcvtas v4.4s, v4.4s\n" + "fcvtas v3.4s, v3.4s\n" + "fcvtas v2.4s, v2.4s\n" + "fcvtas v1.4s, v1.4s\n" + "uzp1 v7.8h, v8.8h, v7.8h\n" + "uzp1 v5.8h, v6.8h, v5.8h\n" + "uzp1 v3.8h, v4.8h, v3.8h\n" + "uzp1 v1.8h, v2.8h, v1.8h\n" + "dup v16.8h, w22\n" + "add v7.8h, v7.8h, v16.8h\n" + "add v5.8h, v5.8h, v16.8h\n" + "add v3.8h, v3.8h, v16.8h\n" + "add v1.8h, v1.8h, v16.8h\n" + "dup v16.8h, w21\n" + "smin v7.8h, v7.8h, v16.8h\n" + "smin v5.8h, v5.8h, v16.8h\n" + "smin v3.8h, v3.8h, v16.8h\n" + "smin v1.8h, v1.8h, v16.8h\n" + "dup v16.8h, w20\n" + "smax v7.8h, v7.8h, v16.8h\n" + "smax v5.8h, v5.8h, v16.8h\n" + "smax v3.8h, v3.8h, v16.8h\n" + "smax v1.8h, v1.8h, v16.8h\n" + "xtn v7.8b, v7.8h\n" + "str d7, [x26, #0x0]\n" + "xtn v5.8b, v5.8h\n" + "xtn v3.8b, v3.8h\n" + "str d5, [x26, #0x8]\n" + "xtn v1.8b, v1.8h\n" + "str d3, [x24, #0x0]\n" + "str d1, [x24, #0x8]\n" + "bgt 2b\n" + "add %x[in0], %x[in0], #0x10\n" + "add %x[in1], %x[in1], #0x10\n" + "add %x[out], %x[out], #0x10\n" + "cbz %x[out_direct], 4f\n" + "add %x[out_direct], %x[out_direct], #0x10\n" + "4:" // No direct pointer update + "sub %x[width], %x[width], #0x10\n" + "cmp %x[width], #0x10\n" + "bge 1b\n" + "cbz %x[width], 32f\n" + "5:" // main loop skip + "ldr q24, [%x[bn_mul], #0x0]\n" + "ldr q25, [%x[bn_mul], #0x10]\n" + "mov x23, %x[height]\n" + "mov x12, %x[in0]\n" + "ldr q26, [%x[bn_mul], #0x20]\n" + "ldr q27, [%x[bn_mul], #0x30]\n" + "mov x11, %x[in1]\n" + "mov x10, %x[out]\n" + "ldr q28, [%x[bn_add], #0x0]\n" + "ldr q29, [%x[bn_add], #0x10]\n" + "mov x9, %x[out_direct]\n" + "add %x[bn_mul], %x[bn_mul], #0x40\n" + "ldr q30, [%x[bn_add], #0x20]\n" + "ldr q31, [%x[bn_add], #0x30]\n" + "add %x[bn_add], %x[bn_add], #0x40\n" + "6:" // tail loop: Row loop + "mov x28, x12\n" + "mov x27, x11\n" + "mov x26, x10\n" + "mov x25, x9\n" + "add x21, x28, %x[in0_stride]\n" + "add x20, x27, %x[in1_stride]\n" + "add x24, x26, %x[out_stride]\n" + "add x22, x25, %x[out_direct_stride]\n" + "cmp x23, #0x2\n" + "add x12, x21, %x[in0_stride]\n" + "add x11, x20, %x[in1_stride]\n" + "add x10, x24, %x[out_stride]\n" + "add x9, x22, %x[out_direct_stride]\n" + "csel x21, x21, x28, GE\n" + "csel 
x20, x20, x27, GE\n" + "csel x24, x24, x26, GE\n" + "csel x22, x22, x25, GE\n" + "tbz %x[width], #3, 10f\n" + "ldr d4, [x28, #0x0]\n" + "ldr d13, [x27, #0x0]\n" + "add x28, x28, #0x8\n" + "add x27, x27, #0x8\n" + "ldr d2, [x21, #0x0]\n" + "ldr d10, [x20, #0x0]\n" + "add x21, x21, #0x8\n" + "add x20, x20, #0x8\n" + "tbz %x[width], #2, 8f\n" + "ldr s3, [x28], #0x4\n" + "ldr s12, [x27], #0x4\n" + "ldr s11, [x21], #0x4\n" + "ldr s9, [x20], #0x4\n" + "tbz %x[width], #1, 7f\n" + "ld1 { v3.h }[2], [x28], #0x2\n" + "ld1 { v12.h }[2], [x27], #0x2\n" + "ld1 { v11.h }[2], [x21], #0x2\n" + "ld1 { v9.h }[2], [x20], #0x2\n" + "tbz %x[width], #0, 14f\n" + "ld1 { v3.b }[6], [x28], #0x1\n" + "ld1 { v12.b }[6], [x27], #0x1\n" + "ld1 { v11.b }[6], [x21], #0x1\n" + "ld1 { v9.b }[6], [x20], #0x1\n" + "b 14f\n" + "7:" // tail loop: unique 1: partial_0_12 + "tbz %x[width], #0, 14f\n" + "ld1 { v3.b }[4], [x28], #0x1\n" + "ld1 { v12.b }[4], [x27], #0x1\n" + "ld1 { v11.b }[4], [x21], #0x1\n" + "ld1 { v9.b }[4], [x20], #0x1\n" + "b 14f\n" + "8:" // tail loop: unique 1: partial_1_8 + "tbz %x[width], #1, 9f\n" + "ldr h3, [x28], #0x2\n" + "ldr h12, [x27], #0x2\n" + "ldr h11, [x21], #0x2\n" + "ldr h9, [x20], #0x2\n" + "tbz %x[width], #0, 14f\n" + "ld1 { v3.b }[2], [x28], #0x1\n" + "ld1 { v12.b }[2], [x27], #0x1\n" + "ld1 { v11.b }[2], [x21], #0x1\n" + "ld1 { v9.b }[2], [x20], #0x1\n" + "b 14f\n" + "9:" // tail loop: unique 1: partial_0_8 + "tbz %x[width], #0, 14f\n" + "ldr b3, [x28], #0x1\n" + "ldr b12, [x27], #0x1\n" + "ldr b11, [x21], #0x1\n" + "ldr b9, [x20], #0x1\n" + "b 14f\n" + "10:" // tail loop: unique 1: partial_2_0 + "tbz %x[width], #2, 12f\n" + "ldr s4, [x28], #0x4\n" + "ldr s13, [x27], #0x4\n" + "ldr s2, [x21], #0x4\n" + "ldr s10, [x20], #0x4\n" + "tbz %x[width], #1, 11f\n" + "ld1 { v4.h }[2], [x28], #0x2\n" + "ld1 { v13.h }[2], [x27], #0x2\n" + "ld1 { v2.h }[2], [x21], #0x2\n" + "ld1 { v10.h }[2], [x20], #0x2\n" + "tbz %x[width], #0, 14f\n" + "ld1 { v4.b }[6], [x28], #0x1\n" + "ld1 { v13.b }[6], [x27], #0x1\n" + "ld1 { v2.b }[6], [x21], #0x1\n" + "ld1 { v10.b }[6], [x20], #0x1\n" + "b 14f\n" + "11:" // tail loop: unique 1: partial_0_4 + "tbz %x[width], #0, 14f\n" + "ld1 { v4.b }[4], [x28], #0x1\n" + "ld1 { v13.b }[4], [x27], #0x1\n" + "ld1 { v2.b }[4], [x21], #0x1\n" + "ld1 { v10.b }[4], [x20], #0x1\n" + "b 14f\n" + "12:" // tail loop: unique 1: partial_1_0 + "tbz %x[width], #1, 13f\n" + "ldr h4, [x28], #0x2\n" + "ldr h13, [x27], #0x2\n" + "ldr h2, [x21], #0x2\n" + "ldr h10, [x20], #0x2\n" + "tbz %x[width], #0, 14f\n" + "ld1 { v4.b }[2], [x28], #0x1\n" + "ld1 { v13.b }[2], [x27], #0x1\n" + "ld1 { v2.b }[2], [x21], #0x1\n" + "ld1 { v10.b }[2], [x20], #0x1\n" + "b 14f\n" + "13:" // tail loop: unique 1: partial_0_0 + "ldr b4, [x28], #0x1\n" + "ldr b13, [x27], #0x1\n" + "ldr b2, [x21], #0x1\n" + "ldr b10, [x20], #0x1\n" + "14:" // tail loop: unique 1: Done + "ldr w21, [%x[args_ptr], %[offsetof_in0_zeropt]]\n" + "ushll v4.8h, v4.8b, #0x0\n" + "ushll v3.8h, v3.8b, #0x0\n" + "ldr w20, [%x[args_ptr], %[offsetof_in1_zeropt]]\n" + "ushll v2.8h, v2.8b, #0x0\n" + "ushll v11.8h, v11.8b, #0x0\n" + "dup v16.8h, w21\n" + "ushll v13.8h, v13.8b, #0x0\n" + "ushll v12.8h, v12.8b, #0x0\n" + "ushll v10.8h, v10.8b, #0x0\n" + "ushll v9.8h, v9.8b, #0x0\n" + "ssubl v1.4s, v4.4h, v16.4h\n" + "ssubl2 v4.4s, v4.8h, v16.8h\n" + "ssubl v23.4s, v3.4h, v16.4h\n" + "ssubl2 v3.4s, v3.8h, v16.8h\n" + "ssubl v22.4s, v2.4h, v16.4h\n" + "ssubl2 v2.4s, v2.8h, v16.8h\n" + "ssubl v21.4s, v11.4h, v16.4h\n" + "ssubl2 v11.4s, v11.8h, v16.8h\n" + "dup 
v20.8h, w20\n" + "ssubl v19.4s, v13.4h, v20.4h\n" + "ssubl2 v13.4s, v13.8h, v20.8h\n" + "ssubl v18.4s, v12.4h, v20.4h\n" + "ssubl2 v12.4s, v12.8h, v20.8h\n" + "ssubl v17.4s, v10.4h, v20.4h\n" + "ssubl2 v10.4s, v10.8h, v20.8h\n" + "ssubl v16.4s, v9.4h, v20.4h\n" + "ssubl2 v9.4s, v9.8h, v20.8h\n" + "scvtf v8.4s, v1.4s\n" + "scvtf v7.4s, v4.4s\n" + "scvtf v6.4s, v23.4s\n" + "scvtf v5.4s, v3.4s\n" + "scvtf v4.4s, v22.4s\n" + "scvtf v3.4s, v2.4s\n" + "scvtf v2.4s, v21.4s\n" + "scvtf v1.4s, v11.4s\n" + "scvtf v19.4s, v19.4s\n" + "fmul v8.4s, v8.4s, v0.s[0]\n" + "fmla v8.4s, v19.4s, v0.s[1]\n" + "scvtf v13.4s, v13.4s\n" + "fmul v7.4s, v7.4s, v0.s[0]\n" + "fmla v7.4s, v13.4s, v0.s[1]\n" + "scvtf v18.4s, v18.4s\n" + "fmul v6.4s, v6.4s, v0.s[0]\n" + "fmla v6.4s, v18.4s, v0.s[1]\n" + "scvtf v12.4s, v12.4s\n" + "fmul v5.4s, v5.4s, v0.s[0]\n" + "fmla v5.4s, v12.4s, v0.s[1]\n" + "scvtf v17.4s, v17.4s\n" + "fmul v4.4s, v4.4s, v0.s[0]\n" + "fmla v4.4s, v17.4s, v0.s[1]\n" + "scvtf v10.4s, v10.4s\n" + "fmul v3.4s, v3.4s, v0.s[0]\n" + "fmla v3.4s, v10.4s, v0.s[1]\n" + "scvtf v16.4s, v16.4s\n" + "fmul v2.4s, v2.4s, v0.s[0]\n" + "fmla v2.4s, v16.4s, v0.s[1]\n" + "scvtf v9.4s, v9.4s\n" + "fmul v1.4s, v1.4s, v0.s[0]\n" + "fmla v1.4s, v9.4s, v0.s[1]\n" + "cbz %x[out_direct], 23f\n" + "fmul v23.4s, v8.4s, v0.s[3]\n" + "fmul v22.4s, v7.4s, v0.s[3]\n" + "ldr w20, [%x[args_ptr], %[offsetof_out_direct_zeropt]]\n" + "fmul v21.4s, v6.4s, v0.s[3]\n" + "fmul v20.4s, v5.4s, v0.s[3]\n" + "fmul v19.4s, v4.4s, v0.s[3]\n" + "fmul v18.4s, v3.4s, v0.s[3]\n" + "fmul v16.4s, v2.4s, v0.s[3]\n" + "fmul v17.4s, v1.4s, v0.s[3]\n" + "fcvtas v23.4s, v23.4s\n" + "fcvtas v22.4s, v22.4s\n" + "fcvtas v21.4s, v21.4s\n" + "fcvtas v20.4s, v20.4s\n" + "fcvtas v19.4s, v19.4s\n" + "fcvtas v18.4s, v18.4s\n" + "fcvtas v16.4s, v16.4s\n" + "fcvtas v17.4s, v17.4s\n" + "uzp1 v22.8h, v23.8h, v22.8h\n" + "uzp1 v20.8h, v21.8h, v20.8h\n" + "uzp1 v18.8h, v19.8h, v18.8h\n" + "uzp1 v17.8h, v16.8h, v17.8h\n" + "dup v16.8h, w20\n" + "add v22.8h, v22.8h, v16.8h\n" + "add v20.8h, v20.8h, v16.8h\n" + "add v18.8h, v18.8h, v16.8h\n" + "add v17.8h, v17.8h, v16.8h\n" + "movi v16.8h, #0xff\n" + "smin v22.8h, v22.8h, v16.8h\n" + "smin v20.8h, v20.8h, v16.8h\n" + "smin v18.8h, v18.8h, v16.8h\n" + "smin v17.8h, v17.8h, v16.8h\n" + "movi v16.8h, #0x0\n" + "smax v22.8h, v22.8h, v16.8h\n" + "smax v20.8h, v20.8h, v16.8h\n" + "smax v18.8h, v18.8h, v16.8h\n" + "smax v17.8h, v17.8h, v16.8h\n" + "xtn v22.8b, v22.8h\n" + "xtn v20.8b, v20.8h\n" + "xtn v18.8b, v18.8h\n" + "xtn v17.8b, v17.8h\n" + "tbz %x[width], #3, 18f\n" + "str d22, [x25, #0x0]\n" + "add x25, x25, #0x8\n" + "str d18, [x22, #0x0]\n" + "add x22, x22, #0x8\n" + "tbz %x[width], #2, 16f\n" + "str s20, [x25], #0x4\n" + "str s17, [x22], #0x4\n" + "tbz %x[width], #1, 15f\n" + "st1 { v20.h }[2], [x25], #0x2\n" + "st1 { v17.h }[2], [x22], #0x2\n" + "tbz %x[width], #0, 22f\n" + "st1 { v20.b }[6], [x25], #0x1\n" + "st1 { v17.b }[6], [x22], #0x1\n" + "b 22f\n" + "15:" // tail loop: Main loop: unique 2: partial_0_12 + "tbz %x[width], #0, 22f\n" + "st1 { v20.b }[4], [x25], #0x1\n" + "st1 { v17.b }[4], [x22], #0x1\n" + "b 22f\n" + "16:" // tail loop: Main loop: unique 2: partial_1_8 + "tbz %x[width], #1, 17f\n" + "str h20, [x25], #0x2\n" + "str h17, [x22], #0x2\n" + "tbz %x[width], #0, 22f\n" + "st1 { v20.b }[2], [x25], #0x1\n" + "st1 { v17.b }[2], [x22], #0x1\n" + "b 22f\n" + "17:" // tail loop: Main loop: unique 2: partial_0_8 + "tbz %x[width], #0, 22f\n" + "str b20, [x25], #0x1\n" + "str b17, [x22], #0x1\n" + "b 22f\n" + "18:" 
// tail loop: Main loop: unique 2: partial_2_0 + "tbz %x[width], #2, 20f\n" + "str s22, [x25], #0x4\n" + "str s18, [x22], #0x4\n" + "tbz %x[width], #1, 19f\n" + "st1 { v22.h }[2], [x25], #0x2\n" + "st1 { v18.h }[2], [x22], #0x2\n" + "tbz %x[width], #0, 22f\n" + "st1 { v22.b }[6], [x25], #0x1\n" + "st1 { v18.b }[6], [x22], #0x1\n" + "b 22f\n" + "19:" // tail loop: Main loop: unique 2: partial_0_4 + "tbz %x[width], #0, 22f\n" + "st1 { v22.b }[4], [x25], #0x1\n" + "st1 { v18.b }[4], [x22], #0x1\n" + "b 22f\n" + "20:" // tail loop: Main loop: unique 2: partial_1_0 + "tbz %x[width], #1, 21f\n" + "str h22, [x25], #0x2\n" + "str h18, [x22], #0x2\n" + "tbz %x[width], #0, 22f\n" + "st1 { v22.b }[2], [x25], #0x1\n" + "st1 { v18.b }[2], [x22], #0x1\n" + "b 22f\n" + "21:" // tail loop: Main loop: unique 2: partial_0_0 + "str b22, [x25], #0x1\n" + "str b18, [x22], #0x1\n" + "22:" // tail loop: Main loop: unique 2: Done + "23:" // tail loop: Main loop: No direct output + "mov v19.16b, v28.16b\n" + "mov v13.16b, v29.16b\n" + "fmla v19.4s, v8.4s, v24.4s\n" + "ldr w22, [%x[args_ptr], %[offsetof_out_zeropt]]\n" + "mov v18.16b, v30.16b\n" + "mov v12.16b, v31.16b\n" + "fmla v13.4s, v7.4s, v25.4s\n" + "ldr w21, [%x[args_ptr], %[offsetof_maxval]]\n" + "mov v17.16b, v28.16b\n" + "mov v10.16b, v29.16b\n" + "fmla v18.4s, v6.4s, v26.4s\n" + "ldr w20, [%x[args_ptr], %[offsetof_minval]]\n" + "mov v16.16b, v30.16b\n" + "mov v9.16b, v31.16b\n" + "fmla v12.4s, v5.4s, v27.4s\n" + "fmla v17.4s, v4.4s, v24.4s\n" + "fmla v10.4s, v3.4s, v25.4s\n" + "fmul v8.4s, v19.4s, v0.s[2]\n" + "fmla v16.4s, v2.4s, v26.4s\n" + "fmla v9.4s, v1.4s, v27.4s\n" + "fmul v7.4s, v13.4s, v0.s[2]\n" + "fmul v6.4s, v18.4s, v0.s[2]\n" + "fmul v5.4s, v12.4s, v0.s[2]\n" + "fmul v4.4s, v17.4s, v0.s[2]\n" + "fmul v3.4s, v10.4s, v0.s[2]\n" + "fmul v2.4s, v16.4s, v0.s[2]\n" + "fmul v1.4s, v9.4s, v0.s[2]\n" + "fcvtas v8.4s, v8.4s\n" + "fcvtas v7.4s, v7.4s\n" + "fcvtas v6.4s, v6.4s\n" + "fcvtas v5.4s, v5.4s\n" + "fcvtas v4.4s, v4.4s\n" + "fcvtas v3.4s, v3.4s\n" + "fcvtas v2.4s, v2.4s\n" + "fcvtas v1.4s, v1.4s\n" + "uzp1 v7.8h, v8.8h, v7.8h\n" + "uzp1 v5.8h, v6.8h, v5.8h\n" + "uzp1 v3.8h, v4.8h, v3.8h\n" + "uzp1 v1.8h, v2.8h, v1.8h\n" + "dup v16.8h, w22\n" + "add v7.8h, v7.8h, v16.8h\n" + "add v5.8h, v5.8h, v16.8h\n" + "add v3.8h, v3.8h, v16.8h\n" + "add v1.8h, v1.8h, v16.8h\n" + "dup v16.8h, w21\n" + "smin v7.8h, v7.8h, v16.8h\n" + "smin v5.8h, v5.8h, v16.8h\n" + "smin v3.8h, v3.8h, v16.8h\n" + "smin v1.8h, v1.8h, v16.8h\n" + "dup v16.8h, w20\n" + "smax v7.8h, v7.8h, v16.8h\n" + "smax v5.8h, v5.8h, v16.8h\n" + "smax v3.8h, v3.8h, v16.8h\n" + "smax v1.8h, v1.8h, v16.8h\n" + "xtn v7.8b, v7.8h\n" + "xtn v5.8b, v5.8h\n" + "xtn v3.8b, v3.8h\n" + "xtn v1.8b, v1.8h\n" + "tbz %x[width], #3, 27f\n" + "str d7, [x26, #0x0]\n" + "add x26, x26, #0x8\n" + "str d3, [x24, #0x0]\n" + "add x24, x24, #0x8\n" + "tbz %x[width], #2, 25f\n" + "str s5, [x26], #0x4\n" + "str s1, [x24], #0x4\n" + "tbz %x[width], #1, 24f\n" + "st1 { v5.h }[2], [x26], #0x2\n" + "st1 { v1.h }[2], [x24], #0x2\n" + "tbz %x[width], #0, 31f\n" + "st1 { v5.b }[6], [x26], #0x1\n" + "st1 { v1.b }[6], [x24], #0x1\n" + "b 31f\n" + "24:" // tail loop: unique 3: partial_0_12 + "tbz %x[width], #0, 31f\n" + "st1 { v5.b }[4], [x26], #0x1\n" + "st1 { v1.b }[4], [x24], #0x1\n" + "b 31f\n" + "25:" // tail loop: unique 3: partial_1_8 + "tbz %x[width], #1, 26f\n" + "str h5, [x26], #0x2\n" + "str h1, [x24], #0x2\n" + "tbz %x[width], #0, 31f\n" + "st1 { v5.b }[2], [x26], #0x1\n" + "st1 { v1.b }[2], [x24], #0x1\n" + "b 
31f\n" + "26:" // tail loop: unique 3: partial_0_8 + "tbz %x[width], #0, 31f\n" + "str b5, [x26], #0x1\n" + "str b1, [x24], #0x1\n" + "b 31f\n" + "27:" // tail loop: unique 3: partial_2_0 + "tbz %x[width], #2, 29f\n" + "str s7, [x26], #0x4\n" + "str s3, [x24], #0x4\n" + "tbz %x[width], #1, 28f\n" + "st1 { v7.h }[2], [x26], #0x2\n" + "st1 { v3.h }[2], [x24], #0x2\n" + "tbz %x[width], #0, 31f\n" + "st1 { v7.b }[6], [x26], #0x1\n" + "st1 { v3.b }[6], [x24], #0x1\n" + "b 31f\n" + "28:" // tail loop: unique 3: partial_0_4 + "tbz %x[width], #0, 31f\n" + "st1 { v7.b }[4], [x26], #0x1\n" + "st1 { v3.b }[4], [x24], #0x1\n" + "b 31f\n" + "29:" // tail loop: unique 3: partial_1_0 + "tbz %x[width], #1, 30f\n" + "str h7, [x26], #0x2\n" + "str h3, [x24], #0x2\n" + "tbz %x[width], #0, 31f\n" + "st1 { v7.b }[2], [x26], #0x1\n" + "st1 { v3.b }[2], [x24], #0x1\n" + "b 31f\n" + "30:" // tail loop: unique 3: partial_0_0 + "str b7, [x26], #0x1\n" + "str b3, [x24], #0x1\n" + "31:" // tail loop: unique 3: Done + "subs x23, x23, #0x2\n" + "bgt 6b\n" + "32:" // odd columns skip + : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out), + [out_direct] "+&r"(out_direct), [width] "+&r"(width) + : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride), + [offsetof_in0_zeropt] "I"(offsetof(KernelArgs, in0_zeropt)), + [offsetof_in1_zeropt] "I"(offsetof(KernelArgs, in1_zeropt)), + [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)), + [offsetof_out_direct_zeropt] "I"(offsetof(KernelArgs, out_direct_zeropt)), + [offsetof_out_zeropt] "I"(offsetof(KernelArgs, out_zeropt)), + [offsetof_scales] "I"(offsetof(KernelArgs, scales)), [out_direct_stride] "r"(out_direct_stride), + [out_stride] "r"(out_stride) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", + "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"); +} + +} // namespace + +namespace arm_compute +{ +namespace cpu +{ +void add_mul_add_u8_neon(const ITensor *input1, + const ITensor *input2, + const ITensor *bn_mul, + const ITensor *bn_add, + ITensor *add_output, + ITensor *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info, + const Window &window) +{ + ARM_COMPUTE_UNUSED(policy); + + const ITensorInfo *final_output_info = final_output->info(); + const ITensorInfo *add_output_info = (add_output != nullptr) ? add_output->info() : nullptr; + const ITensorInfo *input1_info = input1->info(); + const ITensorInfo *input2_info = input2->info(); + + const size_t out_stride = final_output_info->strides_in_bytes()[1]; + const size_t out_direct_stride = (add_output != nullptr) ? 
add_output_info->strides_in_bytes()[1] : 0; + const size_t in0_stride = input1_info->strides_in_bytes()[1]; + const size_t in1_stride = input2_info->strides_in_bytes()[1]; + + uint8_t minval = std::numeric_limits<uint8_t>::lowest(); + uint8_t maxval = std::numeric_limits<uint8_t>::max(); + + const UniformQuantizationInfo final_output_qinfo = final_output_info->quantization_info().uniform(); + if (act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU) + { + minval = quantize_qasymm8(0.f, final_output_qinfo); + } + else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) + { + minval = quantize_qasymm8(0.f, final_output_qinfo); + maxval = quantize_qasymm8(act_info.a(), final_output_qinfo); + } + else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + { + minval = quantize_qasymm8(act_info.b(), final_output_qinfo); + maxval = quantize_qasymm8(act_info.a(), final_output_qinfo); + } + + const UniformQuantizationInfo in1_qinfo = input1_info->quantization_info().uniform(); + const UniformQuantizationInfo in2_qinfo = input2_info->quantization_info().uniform(); + const UniformQuantizationInfo add_output_qinfo = + (add_output != nullptr) ? add_output_info->quantization_info().uniform() : UniformQuantizationInfo(); + + const int32_t in1_offset = in1_qinfo.offset; + const int32_t in2_offset = in2_qinfo.offset; + const int32_t out_offset = final_output_qinfo.offset; + const int32_t out_direct_offset = add_output_qinfo.offset; + + const float in1_scale = in1_qinfo.scale; + const float in2_scale = in2_qinfo.scale; + const float out_scale = final_output_qinfo.scale; + const float out_direct_scale = add_output_qinfo.scale; + + const float *bn_mul_buffer = reinterpret_cast<float *>(bn_mul->buffer()); + const float *bn_add_buffer = reinterpret_cast<float *>(bn_add->buffer()); + + // Clear X & Y dimensions on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + win.set(Window::DimY, Window::Dimension(0, 1, 1)); + + Iterator in1_it(input1, window); + Iterator in2_it(input2, window); + Iterator out_it(final_output, window); + + const size_t width = window.num_iterations(0); + const size_t height = window.num_iterations(1); + + if (add_output != nullptr) + { + Iterator add_out_it(add_output, window); + execute_window_loop( + win, + [&](const Coordinates &) + { + a64_add_bn_clamp_direct_u8_fp32_2x16( + reinterpret_cast<uint8_t *>(out_it.ptr()), out_stride, + reinterpret_cast<uint8_t *>(add_out_it.ptr()), out_direct_stride, + reinterpret_cast<uint8_t *>(in1_it.ptr()), in0_stride, reinterpret_cast<uint8_t *>(in2_it.ptr()), + in1_stride, bn_mul_buffer, bn_add_buffer, minval, maxval, out_offset, out_scale, out_direct_offset, + out_direct_scale, in1_offset, in1_scale, in2_offset, in2_scale, width, height); + }, + in1_it, in2_it, add_out_it, out_it); + } + else + { + execute_window_loop( + win, + [&](const Coordinates &) + { + a64_add_bn_clamp_direct_u8_fp32_2x16( + reinterpret_cast<uint8_t *>(out_it.ptr()), out_stride, nullptr, out_direct_stride, + reinterpret_cast<uint8_t *>(in1_it.ptr()), in0_stride, reinterpret_cast<uint8_t *>(in2_it.ptr()), + in1_stride, bn_mul_buffer, bn_add_buffer, minval, maxval, out_offset, out_scale, out_direct_offset, + out_direct_scale, in1_offset, in1_scale, in2_offset, in2_scale, width, height); + }, + in1_it, in2_it, out_it); + } +} +} // namespace cpu +} // namespace arm_compute + +#endif // __aarch64__ diff --git 
a/src/cpu/kernels/addmuladd/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/addmuladd/generic/neon/qasymm8_signed.cpp new file mode 100644 index 0000000000..e1a45b467b --- /dev/null +++ b/src/cpu/kernels/addmuladd/generic/neon/qasymm8_signed.cpp @@ -0,0 +1,846 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/QuantizationInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Window.h" +#include "arm_compute/function_info/ActivationLayerInfo.h" + +#include <cstddef> +#include <cstdint> +#include <limits> + +#ifdef __aarch64__ +namespace +{ +void a64_add_bn_clamp_direct_s8_fp32_2x16(int8_t *out, + size_t out_stride, + int8_t *out_direct, + size_t out_direct_stride, + const int8_t *in0, + size_t in0_stride, + const int8_t *in1, + size_t in1_stride, + const float *bn_mul, + const float *bn_add, + const int8_t minval, + const int8_t maxval, + int32_t out_zeropt, + float out_scale, + int32_t out_direct_zeropt, + float out_direct_scale, + int32_t in0_zeropt, + float in0_scale, + int32_t in1_zeropt, + float in1_scale, + size_t width, + size_t height) +{ + float scales[4] = {in0_scale, in1_scale, 1.0f / out_scale, 1.0f / out_direct_scale}; + struct KernelArgs + { + const float *scales; + int32_t in0_zeropt; + int32_t in1_zeropt; + int32_t out_zeropt; + int32_t out_direct_zeropt; + int32_t minval; + int32_t maxval; + } ka; + ka.scales = scales; + ka.in0_zeropt = in0_zeropt; + ka.in1_zeropt = in1_zeropt; + ka.out_zeropt = out_zeropt; + ka.out_direct_zeropt = out_direct_zeropt; + ka.minval = minval; + ka.maxval = maxval; + + __asm__ __volatile__( + "ldr x20, [%x[args_ptr], %[offsetof_scales]]\n" + "ld1 { v0.4s }, [x20]\n" + "cmp %x[width], #0x10\n" + "blt 5f\n" + "1:" // Column loop + "ldr q24, [%x[bn_mul], #0x0]\n" + "ldr q25, [%x[bn_mul], #0x10]\n" + "mov x23, %x[height]\n" + "mov x12, %x[in0]\n" + "ldr q26, [%x[bn_mul], #0x20]\n" + "ldr q27, [%x[bn_mul], #0x30]\n" + "mov x11, %x[in1]\n" + "mov x10, %x[out]\n" + "ldr q28, [%x[bn_add], #0x0]\n" + "ldr q29, [%x[bn_add], #0x10]\n" + "mov x9, %x[out_direct]\n" + "add %x[bn_mul], %x[bn_mul], #0x40\n" + "ldr q30, [%x[bn_add], #0x20]\n" + "ldr q31, [%x[bn_add], #0x30]\n" + "add %x[bn_add], %x[bn_add], #0x40\n" + "2:" // Row loop + "mov x28, x12\n" + "ldr d4, [x28, #0x0]\n" + "ldr d3, [x28, #0x8]\n" + "add x21, x28, 
%x[in0_stride]\n" + "mov x27, x11\n" + "ldr d13, [x27, #0x0]\n" + "ldr d12, [x27, #0x8]\n" + "cmp x23, #0x2\n" + "add x12, x21, %x[in0_stride]\n" + "csel x21, x21, x28, GE\n" + "ldr d2, [x21, #0x0]\n" + "ldr d11, [x21, #0x8]\n" + "add x20, x27, %x[in1_stride]\n" + "add x11, x20, %x[in1_stride]\n" + "ldr w21, [%x[args_ptr], %[offsetof_in0_zeropt]]\n" + "sshll v4.8h, v4.8b, #0x0\n" + "csel x20, x20, x27, GE\n" + "ldr d10, [x20, #0x0]\n" + "ldr d9, [x20, #0x8]\n" + "sshll v3.8h, v3.8b, #0x0\n" + "sshll v2.8h, v2.8b, #0x0\n" + "sshll v11.8h, v11.8b, #0x0\n" + "ldr w20, [%x[args_ptr], %[offsetof_in1_zeropt]]\n" + "mov x26, x10\n" + "dup v16.8h, w21\n" + "sshll v13.8h, v13.8b, #0x0\n" + "mov x25, x9\n" + "add x24, x26, %x[out_stride]\n" + "sshll v12.8h, v12.8b, #0x0\n" + "sshll v10.8h, v10.8b, #0x0\n" + "add x22, x25, %x[out_direct_stride]\n" + "add x10, x24, %x[out_stride]\n" + "sshll v9.8h, v9.8b, #0x0\n" + "ssubl v1.4s, v4.4h, v16.4h\n" + "add x9, x22, %x[out_direct_stride]\n" + "csel x24, x24, x26, GE\n" + "ssubl2 v4.4s, v4.8h, v16.8h\n" + "ssubl v23.4s, v3.4h, v16.4h\n" + "csel x22, x22, x25, GE\n" + "ssubl2 v3.4s, v3.8h, v16.8h\n" + "ssubl v22.4s, v2.4h, v16.4h\n" + "ssubl2 v2.4s, v2.8h, v16.8h\n" + "ssubl v21.4s, v11.4h, v16.4h\n" + "ssubl2 v11.4s, v11.8h, v16.8h\n" + "dup v20.8h, w20\n" + "ssubl v19.4s, v13.4h, v20.4h\n" + "ssubl2 v13.4s, v13.8h, v20.8h\n" + "ssubl v18.4s, v12.4h, v20.4h\n" + "ssubl2 v12.4s, v12.8h, v20.8h\n" + "ssubl v17.4s, v10.4h, v20.4h\n" + "ssubl2 v10.4s, v10.8h, v20.8h\n" + "ssubl v16.4s, v9.4h, v20.4h\n" + "ssubl2 v9.4s, v9.8h, v20.8h\n" + "scvtf v8.4s, v1.4s\n" + "scvtf v7.4s, v4.4s\n" + "scvtf v6.4s, v23.4s\n" + "scvtf v5.4s, v3.4s\n" + "scvtf v4.4s, v22.4s\n" + "scvtf v3.4s, v2.4s\n" + "scvtf v2.4s, v21.4s\n" + "scvtf v1.4s, v11.4s\n" + "scvtf v19.4s, v19.4s\n" + "fmul v8.4s, v8.4s, v0.s[0]\n" + "fmla v8.4s, v19.4s, v0.s[1]\n" + "scvtf v13.4s, v13.4s\n" + "fmul v7.4s, v7.4s, v0.s[0]\n" + "fmla v7.4s, v13.4s, v0.s[1]\n" + "scvtf v18.4s, v18.4s\n" + "fmul v6.4s, v6.4s, v0.s[0]\n" + "fmla v6.4s, v18.4s, v0.s[1]\n" + "scvtf v12.4s, v12.4s\n" + "fmul v5.4s, v5.4s, v0.s[0]\n" + "fmla v5.4s, v12.4s, v0.s[1]\n" + "scvtf v17.4s, v17.4s\n" + "fmul v4.4s, v4.4s, v0.s[0]\n" + "fmla v4.4s, v17.4s, v0.s[1]\n" + "scvtf v10.4s, v10.4s\n" + "fmul v3.4s, v3.4s, v0.s[0]\n" + "fmla v3.4s, v10.4s, v0.s[1]\n" + "scvtf v16.4s, v16.4s\n" + "fmul v2.4s, v2.4s, v0.s[0]\n" + "fmla v2.4s, v16.4s, v0.s[1]\n" + "scvtf v9.4s, v9.4s\n" + "fmul v1.4s, v1.4s, v0.s[0]\n" + "fmla v1.4s, v9.4s, v0.s[1]\n" + "cbz %x[out_direct], 3f\n" + "fmul v23.4s, v8.4s, v0.s[3]\n" + "fmul v22.4s, v7.4s, v0.s[3]\n" + "ldr w20, [%x[args_ptr], %[offsetof_out_direct_zeropt]]\n" + "fmul v21.4s, v6.4s, v0.s[3]\n" + "fmul v20.4s, v5.4s, v0.s[3]\n" + "fmul v17.4s, v4.4s, v0.s[3]\n" + "fmul v19.4s, v3.4s, v0.s[3]\n" + "fmul v16.4s, v2.4s, v0.s[3]\n" + "fmul v18.4s, v1.4s, v0.s[3]\n" + "fcvtas v23.4s, v23.4s\n" + "fcvtas v22.4s, v22.4s\n" + "fcvtas v21.4s, v21.4s\n" + "fcvtas v20.4s, v20.4s\n" + "fcvtas v17.4s, v17.4s\n" + "fcvtas v19.4s, v19.4s\n" + "fcvtas v16.4s, v16.4s\n" + "fcvtas v18.4s, v18.4s\n" + "uzp1 v22.8h, v23.8h, v22.8h\n" + "uzp1 v20.8h, v21.8h, v20.8h\n" + "uzp1 v19.8h, v17.8h, v19.8h\n" + "uzp1 v18.8h, v16.8h, v18.8h\n" + "dup v16.8h, w20\n" + "add v22.8h, v22.8h, v16.8h\n" + "add v20.8h, v20.8h, v16.8h\n" + "add v19.8h, v19.8h, v16.8h\n" + "add v18.8h, v18.8h, v16.8h\n" + "movi v17.8h, #0x7f\n" + "mvni v16.8h, #0x7f\n" + "smin v22.8h, v22.8h, v17.8h\n" + "smin v20.8h, v20.8h, v17.8h\n" + "smin v19.8h, 
v19.8h, v17.8h\n" + "smin v18.8h, v18.8h, v17.8h\n" + "smax v22.8h, v22.8h, v16.8h\n" + "smax v20.8h, v20.8h, v16.8h\n" + "smax v19.8h, v19.8h, v16.8h\n" + "smax v18.8h, v18.8h, v16.8h\n" + "xtn v22.8b, v22.8h\n" + "str d22, [x25, #0x0]\n" + "xtn v20.8b, v20.8h\n" + "xtn v19.8b, v19.8h\n" + "str d20, [x25, #0x8]\n" + "xtn v18.8b, v18.8h\n" + "str d19, [x22, #0x0]\n" + "str d18, [x22, #0x8]\n" + "3:" // Main loop: No direct output + "mov v19.16b, v28.16b\n" + "mov v13.16b, v29.16b\n" + "fmla v19.4s, v8.4s, v24.4s\n" + "ldr w22, [%x[args_ptr], %[offsetof_out_zeropt]]\n" + "mov v18.16b, v30.16b\n" + "mov v12.16b, v31.16b\n" + "fmla v13.4s, v7.4s, v25.4s\n" + "ldr w21, [%x[args_ptr], %[offsetof_maxval]]\n" + "mov v17.16b, v28.16b\n" + "mov v10.16b, v29.16b\n" + "fmla v18.4s, v6.4s, v26.4s\n" + "ldr w20, [%x[args_ptr], %[offsetof_minval]]\n" + "mov v16.16b, v30.16b\n" + "mov v9.16b, v31.16b\n" + "fmla v12.4s, v5.4s, v27.4s\n" + "subs x23, x23, #0x2\n" + "fmla v17.4s, v4.4s, v24.4s\n" + "fmla v10.4s, v3.4s, v25.4s\n" + "fmul v8.4s, v19.4s, v0.s[2]\n" + "fmla v16.4s, v2.4s, v26.4s\n" + "fmla v9.4s, v1.4s, v27.4s\n" + "fmul v7.4s, v13.4s, v0.s[2]\n" + "fmul v6.4s, v18.4s, v0.s[2]\n" + "fmul v5.4s, v12.4s, v0.s[2]\n" + "fmul v4.4s, v17.4s, v0.s[2]\n" + "fmul v3.4s, v10.4s, v0.s[2]\n" + "fmul v2.4s, v16.4s, v0.s[2]\n" + "fmul v1.4s, v9.4s, v0.s[2]\n" + "fcvtas v8.4s, v8.4s\n" + "fcvtas v7.4s, v7.4s\n" + "fcvtas v6.4s, v6.4s\n" + "fcvtas v5.4s, v5.4s\n" + "fcvtas v4.4s, v4.4s\n" + "fcvtas v3.4s, v3.4s\n" + "fcvtas v2.4s, v2.4s\n" + "fcvtas v1.4s, v1.4s\n" + "uzp1 v7.8h, v8.8h, v7.8h\n" + "uzp1 v5.8h, v6.8h, v5.8h\n" + "uzp1 v3.8h, v4.8h, v3.8h\n" + "uzp1 v1.8h, v2.8h, v1.8h\n" + "dup v16.8h, w22\n" + "add v7.8h, v7.8h, v16.8h\n" + "add v5.8h, v5.8h, v16.8h\n" + "add v3.8h, v3.8h, v16.8h\n" + "add v1.8h, v1.8h, v16.8h\n" + "dup v16.8h, w21\n" + "smin v7.8h, v7.8h, v16.8h\n" + "smin v5.8h, v5.8h, v16.8h\n" + "smin v3.8h, v3.8h, v16.8h\n" + "smin v1.8h, v1.8h, v16.8h\n" + "dup v16.8h, w20\n" + "smax v7.8h, v7.8h, v16.8h\n" + "smax v5.8h, v5.8h, v16.8h\n" + "smax v3.8h, v3.8h, v16.8h\n" + "smax v1.8h, v1.8h, v16.8h\n" + "xtn v7.8b, v7.8h\n" + "str d7, [x26, #0x0]\n" + "xtn v5.8b, v5.8h\n" + "xtn v3.8b, v3.8h\n" + "str d5, [x26, #0x8]\n" + "xtn v1.8b, v1.8h\n" + "str d3, [x24, #0x0]\n" + "str d1, [x24, #0x8]\n" + "bgt 2b\n" + "add %x[in0], %x[in0], #0x10\n" + "add %x[in1], %x[in1], #0x10\n" + "add %x[out], %x[out], #0x10\n" + "cbz %x[out_direct], 4f\n" + "add %x[out_direct], %x[out_direct], #0x10\n" + "4:" // No direct pointer update + "sub %x[width], %x[width], #0x10\n" + "cmp %x[width], #0x10\n" + "bge 1b\n" + "cbz %x[width], 32f\n" + "5:" // main loop skip + "ldr q24, [%x[bn_mul], #0x0]\n" + "ldr q25, [%x[bn_mul], #0x10]\n" + "mov x23, %x[height]\n" + "mov x12, %x[in0]\n" + "ldr q26, [%x[bn_mul], #0x20]\n" + "ldr q27, [%x[bn_mul], #0x30]\n" + "mov x11, %x[in1]\n" + "mov x10, %x[out]\n" + "ldr q28, [%x[bn_add], #0x0]\n" + "ldr q29, [%x[bn_add], #0x10]\n" + "mov x9, %x[out_direct]\n" + "add %x[bn_mul], %x[bn_mul], #0x40\n" + "ldr q30, [%x[bn_add], #0x20]\n" + "ldr q31, [%x[bn_add], #0x30]\n" + "add %x[bn_add], %x[bn_add], #0x40\n" + "6:" // tail loop: Row loop + "mov x28, x12\n" + "mov x27, x11\n" + "mov x26, x10\n" + "mov x25, x9\n" + "add x21, x28, %x[in0_stride]\n" + "add x20, x27, %x[in1_stride]\n" + "add x24, x26, %x[out_stride]\n" + "add x22, x25, %x[out_direct_stride]\n" + "cmp x23, #0x2\n" + "add x12, x21, %x[in0_stride]\n" + "add x11, x20, %x[in1_stride]\n" + "add x10, x24, %x[out_stride]\n" + 
"add x9, x22, %x[out_direct_stride]\n" + "csel x21, x21, x28, GE\n" + "csel x20, x20, x27, GE\n" + "csel x24, x24, x26, GE\n" + "csel x22, x22, x25, GE\n" + "tbz %x[width], #3, 10f\n" + "ldr d4, [x28, #0x0]\n" + "ldr d13, [x27, #0x0]\n" + "add x28, x28, #0x8\n" + "add x27, x27, #0x8\n" + "ldr d2, [x21, #0x0]\n" + "ldr d10, [x20, #0x0]\n" + "add x21, x21, #0x8\n" + "add x20, x20, #0x8\n" + "tbz %x[width], #2, 8f\n" + "ldr s3, [x28], #0x4\n" + "ldr s12, [x27], #0x4\n" + "ldr s11, [x21], #0x4\n" + "ldr s9, [x20], #0x4\n" + "tbz %x[width], #1, 7f\n" + "ld1 { v3.h }[2], [x28], #0x2\n" + "ld1 { v12.h }[2], [x27], #0x2\n" + "ld1 { v11.h }[2], [x21], #0x2\n" + "ld1 { v9.h }[2], [x20], #0x2\n" + "tbz %x[width], #0, 14f\n" + "ld1 { v3.b }[6], [x28], #0x1\n" + "ld1 { v12.b }[6], [x27], #0x1\n" + "ld1 { v11.b }[6], [x21], #0x1\n" + "ld1 { v9.b }[6], [x20], #0x1\n" + "b 14f\n" + "7:" // tail loop: unique 1: partial_0_12 + "tbz %x[width], #0, 14f\n" + "ld1 { v3.b }[4], [x28], #0x1\n" + "ld1 { v12.b }[4], [x27], #0x1\n" + "ld1 { v11.b }[4], [x21], #0x1\n" + "ld1 { v9.b }[4], [x20], #0x1\n" + "b 14f\n" + "8:" // tail loop: unique 1: partial_1_8 + "tbz %x[width], #1, 9f\n" + "ldr h3, [x28], #0x2\n" + "ldr h12, [x27], #0x2\n" + "ldr h11, [x21], #0x2\n" + "ldr h9, [x20], #0x2\n" + "tbz %x[width], #0, 14f\n" + "ld1 { v3.b }[2], [x28], #0x1\n" + "ld1 { v12.b }[2], [x27], #0x1\n" + "ld1 { v11.b }[2], [x21], #0x1\n" + "ld1 { v9.b }[2], [x20], #0x1\n" + "b 14f\n" + "9:" // tail loop: unique 1: partial_0_8 + "tbz %x[width], #0, 14f\n" + "ldr b3, [x28], #0x1\n" + "ldr b12, [x27], #0x1\n" + "ldr b11, [x21], #0x1\n" + "ldr b9, [x20], #0x1\n" + "b 14f\n" + "10:" // tail loop: unique 1: partial_2_0 + "tbz %x[width], #2, 12f\n" + "ldr s4, [x28], #0x4\n" + "ldr s13, [x27], #0x4\n" + "ldr s2, [x21], #0x4\n" + "ldr s10, [x20], #0x4\n" + "tbz %x[width], #1, 11f\n" + "ld1 { v4.h }[2], [x28], #0x2\n" + "ld1 { v13.h }[2], [x27], #0x2\n" + "ld1 { v2.h }[2], [x21], #0x2\n" + "ld1 { v10.h }[2], [x20], #0x2\n" + "tbz %x[width], #0, 14f\n" + "ld1 { v4.b }[6], [x28], #0x1\n" + "ld1 { v13.b }[6], [x27], #0x1\n" + "ld1 { v2.b }[6], [x21], #0x1\n" + "ld1 { v10.b }[6], [x20], #0x1\n" + "b 14f\n" + "11:" // tail loop: unique 1: partial_0_4 + "tbz %x[width], #0, 14f\n" + "ld1 { v4.b }[4], [x28], #0x1\n" + "ld1 { v13.b }[4], [x27], #0x1\n" + "ld1 { v2.b }[4], [x21], #0x1\n" + "ld1 { v10.b }[4], [x20], #0x1\n" + "b 14f\n" + "12:" // tail loop: unique 1: partial_1_0 + "tbz %x[width], #1, 13f\n" + "ldr h4, [x28], #0x2\n" + "ldr h13, [x27], #0x2\n" + "ldr h2, [x21], #0x2\n" + "ldr h10, [x20], #0x2\n" + "tbz %x[width], #0, 14f\n" + "ld1 { v4.b }[2], [x28], #0x1\n" + "ld1 { v13.b }[2], [x27], #0x1\n" + "ld1 { v2.b }[2], [x21], #0x1\n" + "ld1 { v10.b }[2], [x20], #0x1\n" + "b 14f\n" + "13:" // tail loop: unique 1: partial_0_0 + "ldr b4, [x28], #0x1\n" + "ldr b13, [x27], #0x1\n" + "ldr b2, [x21], #0x1\n" + "ldr b10, [x20], #0x1\n" + "14:" // tail loop: unique 1: Done + "ldr w21, [%x[args_ptr], %[offsetof_in0_zeropt]]\n" + "sshll v4.8h, v4.8b, #0x0\n" + "sshll v3.8h, v3.8b, #0x0\n" + "ldr w20, [%x[args_ptr], %[offsetof_in1_zeropt]]\n" + "sshll v2.8h, v2.8b, #0x0\n" + "sshll v11.8h, v11.8b, #0x0\n" + "dup v16.8h, w21\n" + "sshll v13.8h, v13.8b, #0x0\n" + "sshll v12.8h, v12.8b, #0x0\n" + "sshll v10.8h, v10.8b, #0x0\n" + "sshll v9.8h, v9.8b, #0x0\n" + "ssubl v1.4s, v4.4h, v16.4h\n" + "ssubl2 v4.4s, v4.8h, v16.8h\n" + "ssubl v23.4s, v3.4h, v16.4h\n" + "ssubl2 v3.4s, v3.8h, v16.8h\n" + "ssubl v22.4s, v2.4h, v16.4h\n" + "ssubl2 v2.4s, v2.8h, v16.8h\n" + 
"ssubl v21.4s, v11.4h, v16.4h\n" + "ssubl2 v11.4s, v11.8h, v16.8h\n" + "dup v20.8h, w20\n" + "ssubl v19.4s, v13.4h, v20.4h\n" + "ssubl2 v13.4s, v13.8h, v20.8h\n" + "ssubl v18.4s, v12.4h, v20.4h\n" + "ssubl2 v12.4s, v12.8h, v20.8h\n" + "ssubl v17.4s, v10.4h, v20.4h\n" + "ssubl2 v10.4s, v10.8h, v20.8h\n" + "ssubl v16.4s, v9.4h, v20.4h\n" + "ssubl2 v9.4s, v9.8h, v20.8h\n" + "scvtf v8.4s, v1.4s\n" + "scvtf v7.4s, v4.4s\n" + "scvtf v6.4s, v23.4s\n" + "scvtf v5.4s, v3.4s\n" + "scvtf v4.4s, v22.4s\n" + "scvtf v3.4s, v2.4s\n" + "scvtf v2.4s, v21.4s\n" + "scvtf v1.4s, v11.4s\n" + "scvtf v19.4s, v19.4s\n" + "fmul v8.4s, v8.4s, v0.s[0]\n" + "fmla v8.4s, v19.4s, v0.s[1]\n" + "scvtf v13.4s, v13.4s\n" + "fmul v7.4s, v7.4s, v0.s[0]\n" + "fmla v7.4s, v13.4s, v0.s[1]\n" + "scvtf v18.4s, v18.4s\n" + "fmul v6.4s, v6.4s, v0.s[0]\n" + "fmla v6.4s, v18.4s, v0.s[1]\n" + "scvtf v12.4s, v12.4s\n" + "fmul v5.4s, v5.4s, v0.s[0]\n" + "fmla v5.4s, v12.4s, v0.s[1]\n" + "scvtf v17.4s, v17.4s\n" + "fmul v4.4s, v4.4s, v0.s[0]\n" + "fmla v4.4s, v17.4s, v0.s[1]\n" + "scvtf v10.4s, v10.4s\n" + "fmul v3.4s, v3.4s, v0.s[0]\n" + "fmla v3.4s, v10.4s, v0.s[1]\n" + "scvtf v16.4s, v16.4s\n" + "fmul v2.4s, v2.4s, v0.s[0]\n" + "fmla v2.4s, v16.4s, v0.s[1]\n" + "scvtf v9.4s, v9.4s\n" + "fmul v1.4s, v1.4s, v0.s[0]\n" + "fmla v1.4s, v9.4s, v0.s[1]\n" + "cbz %x[out_direct], 23f\n" + "fmul v23.4s, v8.4s, v0.s[3]\n" + "fmul v22.4s, v7.4s, v0.s[3]\n" + "ldr w20, [%x[args_ptr], %[offsetof_out_direct_zeropt]]\n" + "fmul v21.4s, v6.4s, v0.s[3]\n" + "fmul v20.4s, v5.4s, v0.s[3]\n" + "fmul v17.4s, v4.4s, v0.s[3]\n" + "fmul v19.4s, v3.4s, v0.s[3]\n" + "fmul v16.4s, v2.4s, v0.s[3]\n" + "fmul v18.4s, v1.4s, v0.s[3]\n" + "fcvtas v23.4s, v23.4s\n" + "fcvtas v22.4s, v22.4s\n" + "fcvtas v21.4s, v21.4s\n" + "fcvtas v20.4s, v20.4s\n" + "fcvtas v17.4s, v17.4s\n" + "fcvtas v19.4s, v19.4s\n" + "fcvtas v16.4s, v16.4s\n" + "fcvtas v18.4s, v18.4s\n" + "uzp1 v22.8h, v23.8h, v22.8h\n" + "uzp1 v20.8h, v21.8h, v20.8h\n" + "uzp1 v19.8h, v17.8h, v19.8h\n" + "uzp1 v18.8h, v16.8h, v18.8h\n" + "dup v16.8h, w20\n" + "add v22.8h, v22.8h, v16.8h\n" + "add v20.8h, v20.8h, v16.8h\n" + "add v19.8h, v19.8h, v16.8h\n" + "add v18.8h, v18.8h, v16.8h\n" + "movi v17.8h, #0x7f\n" + "mvni v16.8h, #0x7f\n" + "smin v22.8h, v22.8h, v17.8h\n" + "smin v20.8h, v20.8h, v17.8h\n" + "smin v19.8h, v19.8h, v17.8h\n" + "smin v18.8h, v18.8h, v17.8h\n" + "smax v22.8h, v22.8h, v16.8h\n" + "smax v20.8h, v20.8h, v16.8h\n" + "smax v19.8h, v19.8h, v16.8h\n" + "smax v18.8h, v18.8h, v16.8h\n" + "xtn v22.8b, v22.8h\n" + "xtn v20.8b, v20.8h\n" + "xtn v19.8b, v19.8h\n" + "xtn v18.8b, v18.8h\n" + "tbz %x[width], #3, 18f\n" + "str d22, [x25, #0x0]\n" + "add x25, x25, #0x8\n" + "str d19, [x22, #0x0]\n" + "add x22, x22, #0x8\n" + "tbz %x[width], #2, 16f\n" + "str s20, [x25], #0x4\n" + "str s18, [x22], #0x4\n" + "tbz %x[width], #1, 15f\n" + "st1 { v20.h }[2], [x25], #0x2\n" + "st1 { v18.h }[2], [x22], #0x2\n" + "tbz %x[width], #0, 22f\n" + "st1 { v20.b }[6], [x25], #0x1\n" + "st1 { v18.b }[6], [x22], #0x1\n" + "b 22f\n" + "15:" // tail loop: Main loop: unique 2: partial_0_12 + "tbz %x[width], #0, 22f\n" + "st1 { v20.b }[4], [x25], #0x1\n" + "st1 { v18.b }[4], [x22], #0x1\n" + "b 22f\n" + "16:" // tail loop: Main loop: unique 2: partial_1_8 + "tbz %x[width], #1, 17f\n" + "str h20, [x25], #0x2\n" + "str h18, [x22], #0x2\n" + "tbz %x[width], #0, 22f\n" + "st1 { v20.b }[2], [x25], #0x1\n" + "st1 { v18.b }[2], [x22], #0x1\n" + "b 22f\n" + "17:" // tail loop: Main loop: unique 2: partial_0_8 + "tbz %x[width], #0, 
22f\n" + "str b20, [x25], #0x1\n" + "str b18, [x22], #0x1\n" + "b 22f\n" + "18:" // tail loop: Main loop: unique 2: partial_2_0 + "tbz %x[width], #2, 20f\n" + "str s22, [x25], #0x4\n" + "str s19, [x22], #0x4\n" + "tbz %x[width], #1, 19f\n" + "st1 { v22.h }[2], [x25], #0x2\n" + "st1 { v19.h }[2], [x22], #0x2\n" + "tbz %x[width], #0, 22f\n" + "st1 { v22.b }[6], [x25], #0x1\n" + "st1 { v19.b }[6], [x22], #0x1\n" + "b 22f\n" + "19:" // tail loop: Main loop: unique 2: partial_0_4 + "tbz %x[width], #0, 22f\n" + "st1 { v22.b }[4], [x25], #0x1\n" + "st1 { v19.b }[4], [x22], #0x1\n" + "b 22f\n" + "20:" // tail loop: Main loop: unique 2: partial_1_0 + "tbz %x[width], #1, 21f\n" + "str h22, [x25], #0x2\n" + "str h19, [x22], #0x2\n" + "tbz %x[width], #0, 22f\n" + "st1 { v22.b }[2], [x25], #0x1\n" + "st1 { v19.b }[2], [x22], #0x1\n" + "b 22f\n" + "21:" // tail loop: Main loop: unique 2: partial_0_0 + "str b22, [x25], #0x1\n" + "str b19, [x22], #0x1\n" + "22:" // tail loop: Main loop: unique 2: Done + "23:" // tail loop: Main loop: No direct output + "mov v19.16b, v28.16b\n" + "mov v13.16b, v29.16b\n" + "fmla v19.4s, v8.4s, v24.4s\n" + "ldr w22, [%x[args_ptr], %[offsetof_out_zeropt]]\n" + "mov v18.16b, v30.16b\n" + "mov v12.16b, v31.16b\n" + "fmla v13.4s, v7.4s, v25.4s\n" + "ldr w21, [%x[args_ptr], %[offsetof_maxval]]\n" + "mov v17.16b, v28.16b\n" + "mov v10.16b, v29.16b\n" + "fmla v18.4s, v6.4s, v26.4s\n" + "ldr w20, [%x[args_ptr], %[offsetof_minval]]\n" + "mov v16.16b, v30.16b\n" + "mov v9.16b, v31.16b\n" + "fmla v12.4s, v5.4s, v27.4s\n" + "fmla v17.4s, v4.4s, v24.4s\n" + "fmla v10.4s, v3.4s, v25.4s\n" + "fmul v8.4s, v19.4s, v0.s[2]\n" + "fmla v16.4s, v2.4s, v26.4s\n" + "fmla v9.4s, v1.4s, v27.4s\n" + "fmul v7.4s, v13.4s, v0.s[2]\n" + "fmul v6.4s, v18.4s, v0.s[2]\n" + "fmul v5.4s, v12.4s, v0.s[2]\n" + "fmul v4.4s, v17.4s, v0.s[2]\n" + "fmul v3.4s, v10.4s, v0.s[2]\n" + "fmul v2.4s, v16.4s, v0.s[2]\n" + "fmul v1.4s, v9.4s, v0.s[2]\n" + "fcvtas v8.4s, v8.4s\n" + "fcvtas v7.4s, v7.4s\n" + "fcvtas v6.4s, v6.4s\n" + "fcvtas v5.4s, v5.4s\n" + "fcvtas v4.4s, v4.4s\n" + "fcvtas v3.4s, v3.4s\n" + "fcvtas v2.4s, v2.4s\n" + "fcvtas v1.4s, v1.4s\n" + "uzp1 v7.8h, v8.8h, v7.8h\n" + "uzp1 v5.8h, v6.8h, v5.8h\n" + "uzp1 v3.8h, v4.8h, v3.8h\n" + "uzp1 v1.8h, v2.8h, v1.8h\n" + "dup v16.8h, w22\n" + "add v7.8h, v7.8h, v16.8h\n" + "add v5.8h, v5.8h, v16.8h\n" + "add v3.8h, v3.8h, v16.8h\n" + "add v1.8h, v1.8h, v16.8h\n" + "dup v16.8h, w21\n" + "smin v7.8h, v7.8h, v16.8h\n" + "smin v5.8h, v5.8h, v16.8h\n" + "smin v3.8h, v3.8h, v16.8h\n" + "smin v1.8h, v1.8h, v16.8h\n" + "dup v16.8h, w20\n" + "smax v7.8h, v7.8h, v16.8h\n" + "smax v5.8h, v5.8h, v16.8h\n" + "smax v3.8h, v3.8h, v16.8h\n" + "smax v1.8h, v1.8h, v16.8h\n" + "xtn v7.8b, v7.8h\n" + "xtn v5.8b, v5.8h\n" + "xtn v3.8b, v3.8h\n" + "xtn v1.8b, v1.8h\n" + "tbz %x[width], #3, 27f\n" + "str d7, [x26, #0x0]\n" + "add x26, x26, #0x8\n" + "str d3, [x24, #0x0]\n" + "add x24, x24, #0x8\n" + "tbz %x[width], #2, 25f\n" + "str s5, [x26], #0x4\n" + "str s1, [x24], #0x4\n" + "tbz %x[width], #1, 24f\n" + "st1 { v5.h }[2], [x26], #0x2\n" + "st1 { v1.h }[2], [x24], #0x2\n" + "tbz %x[width], #0, 31f\n" + "st1 { v5.b }[6], [x26], #0x1\n" + "st1 { v1.b }[6], [x24], #0x1\n" + "b 31f\n" + "24:" // tail loop: unique 3: partial_0_12 + "tbz %x[width], #0, 31f\n" + "st1 { v5.b }[4], [x26], #0x1\n" + "st1 { v1.b }[4], [x24], #0x1\n" + "b 31f\n" + "25:" // tail loop: unique 3: partial_1_8 + "tbz %x[width], #1, 26f\n" + "str h5, [x26], #0x2\n" + "str h1, [x24], #0x2\n" + "tbz %x[width], #0, 
31f\n" + "st1 { v5.b }[2], [x26], #0x1\n" + "st1 { v1.b }[2], [x24], #0x1\n" + "b 31f\n" + "26:" // tail loop: unique 3: partial_0_8 + "tbz %x[width], #0, 31f\n" + "str b5, [x26], #0x1\n" + "str b1, [x24], #0x1\n" + "b 31f\n" + "27:" // tail loop: unique 3: partial_2_0 + "tbz %x[width], #2, 29f\n" + "str s7, [x26], #0x4\n" + "str s3, [x24], #0x4\n" + "tbz %x[width], #1, 28f\n" + "st1 { v7.h }[2], [x26], #0x2\n" + "st1 { v3.h }[2], [x24], #0x2\n" + "tbz %x[width], #0, 31f\n" + "st1 { v7.b }[6], [x26], #0x1\n" + "st1 { v3.b }[6], [x24], #0x1\n" + "b 31f\n" + "28:" // tail loop: unique 3: partial_0_4 + "tbz %x[width], #0, 31f\n" + "st1 { v7.b }[4], [x26], #0x1\n" + "st1 { v3.b }[4], [x24], #0x1\n" + "b 31f\n" + "29:" // tail loop: unique 3: partial_1_0 + "tbz %x[width], #1, 30f\n" + "str h7, [x26], #0x2\n" + "str h3, [x24], #0x2\n" + "tbz %x[width], #0, 31f\n" + "st1 { v7.b }[2], [x26], #0x1\n" + "st1 { v3.b }[2], [x24], #0x1\n" + "b 31f\n" + "30:" // tail loop: unique 3: partial_0_0 + "str b7, [x26], #0x1\n" + "str b3, [x24], #0x1\n" + "31:" // tail loop: unique 3: Done + "subs x23, x23, #0x2\n" + "bgt 6b\n" + "32:" // odd columns skip + : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out), + [out_direct] "+&r"(out_direct), [width] "+&r"(width) + : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride), + [offsetof_in0_zeropt] "I"(offsetof(KernelArgs, in0_zeropt)), + [offsetof_in1_zeropt] "I"(offsetof(KernelArgs, in1_zeropt)), + [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)), + [offsetof_out_direct_zeropt] "I"(offsetof(KernelArgs, out_direct_zeropt)), + [offsetof_out_zeropt] "I"(offsetof(KernelArgs, out_zeropt)), + [offsetof_scales] "I"(offsetof(KernelArgs, scales)), [out_direct_stride] "r"(out_direct_stride), + [out_stride] "r"(out_stride) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", + "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"); +} + +} // namespace + +namespace arm_compute +{ +namespace cpu +{ +void add_mul_add_s8_neon(const ITensor *input1, + const ITensor *input2, + const ITensor *bn_mul, + const ITensor *bn_add, + ITensor *add_output, + ITensor *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info, + const Window &window) +{ + ARM_COMPUTE_UNUSED(policy); + + const ITensorInfo *final_output_info = final_output->info(); + const ITensorInfo *add_output_info = (add_output != nullptr) ? add_output->info() : nullptr; + const ITensorInfo *input1_info = input1->info(); + const ITensorInfo *input2_info = input2->info(); + + const size_t out_stride = final_output_info->strides_in_bytes()[1]; + const size_t out_direct_stride = (add_output != nullptr) ? 
add_output_info->strides_in_bytes()[1] : 0; + const size_t in0_stride = input1_info->strides_in_bytes()[1]; + const size_t in1_stride = input2_info->strides_in_bytes()[1]; + + int8_t minval = std::numeric_limits<int8_t>::lowest(); + int8_t maxval = std::numeric_limits<int8_t>::max(); + + const UniformQuantizationInfo final_output_qinfo = final_output_info->quantization_info().uniform(); + if (act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU) + { + minval = quantize_qasymm8_signed(0.f, final_output_qinfo); + } + else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) + { + minval = quantize_qasymm8_signed(0.f, final_output_qinfo); + maxval = quantize_qasymm8_signed(act_info.a(), final_output_qinfo); + } + else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + { + minval = quantize_qasymm8_signed(act_info.b(), final_output_qinfo); + maxval = quantize_qasymm8_signed(act_info.a(), final_output_qinfo); + } + + const UniformQuantizationInfo in1_qinfo = input1_info->quantization_info().uniform(); + const UniformQuantizationInfo in2_qinfo = input2_info->quantization_info().uniform(); + const UniformQuantizationInfo add_output_qinfo = + (add_output != nullptr) ? add_output_info->quantization_info().uniform() : UniformQuantizationInfo(); + + const int32_t in1_offset = in1_qinfo.offset; + const int32_t in2_offset = in2_qinfo.offset; + const int32_t out_offset = final_output_qinfo.offset; + const int32_t out_direct_offset = add_output_qinfo.offset; + + const float in1_scale = in1_qinfo.scale; + const float in2_scale = in2_qinfo.scale; + const float out_scale = final_output_qinfo.scale; + const float out_direct_scale = add_output_qinfo.scale; + + const float *bn_mul_buffer = reinterpret_cast<float *>(bn_mul->buffer()); + const float *bn_add_buffer = reinterpret_cast<float *>(bn_add->buffer()); + + // Clear X & Y dimensions on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + win.set(Window::DimY, Window::Dimension(0, 1, 1)); + + Iterator in1_it(input1, window); + Iterator in2_it(input2, window); + Iterator out_it(final_output, window); + + const size_t width = window.num_iterations(0); + const size_t height = window.num_iterations(1); + + if (add_output != nullptr) + { + Iterator add_out_it(add_output, window); + execute_window_loop( + win, + [&](const Coordinates &) + { + a64_add_bn_clamp_direct_s8_fp32_2x16( + reinterpret_cast<int8_t *>(out_it.ptr()), out_stride, reinterpret_cast<int8_t *>(add_out_it.ptr()), + out_direct_stride, reinterpret_cast<int8_t *>(in1_it.ptr()), in0_stride, + reinterpret_cast<int8_t *>(in2_it.ptr()), in1_stride, bn_mul_buffer, bn_add_buffer, minval, maxval, + out_offset, out_scale, out_direct_offset, out_direct_scale, in1_offset, in1_scale, in2_offset, + in2_scale, width, height); + }, + in1_it, in2_it, add_out_it, out_it); + } + else + { + execute_window_loop( + win, + [&](const Coordinates &) + { + a64_add_bn_clamp_direct_s8_fp32_2x16( + reinterpret_cast<int8_t *>(out_it.ptr()), out_stride, nullptr, out_direct_stride, + reinterpret_cast<int8_t *>(in1_it.ptr()), in0_stride, reinterpret_cast<int8_t *>(in2_it.ptr()), + in1_stride, bn_mul_buffer, bn_add_buffer, minval, maxval, out_offset, out_scale, out_direct_offset, + out_direct_scale, in1_offset, in1_scale, in2_offset, in2_scale, width, height); + }, + in1_it, in2_it, out_it); + } +} +} // namespace cpu +} // namespace arm_compute + +#endif // __aarch64__ diff --git 
a/src/cpu/kernels/addmuladd/list.h b/src/cpu/kernels/addmuladd/list.h new file mode 100644 index 0000000000..568003a916 --- /dev/null +++ b/src/cpu/kernels/addmuladd/list.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CPU_KERNELS_ADDMULADD_LIST +#define SRC_CPU_KERNELS_ADDMULADD_LIST + +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Window.h" + +namespace arm_compute +{ +namespace cpu +{ +#define DECLARE_ADD_MUL_ADD_KERNEL(func_name) \ + void func_name(const ITensor *input1, const ITensor *input2, const ITensor *bn_mul, const ITensor *bn_add, \ + ITensor *add_output, ITensor *final_output, ConvertPolicy policy, \ + const ActivationLayerInfo &act_info, const Window &window) + +DECLARE_ADD_MUL_ADD_KERNEL(add_mul_add_fp32_neon); +DECLARE_ADD_MUL_ADD_KERNEL(add_mul_add_fp16_neon); +DECLARE_ADD_MUL_ADD_KERNEL(add_mul_add_u8_neon); +DECLARE_ADD_MUL_ADD_KERNEL(add_mul_add_s8_neon); + +#undef DECLARE_ADD_MUL_ADD_KERNEL + +} // namespace cpu +} // namespace arm_compute +#endif /* SRC_CPU_KERNELS_ADDMULADD_LIST */ diff --git a/src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h b/src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h new file mode 100644 index 0000000000..6e8f32ef47 --- /dev/null +++ b/src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2018-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_ASSEMBLY_GEMM_KERNEL_WRAPPER_KERNEL_H +#define ARM_COMPUTE_ASSEMBLY_GEMM_KERNEL_WRAPPER_KERNEL_H + +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" + +#include "src/core/NEON/INEKernel.h" +#include "src/cpu/kernels/assembly/arm_gemm_compute_iface.hpp" + +#include "gemm_common.hpp" + +namespace arm_compute +{ +class ITensor; + +namespace cpu +{ +namespace kernel +{ +/** This class is a wrapper for the assembly kernels. + * + * Some kernels were written in assembly and highly optimised for specific CPUs like A53 or A55. + * This class works as a wrapper for these assembly kernels. The arm compute library creates an instance + * of CpuGemmAssemblyWrapperKernel and other auxiliary data structures to execute a single assembly kernel + * in the context of an NEFunctions. + * + * The type T is the type of the actual kernel implemented in assembly which is of type + * template<typename To, typename Tr> class GemmCommon + * + * + */ +template <typename TypeInput, typename TypeOutput> +class CpuGemmAssemblyWrapperKernel final : public INEKernel +{ +public: + /** Constructor + */ + CpuGemmAssemblyWrapperKernel() : _kernel(nullptr), _name("CpuGemmAssemblyWrapperKernel") + { + } + + CpuGemmAssemblyWrapperKernel(CpuGemmAssemblyWrapperKernel &) = delete; + CpuGemmAssemblyWrapperKernel(CpuGemmAssemblyWrapperKernel &&) = default; + CpuGemmAssemblyWrapperKernel &operator=(CpuGemmAssemblyWrapperKernel &) = delete; + + const char *name() const override + { + return _name.c_str(); + } + + void run(const Window &window, const ThreadInfo &info) override + { + ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast<void *>(_kernel))); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + + auto win = arm_gemm::to_ndcoord(window); + + arm_gemm::ndcoord_t thread_locator{}; + + _kernel->execute(win, thread_locator, info.thread_id); + } + + // Inherited methods overridden: + void run_nd(const Window &window, const ThreadInfo &info, const Window &thread_locator) override + { + ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast<void *>(_kernel))); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + + //convert between arm_compute and arm_gemm types + auto ndc_win = arm_gemm::to_ndcoord(window); + auto ndc_tlc = arm_gemm::to_ndcoord(thread_locator); + + _kernel->execute(ndc_win, ndc_tlc, info.thread_id); + } + + /** Initialise the kernel's input and output. + * + * @param[in] kernel Pointer to an assembly kernel implementation. + * @param[in] kernel_name_tag Tag to be attacehd to the kernel's name. + */ + void configure(arm_gemm::GemmCommon<TypeInput, TypeOutput> *kernel, std::string kernel_name_tag) + { + ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast<void *>(kernel))); + _kernel = kernel; + + Window win = to_window(kernel->get_window_size()); + + INEKernel::configure(win); + + if (!kernel_name_tag.empty()) + { + _name += "/" + kernel_name_tag; + } + } + /** Return minimum workload size of the relevant kernel + * + * @param[in] platform The CPU platform used to create the context. + * @param[in] thread_count Number of threads in the execution. + * + * @return[out] small_network_mws Minimum workload size for requested configuration. 
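+ *
+ * Note: as implemented below, this wrapper simply forwards ICPPKernel::default_mws and does not
+ * impose a kernel-specific minimum workload size.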
+ */ + size_t get_mws(const CPUInfo &platform, size_t thread_count) const override + { + ARM_COMPUTE_UNUSED(thread_count); + ARM_COMPUTE_UNUSED(platform); + + return ICPPKernel::default_mws; + } + +private: + arm_gemm::GemmCommon<TypeInput, TypeOutput> *_kernel; + std::string _name; +}; +} // namespace kernel +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_ASSEMBLY_GEMM_KERNEL_WRAPPER_KERNEL_H */ diff --git a/src/cpu/kernels/assembly/arm_gemm.hpp b/src/cpu/kernels/assembly/arm_gemm.hpp new file mode 100644 index 0000000000..941fed0ba8 --- /dev/null +++ b/src/cpu/kernels/assembly/arm_gemm.hpp @@ -0,0 +1,302 @@ +/* + * Copyright (c) 2018-2022, 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef ACL_SRC_CPU_KERNELS_ASSEMBLY_ARM_GEMM_HPP +#define ACL_SRC_CPU_KERNELS_ASSEMBLY_ARM_GEMM_HPP + +#pragma once + +#include "arm_gemm_local.hpp" +#include "gemm_common.hpp" +#include <cstring> +#include <memory> +#include <vector> + +namespace arm_gemm +{ +enum class GemmMethod +{ + DEFAULT, + GEMV_BATCHED, + GEMV_PRETRANSPOSED, + GEMV_NATIVE_TRANSPOSED, + GEMM_NATIVE, + GEMM_HYBRID, + GEMM_INTERLEAVED, + GEMM_INTERLEAVED_2D, + QUANTIZE_WRAPPER, + QUANTIZE_WRAPPER_2D, + GEMM_HYBRID_QUANTIZED +}; + +enum class WeightFormat +{ + UNSPECIFIED = 0x1, + ANY = 0x2, + OHWI = 0x100100, + OHWIo2 = 0x100200, + OHWIo4 = 0x100400, + OHWIo8 = 0x100800, + OHWIo16 = 0x101000, + OHWIo32 = 0x102000, + OHWIo64 = 0x104000, + OHWIo128 = 0x108000, + OHWIo4i2 = 0x200400, + OHWIo4i2_bf16 = 0x200410, + OHWIo8i2 = 0x200800, + OHWIo8i2_bf16 = 0x200810, + OHWIo16i2 = 0x201000, + OHWIo16i2_bf16 = 0x201010, + OHWIo32i2 = 0x202000, + OHWIo32i2_bf16 = 0x202010, + OHWIo64i2 = 0x204000, + OHWIo64i2_bf16 = 0x204010, + OHWIo4i4 = 0x400400, + OHWIo4i4_bf16 = 0x400410, + OHWIo8i4 = 0x400800, + OHWIo8i4_bf16 = 0x400810, + OHWIo16i4 = 0x401000, + OHWIo16i4_bf16 = 0x401010, + OHWIo32i4 = 0x402000, + OHWIo32i4_bf16 = 0x402010, + OHWIo64i4 = 0x404000, + OHWIo64i4_bf16 = 0x404010, + OHWIo2i8 = 0x800200, + OHWIo4i8 = 0x800400, + OHWIo8i8 = 0x800800, + OHWIo16i8 = 0x801000, + OHWIo32i8 = 0x802000, + OHWIo64i8 = 0x804000 +}; + +struct KernelDescription +{ + GemmMethod method = GemmMethod::DEFAULT; + std::string name = ""; + bool is_default = false; + uint64_t cycle_estimate = 0; + + KernelDescription(GemmMethod m, std::string n, bool d = false, uint64_t c = 0) + : method(m), name(n), is_default(d), cycle_estimate(c) + { + } + KernelDescription() noexcept + { + } +}; + +struct GemmConfig +{ + GemmMethod method = GemmMethod::DEFAULT; + std::string filter = ""; + unsigned int inner_block_size = 0; + unsigned int outer_block_size = 0; + WeightFormat weight_format = WeightFormat::ANY; + + GemmConfig(GemmMethod method) : method(method) + { + } + GemmConfig() + { + } +}; + +struct Activation +{ + enum class Type + { + None, + ReLU, + BoundedReLU + }; + + Type type; + float param1; + float param2; + + Activation(Type type = Type::None, float p1 = 0.0f, float p2 = 0.0f) : type(type), param1(p1), param2(p2) + { + } +}; + +struct GemmArgs +{ +public: + const CPUInfo *_ci; + unsigned int _Msize; // num of tiles + unsigned int _Nsize; // output channels + unsigned int _Ksize; // input channels + unsigned int _Ksections; + unsigned int _nbatches; + unsigned int _nmulti; // n_gemms to be performed + bool _indirect_input; + Activation _act; + int _maxthreads; + bool _fixed_format; + bool _fast_mode; + bool _accumulate; + const GemmConfig *_cfg; + + GemmArgs(const CPUInfo *ci, + unsigned int M, + unsigned int N, + unsigned int K, + unsigned int Ksections, + unsigned int nbatches, + unsigned int nmulti, + bool indirect_input, + Activation act, + const int maxthreads, + bool fixed_format = false, + bool fast_mode = false, + bool accumulate = false, + const GemmConfig *cfg = nullptr) + : _ci(ci), + _Msize(M), + _Nsize(N), + _Ksize(K), + _Ksections(Ksections), + _nbatches(nbatches), + _nmulti(nmulti), + _indirect_input(indirect_input), + _act(act), + _maxthreads(maxthreads), + _fixed_format(fixed_format), + _fast_mode(fast_mode), + _accumulate(accumulate), + _cfg(cfg) + { + } +}; + +struct Requantize32 +{ +public: + const int32_t *bias = nullptr; + size_t bias_multi_stride = 0; + int32_t a_offset = 0; + int32_t b_offset = 0; + int32_t c_offset 
= 0; + bool per_channel_requant = false; + int32_t per_layer_left_shift = 0; + int32_t per_layer_right_shift = 0; + int32_t per_layer_mul = 0; + const int32_t *per_channel_left_shifts = nullptr; + const int32_t *per_channel_right_shifts = nullptr; + const int32_t *per_channel_muls = nullptr; + int32_t minval = 0; + int32_t maxval = 0; + + Requantize32() = default; + + // Constructor for per-tensor quantization + Requantize32(const int32_t *bias, + size_t bias_multi_stride, + int32_t a_offset, + int32_t b_offset, + int32_t c_offset, + int32_t requant_shift, + int32_t requant_mul, + int32_t minv, + int32_t maxv) + : bias(bias), + bias_multi_stride(bias_multi_stride), + a_offset(a_offset), + b_offset(b_offset), + c_offset(c_offset), + per_channel_requant(false), + per_layer_left_shift(std::max<int32_t>(requant_shift, 0)), + per_layer_right_shift(std::min<int32_t>(requant_shift, 0)), + per_layer_mul(requant_mul), + minval(minv), + maxval(maxv) + { + } + + // Constructor for per-channel quantization + Requantize32(const int32_t *bias, + size_t bias_multi_stride, + int32_t a_offset, + int32_t b_offset, + int32_t c_offset, + const int32_t *requant_left_shifts, + const int32_t *requant_right_shifts, + const int32_t *requant_muls, + int32_t minv, + int32_t maxv) + : bias(bias), + bias_multi_stride(bias_multi_stride), + a_offset(a_offset), + b_offset(b_offset), + c_offset(c_offset), + per_channel_requant(true), + per_channel_left_shifts(requant_left_shifts), + per_channel_right_shifts(requant_right_shifts), + per_channel_muls(requant_muls), + minval(minv), + maxval(maxv) + { + } +}; + +struct DequantizeFloat +{ +public: + float scale = 0; + + DequantizeFloat() = default; + + // Constructor + DequantizeFloat(const float scale) : scale(scale) + { + } +}; + +struct Nothing +{ +}; + +template <typename Top, typename Tret> +using UniqueGemmCommon = std::unique_ptr<GemmCommon<Top, Tret>>; + +/* Low level API calls. + * These are implemented as 'GemmArgs' versions, or with the arguments explicitly listed. */ + +/* get_gemm_method(): Given the templated types and provided parameters, + * which is the preferred method to implement this GEMM? */ +template <typename Top, typename Tret, class OutputStage = Nothing> +KernelDescription get_gemm_method(const GemmArgs &args, const OutputStage & = {}); + +template <typename Top, typename Tret, class OutputStage = Nothing> +UniqueGemmCommon<Top, Tret> gemm(const GemmArgs &args, const OutputStage & = {}); + +template <typename Top, typename Tret, class OutputStage = Nothing> +std::vector<KernelDescription> get_compatible_kernels(const GemmArgs &args, const OutputStage & = {}); + +template <typename Top, typename Tret, class OutputStage = Nothing> +bool has_opt_gemm(WeightFormat &weight_format, const GemmArgs &args, const OutputStage & = {}); + +} // namespace arm_gemm + +#endif // ACL_SRC_CPU_KERNELS_ASSEMBLY_ARM_GEMM_HPP diff --git a/src/cpu/kernels/assembly/arm_gemm_compute_iface.hpp b/src/cpu/kernels/assembly/arm_gemm_compute_iface.hpp new file mode 100644 index 0000000000..0672e899b6 --- /dev/null +++ b/src/cpu/kernels/assembly/arm_gemm_compute_iface.hpp @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#include "arm_compute/core/Dimensions.h" +#include "arm_compute/core/Window.h" + +#include "ndrange.hpp" +#include <cassert> + +/* This file contains mapping between integral types used in arm_compute and arm_gemm + * These two codebases both require a degree of separation for the sake of modularity + * so maintain their own types which represent similar information. + */ + +namespace arm_gemm +{ +//we want to unify the maximum number of dimensions used beween arm_gemm and arm compute library +constexpr std::size_t ndrange_max = arm_compute::Dimensions<unsigned int>::num_max_dimensions; + +using ndrange_t = NDRange<ndrange_max>; +using ndcoord_t = NDCoordinate<ndrange_max>; + +/* Converts an `arm_gemm::ndrange_t` to a `arm_compute::Window` + * + * As `NDRange<T>` does not not encode start positions, we specify + * the start to be zero in the produced `arm_compute::Window` + * + * @param [ndr] the `arm_gemm::ndrange_t` we wish to convert into a `arm_compute::Window` + * @returns an `arm_compute::Window` representing the same dimensional ranges as `ndr` + */ +inline arm_compute::Window to_window(const ndrange_t &ndr) +{ + arm_compute::Window win; + + for (unsigned int i = 0; i != ndrange_max; ++i) + { + //populate the window with the dimensions of the NDRange + win.set(i, arm_compute::Window::Dimension(0, ndr.get_size(i))); + } + + return win; +} + +/* + * Converts an `arm_gemm::ndcoord_t` to a `arm_compute::Window` + * + * @param [ndc] the `arm_gemm::ndcoord_t` we wish to convert into a `arm_compute::Window` + * @returns an `arm_compute::Window` representing the same dimensional ranges as `ndc` + */ +inline arm_compute::Window to_window(const ndcoord_t &ndc) +{ + arm_compute::Window win; + + for (unsigned int i = 0; i != ndrange_max; ++i) + { + const auto start = ndc.get_position(i); + const auto size = ndc.get_size(i); + const auto stop = start + size; + + //populate the window with the dimensions of the NDRange + win.set(i, arm_compute::Window::Dimension(start, stop)); + } + + return win; +} + +/** Convert an `arm_compute::Window` to an `arm_gemm::NDRange` of the same max dimensions + * + * It should be noted that `arm_compute::Window` specifies a `start()` and an `end()` + * where as `arm_gemm::ndrange_t` only has a size, as a result we store the delta between the range + * + * @param [win] the `arm_compute::Window` we want to convert to 
`arm_gemm::ndrange_t` + * @return the resultant ndrange_t + */ +inline ndrange_t to_ndrange(const arm_compute::Window &win) +{ + return {static_cast<unsigned int>(win[0].end() - win[0].start()), + static_cast<unsigned int>(win[1].end() - win[1].start()), + static_cast<unsigned int>(win[2].end() - win[2].start()), + static_cast<unsigned int>(win[3].end() - win[3].start()), + static_cast<unsigned int>(win[4].end() - win[4].start()), + static_cast<unsigned int>(win[5].end() - win[5].start())}; +} + +/** Convert an `arm_compute::Window` to an `arm_gemm::NDCoord` of the same max dimensions + * + * @param [win] the `arm_compute::Window` we want to convert to `arm_gemm::ndcoord_t` + * @return the resultant ndcoord_t + */ +inline ndcoord_t to_ndcoord(const arm_compute::Window &win) +{ + return {{static_cast<unsigned int>(win[0].start()), static_cast<unsigned int>(win[0].end() - win[0].start())}, + {static_cast<unsigned int>(win[1].start()), static_cast<unsigned int>(win[1].end() - win[1].start())}, + {static_cast<unsigned int>(win[2].start()), static_cast<unsigned int>(win[2].end() - win[2].start())}, + {static_cast<unsigned int>(win[3].start()), static_cast<unsigned int>(win[3].end() - win[3].start())}, + {static_cast<unsigned int>(win[4].start()), static_cast<unsigned int>(win[4].end() - win[4].start())}, + {static_cast<unsigned int>(win[5].start()), static_cast<unsigned int>(win[5].end() - win[5].start())}}; +} + +} //namespace arm_gemm diff --git a/src/cpu/kernels/assembly/arm_gemm_local.hpp b/src/cpu/kernels/assembly/arm_gemm_local.hpp new file mode 100644 index 0000000000..78e0adf31f --- /dev/null +++ b/src/cpu/kernels/assembly/arm_gemm_local.hpp @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +/* This file is used to configure integration-specific aspects of arm_gemm into ACL */ + +#include "arm_compute/core/CPP/CPPTypes.h" + +using CPUModel = arm_compute::CPUModel; +using CPUInfo = arm_compute::CPUInfo; diff --git a/src/cpu/kernels/assembly/convolution_parameters.hpp b/src/cpu/kernels/assembly/convolution_parameters.hpp new file mode 100644 index 0000000000..0c1ae58902 --- /dev/null +++ b/src/cpu/kernels/assembly/convolution_parameters.hpp @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#include <cstdint> + +namespace arm_gemm +{ +/* + * Parameter set for "convolution" type GEMM. + * + * For a "convolution" GEMM, the GEMM parameters (M, K) are specified as if + * an im2row had been performed on the input tensor to generate the operand + * matrix, but instead this structure describes the convolution parameters + * such that this can be done on the fly. + * + * The parameters describe the convolution details - the notional shape of + * the input and output tensors, whether padding is to be applied, the size + * of the kernel and a constant value to be used for padding (needed for + * quantized tensors). + * + * The second part describes the layout of the input tensor in memory, which + * is assumed to be in NHWC format. This consists of a base pointer and + * strides for columns, rows and batches. 'multis' are not supported for + * convolution type GEMMs. + */ +struct ConvolutionParameters +{ + int64_t input_width; + int64_t input_height; + int64_t input_channels; + int64_t kernel_width; + int64_t kernel_height; + int64_t output_width; + int64_t output_height; + int64_t output_stride_w; + int64_t output_stride_h; + // output_channels not included as they do not affect the input. + int64_t padding_top; + int64_t padding_left; + float padding_value; +}; + +} // namespace arm_gemm diff --git a/src/cpu/kernels/assembly/gemm_common.hpp b/src/cpu/kernels/assembly/gemm_common.hpp new file mode 100644 index 0000000000..45d1e43274 --- /dev/null +++ b/src/cpu/kernels/assembly/gemm_common.hpp @@ -0,0 +1,312 @@ +/* + * Copyright (c) 2017-2021,2023-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef ACL_SRC_CPU_KERNELS_ASSEMBLY_GEMM_COMMON_HPP +#define ACL_SRC_CPU_KERNELS_ASSEMBLY_GEMM_COMMON_HPP + +#pragma once + +#include "convolution_parameters.hpp" +#include "ndrange.hpp" +#include <cstddef> + +namespace arm_gemm +{ +// Avoid circular dependency with arm_gemm.hpp +struct GemmConfig; + +// Abstract class for the GEMM/GEMV functions. +// +// GEMM implementations may be "native" (never require any input +// permutation), "pretransposed" (require permutation up-front) or require +// working space (permute as they go along). This interface should support +// all of them. + +// The real GemmCommon class is templated based on the operand and return +// type. This is an interface class which is independent of those types. +class IGemmCommon +{ +public: + /* Pass in the pointers to the arrays to be operated on and their + * strides. This "generic" version uses void *s, the preferred version + * is the one provided by templated GemmCommon (below) which takes + * appropriately typed pointers. If B is pretransposed (see below) then + * the settings for B here are ignored. + */ + virtual void set_arrays_generic(const void *A, + const int lda, + const int A_batch_stride, + const int A_multi_stride, + const void *B, + const int ldb, + /* batches share B */ const int B_multi_stride, + void *C, + const int ldc, + const int C_batch_stride, + const int C_multi_stride, + const void *bias, + /* no row or batch stride needed */ const int bias_multi_stride) = 0; + + /** @returns an ndrange containing ranges of the compute space which can be + * broken up and parallelised over + */ + virtual ndrange_t get_window_size() const = 0; + + /* The maximum thread count is specified when the GEMM is created. Some + * implementations need to know how many threads will actually run in + * order to work properly. + * + * In some cases, after creating the GEMM the number of threads needs to + * be reduced (e.g. not enough work to split across threads). This + * method allows the number of actual threads to be run to be set (must + * be equal or lower). + * + * This has an empty default implementation, as GEMMs which don't care + * about thread count can safely ignore this. + */ + virtual void set_nthreads(int){}; + + /* Whether this GEMM can be dynamically scheduled or not. */ + virtual bool supports_dynamic_scheduling() const + { + return false; + } + + /** Main execute member fucntion + * @param [in] work_range specifies the range of work we want to be computed, total range defined by get_window_size() + * @param [in] thread_locator where are we inside of the thread space + * @param [in] threadid a unique threadid + */ + virtual void execute(const ndcoord_t &work_range, const ndcoord_t &thread_locator, int threadid) = 0; + + /*** Working space interface (optional) ***/ + /* Total number of bytes of temporary working space needed. If zero, it's not necessary to call set_working_space(). 
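+ *
+ * A hedged sketch of the intended call sequence for this optional interface (the allocation strategy
+ * and the names below are placeholders, not part of this header):
+ *
+ *   const size_t wss = gemm->get_working_size();
+ *   std::vector<uint8_t> workspace(wss);
+ *   if (wss != 0)
+ *   {
+ *       gemm->set_working_space(workspace.data());
+ *   }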
+ virtual size_t get_working_size() const
+ {
+ return 0;
+ }
+ /* Provide working space buffer - the void * passed in must remain allocated for the duration of any execute calls. */
+ virtual void set_working_space(void *){};
+
+ /*** "Pretransposed" interface (optional) ***/
+ /* Is this object set up for pretranspose? If so, pretranspose_B_array() needs to be called before execute(); */
+ virtual bool B_is_pretransposed() const
+ {
+ return false;
+ }
+ /* Does pretranspose still need to be done? */
+ virtual bool B_pretranspose_required() const
+ {
+ return false;
+ }
+ /* Does pretranspose accept the transposed flag? */
+ virtual bool B_pretranspose_supports_transpose() const
+ {
+ return false;
+ }
+ /* Total number of bytes of space needed for pretransposed arrays. */
+ virtual size_t get_B_pretransposed_array_size() const
+ {
+ return 0;
+ }
+ /* Amount of work for the threaded cases */
+ virtual size_t get_B_pretranspose_window_size() const
+ {
+ return 1;
+ }
+ /* Perform pretranspose - arguments are output, input, input row stride and input multi stride. */
+ /* The "real" version of this depends on the templated operand type (see below). */
+ virtual void pretranspose_B_array_generic(void *, const void *, const int, const int, bool) = 0;
+ /* Threaded version with window start/end parameters */
+ virtual void
+ pretranspose_B_array_part_generic(void *, const void *, const int, const int, bool, const size_t, const size_t) = 0;
+
+ /* Set pretransposed data - the void * passed in must previously have been passed to pretranspose_B_array() for the same or a similar GEMM. */
+ virtual void set_pretransposed_B_data(void *)
+ {
+ }
+
+ /*** "Quantized bias" interface (optional) ***/
+ /* Set the bias vector for quantized GEMMs */
+ virtual void set_quantized_bias(const int32_t *, size_t)
+ {
+ }
+
+ /*** Indirect interface (optional) ***/
+ /* Set the indirect table. This comprises a number of values per kernel point, and a densely packed array of pointers,
+ * multis * batches * kernel_points */
+ virtual void set_indirect_parameters_generic(size_t, const void *const *const *)
+ {
+ }
+
+ /*** Convolution interface (optional) ***/
+ /* Set the convolution parameters. */
+ virtual void set_convolution_parameters(ConvolutionParameters)
+ {
+ }
+
+ /*** Dequantize scale interface (optional) ***/
+ /* Set the dequantize scale for GEMMs when converting from int to float (float out = scale * float(int out) ) */
+ virtual void set_dequantize_scale(const float)
+ {
+ }
+
+ /*** Introspection interface ***/
+ /* Get the configuration of this GEMM */
+ virtual GemmConfig get_config() = 0;
+
+ // Destructor
+ virtual ~IGemmCommon()
+ {
+ }
+};
+
+/* "Real" GemmCommon class which is templated on the operand and return types.
+ *
+ * In addition to correctly typed versions of the functions that operate on
+ * operand and return data, this class provides a default implementation of
+ * 'set_arrays' to capture the provided arguments in protected class
+ * members, as essentially any implementation will need these.
+ */ +template <typename To, typename Tr> +class GemmCommon : public IGemmCommon +{ +protected: + const To *_Aptr = nullptr; + int _lda = 0; + int _A_batch_stride = 0; + int _A_multi_stride = 0; + const To *_Bptr = nullptr; + int _ldb = 0; + int _B_multi_stride = 0; + Tr *_Cptr = nullptr; + int _ldc = 0; + int _C_batch_stride = 0; + int _C_multi_stride = 0; + const Tr *_bias = nullptr; + int _bias_multi_stride = 0; + +public: + /* Pass in the pointers to the arrays to be operated on and their + * strides (templated version with appropriate types). */ + virtual void set_arrays(const To *A, + const int lda, + const int A_batch_stride, + const int A_multi_stride, + const To *B, + const int ldb, + /* batches share B */ const int B_multi_stride, + Tr *C, + const int ldc, + const int C_batch_stride, + const int C_multi_stride, + const Tr *bias, + /* no row or batch stride needed */ const int bias_multi_stride) + { + _Aptr = A; + _lda = lda; + _A_batch_stride = A_batch_stride; + _A_multi_stride = A_multi_stride; + _Bptr = B; + _ldb = ldb; + _B_multi_stride = B_multi_stride; + _Cptr = C; + _ldc = ldc; + _C_batch_stride = C_batch_stride; + _C_multi_stride = C_multi_stride; + _bias = bias; + _bias_multi_stride = bias_multi_stride; + } + + /* Implementation of the void * overload which casts its arguments to the appropriate type. */ + void set_arrays_generic(const void *A, + const int lda, + const int A_batch_stride, + const int A_multi_stride, + const void *B, + const int ldb, + /* batches share B */ const int B_multi_stride, + void *C, + const int ldc, + const int C_batch_stride, + const int C_multi_stride, + const void *bias, + /* no row or batch stride needed */ const int bias_multi_stride) override + { + set_arrays(static_cast<const To *>(A), lda, A_batch_stride, A_multi_stride, static_cast<const To *>(B), ldb, + B_multi_stride, static_cast<Tr *>(C), ldc, C_batch_stride, C_multi_stride, + static_cast<const Tr *>(bias), bias_multi_stride); + } + + /*** "Pretransposed" interface ***/ + + /* Compute col sums over all columns */ + virtual void requantize_bias(void *, const To *, const int, const int){}; + + /* Perform pretranspose - the void * passed in must remain allocated for the duration of any execute calls. */ + /* Arguments are: output buffer pointer, source pointer, source row stride, source multi stride */ + virtual void pretranspose_B_array(void *, const To *, const int, const int, bool){}; + + /* Implementation of the void * overload which casts its arguments to the appropriate type. */ + void pretranspose_B_array_generic( + void *out, const void *in, const int row_stride, const int multi_stride, bool transposed) override + { + pretranspose_B_array(out, static_cast<const To *>(in), row_stride, multi_stride, transposed); + } + + /* Threaded versions of the above. + * The fallback/backwards compatible version of the threaded interface exposes a window size of 1 and + * just calls the non-threaded functions to do the work. This is valid as with window size of 1 the only + * legal values for start and end are 0 and 1 respectively. 
*/ + virtual void pretranspose_B_array_part( + void *out, const To *in, const int row_stride, const int multi_stride, bool transposed, size_t, size_t) + { + pretranspose_B_array(out, in, row_stride, multi_stride, transposed); + }; + + void pretranspose_B_array_part_generic(void *out, + const void *in, + const int row_stride, + const int multi_stride, + bool transposed, + size_t start, + size_t end) override + { + pretranspose_B_array_part(out, static_cast<const To *>(in), row_stride, multi_stride, transposed, start, end); + } + + /*** Indirect interface ***/ + virtual void set_indirect_parameters(size_t, const To *const *const *) + { + } + + void set_indirect_parameters_generic(size_t sz, const void *const *const *ptr) override + { + set_indirect_parameters(sz, reinterpret_cast<const To *const *const *>(ptr)); + } +}; + +} // namespace arm_gemm + +#endif // ACL_SRC_CPU_KERNELS_ASSEMBLY_GEMM_COMMON_HPP diff --git a/src/cpu/kernels/assembly/ndrange.hpp b/src/cpu/kernels/assembly/ndrange.hpp new file mode 100644 index 0000000000..baccdc0d88 --- /dev/null +++ b/src/cpu/kernels/assembly/ndrange.hpp @@ -0,0 +1,196 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#pragma once + +#include <algorithm> +#include <array> +#include <cassert> +#include <initializer_list> + +namespace arm_gemm +{ +template <unsigned int D> +class NDRange +{ +private: + std::array<unsigned int, D> m_sizes{}; + std::array<unsigned int, D> m_totalsizes{}; + + class NDRangeIterator + { + private: + const NDRange &m_parent; + unsigned int m_pos = 0; + unsigned int m_end = 0; + + public: + NDRangeIterator(const NDRange &p, unsigned int s, unsigned int e) : m_parent(p), m_pos(s), m_end(e) + { + } + + bool done() const + { + return (m_pos >= m_end); + } + + unsigned int dim(unsigned int d) const + { + unsigned int r = m_pos; + + if (d < (D - 1)) + { + r %= m_parent.m_totalsizes[d]; + } + + if (d > 0) + { + r /= m_parent.m_totalsizes[d - 1]; + } + + return r; + } + + bool next_dim0() + { + m_pos++; + + return !done(); + } + + bool next_dim1() + { + m_pos += m_parent.m_sizes[0] - dim(0); + + return !done(); + } + + unsigned int dim0_max() const + { + unsigned int offset = std::min(m_end - m_pos, m_parent.m_sizes[0] - dim(0)); + + return dim(0) + offset; + } + }; + + void set_totalsizes() + { + unsigned int t = 1; + + for (unsigned int i = 0; i < D; i++) + { + if (m_sizes[i] == 0) + { + m_sizes[i] = 1; + } + + t *= m_sizes[i]; + + m_totalsizes[i] = t; + } + } + +public: + NDRange &operator=(const NDRange &rhs) = default; + NDRange(const NDRange &rhs) = default; + + template <typename... T> + NDRange(T... ts) : m_sizes{ts...} + { + set_totalsizes(); + } + + NDRange(const std::array<unsigned int, D> &n) : m_sizes(n) + { + set_totalsizes(); + } + + NDRangeIterator iterator(unsigned int start, unsigned int end) const + { + return NDRangeIterator(*this, start, end); + } + + unsigned int total_size() const + { + return m_totalsizes[D - 1]; + } + + unsigned int get_size(unsigned int v) const + { + return m_sizes[v]; + } +}; + +/** NDCoordinate builds upon a range, but specifies a starting position + * in addition to a size which it inherits from NDRange + */ +template <unsigned int N> +class NDCoordinate : public NDRange<N> +{ + using int_t = unsigned int; + using ndrange_t = NDRange<N>; + + std::array<int_t, N> m_positions{}; + +public: + NDCoordinate &operator=(const NDCoordinate &rhs) = default; + NDCoordinate(const NDCoordinate &rhs) = default; + NDCoordinate(const std::initializer_list<std::pair<int_t, int_t>> &list) + { + std::array<int_t, N> sizes{}; + + std::size_t i = 0; + for (auto &p : list) + { + m_positions[i] = p.first; + sizes[i++] = p.second; + } + + //update the parents sizes + static_cast<ndrange_t &>(*this) = ndrange_t(sizes); + } + + int_t get_position(int_t d) const + { + assert(d < N); + + return m_positions[d]; + } + + void set_position(int_t d, int_t v) + { + assert(d < N); + + m_positions[d] = v; + } + + int_t get_position_end(int_t d) const + { + return get_position(d) + ndrange_t::get_size(d); + } +}; //class NDCoordinate + +using ndrange_t = NDRange<6>; +using ndcoord_t = NDCoordinate<6>; + +} // namespace arm_gemm diff --git a/src/cpu/kernels/boundingboxtransform/generic/neon/fp16.cpp b/src/cpu/kernels/boundingboxtransform/generic/neon/fp16.cpp new file mode 100644 index 0000000000..dbdec5fb50 --- /dev/null +++ b/src/cpu/kernels/boundingboxtransform/generic/neon/fp16.cpp @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) +#include "src/cpu/CpuTypes.h" +#include "src/cpu/kernels/boundingboxtransform/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void neon_fp16_boundingboxtransform(const ITensor *boxes, + ITensor *pred_boxes, + const ITensor *deltas, + BoundingBoxTransformInfo bbinfo, + const Window &window) +{ + return bounding_box_transform<float16_t>(boxes, pred_boxes, deltas, bbinfo, window); +} +} // namespace cpu +} // namespace arm_compute +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/boundingboxtransform/generic/neon/fp32.cpp b/src/cpu/kernels/boundingboxtransform/generic/neon/fp32.cpp new file mode 100644 index 0000000000..0224b3406a --- /dev/null +++ b/src/cpu/kernels/boundingboxtransform/generic/neon/fp32.cpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/cpu/kernels/boundingboxtransform/generic/neon/impl.h" +namespace arm_compute +{ +namespace cpu +{ +void neon_fp32_boundingboxtransform(const ITensor *boxes, + ITensor *pred_boxes, + const ITensor *deltas, + BoundingBoxTransformInfo bbinfo, + const Window &window) +{ + return bounding_box_transform<float>(boxes, pred_boxes, deltas, bbinfo, window); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/boundingboxtransform/generic/neon/impl.cpp b/src/cpu/kernels/boundingboxtransform/generic/neon/impl.cpp new file mode 100644 index 0000000000..5a2939b587 --- /dev/null +++ b/src/cpu/kernels/boundingboxtransform/generic/neon/impl.cpp @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2019-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/boundingboxtransform/generic/neon/impl.h" + +#include "src/cpu/CpuTypes.h" + +namespace arm_compute +{ +namespace cpu +{ +void bounding_box_transform_qsymm16(const ITensor *boxes, + ITensor *pred_boxes, + const ITensor *deltas, + BoundingBoxTransformInfo bbinfo, + const Window &window) + +{ + const size_t num_classes = deltas->info()->tensor_shape()[0] >> 2; + const size_t deltas_width = deltas->info()->tensor_shape()[0]; + const int img_h = std::floor(bbinfo.img_height() / bbinfo.scale() + 0.5f); + const int img_w = std::floor(bbinfo.img_width() / bbinfo.scale() + 0.5f); + + const auto scale_after = (bbinfo.apply_scale() ? bbinfo.scale() : 1.f); + const auto scale_before = bbinfo.scale(); + const auto offset = (bbinfo.correct_transform_coords() ? 
1.f : 0.f); + + auto pred_ptr = + reinterpret_cast<uint16_t *>(pred_boxes->buffer() + pred_boxes->info()->offset_first_element_in_bytes()); + auto delta_ptr = reinterpret_cast<uint8_t *>(deltas->buffer() + deltas->info()->offset_first_element_in_bytes()); + + const auto boxes_qinfo = boxes->info()->quantization_info().uniform(); + const auto deltas_qinfo = deltas->info()->quantization_info().uniform(); + const auto pred_qinfo = pred_boxes->info()->quantization_info().uniform(); + + Iterator box_it(boxes, window); + execute_window_loop( + window, + [&](const Coordinates &id) + { + const auto ptr = reinterpret_cast<uint16_t *>(box_it.ptr()); + const auto b0 = dequantize_qasymm16(*ptr, boxes_qinfo); + const auto b1 = dequantize_qasymm16(*(ptr + 1), boxes_qinfo); + const auto b2 = dequantize_qasymm16(*(ptr + 2), boxes_qinfo); + const auto b3 = dequantize_qasymm16(*(ptr + 3), boxes_qinfo); + const float width = (b2 / scale_before) - (b0 / scale_before) + 1.f; + const float height = (b3 / scale_before) - (b1 / scale_before) + 1.f; + const float ctr_x = (b0 / scale_before) + 0.5f * width; + const float ctr_y = (b1 / scale_before) + 0.5f * height; + for (size_t j = 0; j < num_classes; ++j) + { + // Extract deltas + const size_t delta_id = id.y() * deltas_width + 4u * j; + const float dx = dequantize_qasymm8(delta_ptr[delta_id], deltas_qinfo) / bbinfo.weights()[0]; + const float dy = dequantize_qasymm8(delta_ptr[delta_id + 1], deltas_qinfo) / bbinfo.weights()[1]; + float dw = dequantize_qasymm8(delta_ptr[delta_id + 2], deltas_qinfo) / bbinfo.weights()[2]; + float dh = dequantize_qasymm8(delta_ptr[delta_id + 3], deltas_qinfo) / bbinfo.weights()[3]; + // Clip dw and dh + dw = std::min(dw, bbinfo.bbox_xform_clip()); + dh = std::min(dh, bbinfo.bbox_xform_clip()); + // Determine the predictions + const float pred_ctr_x = dx * width + ctr_x; + const float pred_ctr_y = dy * height + ctr_y; + const float pred_w = std::exp(dw) * width; + const float pred_h = std::exp(dh) * height; + // Store the prediction into the output tensor + pred_ptr[delta_id] = quantize_qasymm16( + scale_after * utility::clamp<float>(pred_ctr_x - 0.5f * pred_w, 0.f, img_w - 1.f), pred_qinfo); + pred_ptr[delta_id + 1] = quantize_qasymm16( + scale_after * utility::clamp<float>(pred_ctr_y - 0.5f * pred_h, 0.f, img_h - 1.f), pred_qinfo); + pred_ptr[delta_id + 2] = quantize_qasymm16( + scale_after * utility::clamp<float>(pred_ctr_x + 0.5f * pred_w - offset, 0.f, img_w - 1.f), + pred_qinfo); + pred_ptr[delta_id + 3] = quantize_qasymm16( + scale_after * utility::clamp<float>(pred_ctr_y + 0.5f * pred_h - offset, 0.f, img_h - 1.f), + pred_qinfo); + } + }, + box_it); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/boundingboxtransform/generic/neon/impl.h b/src/cpu/kernels/boundingboxtransform/generic/neon/impl.h new file mode 100644 index 0000000000..d8013c6227 --- /dev/null +++ b/src/cpu/kernels/boundingboxtransform/generic/neon/impl.h @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CORE_SVE_KERNELS_BOUNDINGBOXTRANFORM_IMPL_H +#define SRC_CORE_SVE_KERNELS_BOUNDINGBOXTRANFORM_IMPL_H +#include "arm_compute/core/Helpers.h" + +namespace arm_compute +{ +namespace cpu +{ +template <typename T> +void bounding_box_transform(const ITensor *boxes, + ITensor *pred_boxes, + const ITensor *deltas, + BoundingBoxTransformInfo bbinfo, + const Window &window) +{ + const size_t num_classes = deltas->info()->tensor_shape()[0] >> 2; + const size_t deltas_width = deltas->info()->tensor_shape()[0]; + const int img_h = std::floor(bbinfo.img_height() / bbinfo.scale() + 0.5f); + const int img_w = std::floor(bbinfo.img_width() / bbinfo.scale() + 0.5f); + + const auto scale_after = (bbinfo.apply_scale() ? T(bbinfo.scale()) : T(1)); + const auto scale_before = T(bbinfo.scale()); + ARM_COMPUTE_ERROR_ON(scale_before <= 0); + const auto offset = (bbinfo.correct_transform_coords() ? 
T(1.f) : T(0.f)); + + auto pred_ptr = reinterpret_cast<T *>(pred_boxes->buffer() + pred_boxes->info()->offset_first_element_in_bytes()); + auto delta_ptr = reinterpret_cast<T *>(deltas->buffer() + deltas->info()->offset_first_element_in_bytes()); + + Iterator box_it(boxes, window); + execute_window_loop( + window, + [&](const Coordinates &id) + { + const auto ptr = reinterpret_cast<T *>(box_it.ptr()); + const auto b0 = *ptr; + const auto b1 = *(ptr + 1); + const auto b2 = *(ptr + 2); + const auto b3 = *(ptr + 3); + const T width = (b2 / scale_before) - (b0 / scale_before) + T(1.f); + const T height = (b3 / scale_before) - (b1 / scale_before) + T(1.f); + const T ctr_x = (b0 / scale_before) + T(0.5f) * width; + const T ctr_y = (b1 / scale_before) + T(0.5f) * height; + for (size_t j = 0; j < num_classes; ++j) + { + // Extract deltas + const size_t delta_id = id.y() * deltas_width + 4u * j; + const T dx = delta_ptr[delta_id] / T(bbinfo.weights()[0]); + const T dy = delta_ptr[delta_id + 1] / T(bbinfo.weights()[1]); + T dw = delta_ptr[delta_id + 2] / T(bbinfo.weights()[2]); + T dh = delta_ptr[delta_id + 3] / T(bbinfo.weights()[3]); + // Clip dw and dh + dw = std::min(dw, T(bbinfo.bbox_xform_clip())); + dh = std::min(dh, T(bbinfo.bbox_xform_clip())); + // Determine the predictions + const T pred_ctr_x = dx * width + ctr_x; + const T pred_ctr_y = dy * height + ctr_y; + const T pred_w = std::exp(dw) * width; + const T pred_h = std::exp(dh) * height; + // Store the prediction into the output tensor + pred_ptr[delta_id] = scale_after * utility::clamp<T>(pred_ctr_x - T(0.5f) * pred_w, T(0), T(img_w - 1)); + pred_ptr[delta_id + 1] = + scale_after * utility::clamp<T>(pred_ctr_y - T(0.5f) * pred_h, T(0), T(img_h - 1)); + pred_ptr[delta_id + 2] = + scale_after * utility::clamp<T>(pred_ctr_x + T(0.5f) * pred_w - offset, T(0), T(img_w - 1)); + pred_ptr[delta_id + 3] = + scale_after * utility::clamp<T>(pred_ctr_y + T(0.5f) * pred_h - offset, T(0), T(img_h - 1)); + } + }, + box_it); +} + +void bounding_box_transform_qsymm16(const ITensor *boxes, + ITensor *pred_boxes, + const ITensor *deltas, + BoundingBoxTransformInfo bbinfo, + const Window &window); +} // namespace cpu +} // namespace arm_compute +#endif //define SRC_CORE_SVE_KERNELS_BOUNDINGBOXTRANFORM_IMPL_H diff --git a/src/cpu/kernels/boundingboxtransform/generic/neon/qsymm16.cpp b/src/cpu/kernels/boundingboxtransform/generic/neon/qsymm16.cpp new file mode 100644 index 0000000000..64ef815195 --- /dev/null +++ b/src/cpu/kernels/boundingboxtransform/generic/neon/qsymm16.cpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/boundingboxtransform/generic/neon/impl.h" +namespace arm_compute +{ +namespace cpu +{ +void neon_qu16_boundingboxtransform(const ITensor *boxes, + ITensor *pred_boxes, + const ITensor *deltas, + BoundingBoxTransformInfo bbinfo, + const Window &window) +{ + return bounding_box_transform_qsymm16(boxes, pred_boxes, deltas, bbinfo, window); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/boundingboxtransform/list.h b/src/cpu/kernels/boundingboxtransform/list.h new file mode 100644 index 0000000000..4da725a257 --- /dev/null +++ b/src/cpu/kernels/boundingboxtransform/list.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CORE_NEON_KERNELS_BOUNDINGBOXTRANFORM_LIST_H +#define SRC_CORE_NEON_KERNELS_BOUNDINGBOXTRANFORM_LIST_H +namespace arm_compute +{ +namespace cpu +{ +#define DECLARE_BOUNDINGBOXTRANFORM_KERNEL(func_name) \ + void func_name(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, \ + const Window &window) +DECLARE_BOUNDINGBOXTRANFORM_KERNEL(neon_fp32_boundingboxtransform); +DECLARE_BOUNDINGBOXTRANFORM_KERNEL(neon_fp16_boundingboxtransform); +DECLARE_BOUNDINGBOXTRANFORM_KERNEL(neon_qu16_boundingboxtransform); +#undef DECLARE_BOUNDINGBOXTRANFORM_KERNEL +} // namespace cpu +} // namespace arm_compute +#endif //SRC_CORE_NEON_KERNELS_BOUNDINGBOXTRANFORM_LIST_H diff --git a/src/cpu/kernels/cast/generic/neon/fp16.cpp b/src/cpu/kernels/cast/generic/neon/fp16.cpp new file mode 100644 index 0000000000..2897f4b242 --- /dev/null +++ b/src/cpu/kernels/cast/generic/neon/fp16.cpp @@ -0,0 +1,380 @@ +/* + * Copyright (c) 2016-2023 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + +#include "arm_compute/core/CPP/CPPTypes.h" +#include "arm_compute/core/TensorInfo.h" + +#include "src/cpu/kernels/cast/list.h" +#include "src/cpu/kernels/CpuCastKernel.h" +#include "support/SaturateCast.h" + +#include "arm_neon.h" + +namespace arm_compute +{ +namespace cpu +{ +void neon_qasymm8_signed_to_fp16_cast( + const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_UNUSED(_policy); + + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const int window_step_x = 16; + + ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); + ARM_COMPUTE_ERROR_ON(_src == _dst); + + ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); + + Window win{window}; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator src(_src, win); + Iterator dst(_dst, win); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr()); + int x = window_start_x; + + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int8x16_t texels_s8 = vld1q_s8(src_ptr + x); + + const int16x8x2_t texels = {{vmovl_s8(vget_low_s8(texels_s8)), vmovl_s8(vget_high_s8(texels_s8))}}; + vst1q_f16(dst_ptr + x, vcvtq_f16_s16(texels.val[0])); + vst1q_f16(dst_ptr + x + 8, vcvtq_f16_s16(texels.val[1])); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x)); + } + }, + src, dst); +} + +void neon_s32_to_fp16_cast( + const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_UNUSED(_policy); + + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const int window_step_x = 16; + + ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); + ARM_COMPUTE_ERROR_ON(_src == _dst); + + ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); + + Window win{window}; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator src(_src, win); + Iterator dst(_dst, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + 
const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const float32x4x4_t texels = { + {vcvtq_f32_s32(vld1q_s32(src_ptr + x)), vcvtq_f32_s32(vld1q_s32(src_ptr + x + 4)), + vcvtq_f32_s32(vld1q_s32(src_ptr + x + 8)), vcvtq_f32_s32(vld1q_s32(src_ptr + x + 12))}}; + + vst1q_f16(dst_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1]))); + vst1q_f16(dst_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3]))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x)); + } + }, + src, dst); +} + +void neon_fp32_to_fp16_cast( + const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_UNUSED(_policy); + + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const int window_step_x = 16; + + ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); + ARM_COMPUTE_ERROR_ON(_src == _dst); + + ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); + + Window win{window}; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator src(_src, win); + Iterator dst(_dst, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto src_ptr = reinterpret_cast<const float *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const float32x4x4_t texels = {{vld1q_f32(src_ptr + x), vld1q_f32(src_ptr + x + 4), + vld1q_f32(src_ptr + x + 8), vld1q_f32(src_ptr + x + 12)}}; + + vst1q_f16(dst_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1]))); + vst1q_f16(dst_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3]))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x)); + } + }, + src, dst); +} + +void neon_fp16_to_other_dt_cast( + const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_UNUSED(_policy); + + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const int window_step_x = 16; + + ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); + ARM_COMPUTE_ERROR_ON(_src == _dst); + + ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); + + Window win{window}; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator src(_src, win); + Iterator dst(_dst, win); + switch (_dst->info()->data_type()) + { + case DataType::QASYMM8_SIGNED: + { + /* Down-conversion F16 -> QASYMM8_SIGNED (Always saturating) */ + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const float16x8x2_t texels = {{ + vld1q_f16(src_ptr + x), + vld1q_f16(src_ptr + x + 8), + }}; + + vst1q_s8(dst_ptr + x, vcombine_s8(vqmovn_s16(vcvtq_s16_f16(texels.val[0])), + vqmovn_s16(vcvtq_s16_f16(texels.val[1])))); + } + + // Compute left-over 
elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x)); + } + }, + src, dst); + break; + } + case DataType::QASYMM8: + case DataType::U8: + { + /* Down-conversion F16 -> QASYMM8/U8 (Always saturating) */ + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const float16x8x2_t texels = {{ + vld1q_f16(src_ptr + x), + vld1q_f16(src_ptr + x + 8), + }}; + + vst1q_u8(dst_ptr + x, vcombine_u8(vqmovun_s16(vcvtq_s16_f16(texels.val[0])), + vqmovun_s16(vcvtq_s16_f16(texels.val[1])))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x)); + } + }, + src, dst); + break; + } + case DataType::F32: + { + /* Up-conversion F16 -> F32 */ + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<float *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const float16x8x2_t texels = {{vld1q_f16(src_ptr + x), vld1q_f16(src_ptr + x + 8)}}; + vst1q_f32(dst_ptr + x, vcvt_f32_f16(vget_low_f16(texels.val[0]))); + vst1q_f32(dst_ptr + x + 4, vcvt_f32_f16(vget_high_f16(texels.val[0]))); + vst1q_f32(dst_ptr + x + 8, vcvt_f32_f16(vget_low_f16(texels.val[1]))); + vst1q_f32(dst_ptr + x + 12, vcvt_f32_f16(vget_high_f16(texels.val[1]))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<float>(*(src_ptr + x)); + } + }, + src, dst); + break; + } + case DataType::S32: + { + /* Up-conversion F16 -> S32 */ + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const float16x8x2_t texels = {{vld1q_f16(src_ptr + x), vld1q_f16(src_ptr + x + 8)}}; + + vst1q_s32(dst_ptr + x, vcvtq_s32_f32(vcvt_f32_f16(vget_low_f16(texels.val[0])))); + vst1q_s32(dst_ptr + x + 4, vcvtq_s32_f32(vcvt_f32_f16(vget_high_f16(texels.val[0])))); + vst1q_s32(dst_ptr + x + 8, vcvtq_s32_f32(vcvt_f32_f16(vget_low_f16(texels.val[1])))); + vst1q_s32(dst_ptr + x + 12, vcvtq_s32_f32(vcvt_f32_f16(vget_high_f16(texels.val[1])))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x)); + } + }, + src, dst); + break; + } + default: + ARM_COMPUTE_ERROR("dst data type not supported"); + } +} + +void neon_u8_to_fp16_cast( + const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_UNUSED(_policy); + + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const int window_step_x = 16; + + ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); + ARM_COMPUTE_ERROR_ON(_src == _dst); + + ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); + + Window win{window}; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator src(_src, win); + Iterator dst(_dst, win); + /* Up-conversion U8 -> F16 */ + 
execute_window_loop( + win, + [&](const Coordinates &) + { + const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x); + + const int16x8x2_t texels = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))}}; + vst1q_f16(dst_ptr + x, vcvtq_f16_s16(texels.val[0])); + vst1q_f16(dst_ptr + x + 8, vcvtq_f16_s16(texels.val[1])); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x)); + } + }, + src, dst); + return; +} + +} // namespace cpu +} // namespace arm_compute +#endif /* #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/cast/list.h b/src/cpu/kernels/cast/list.h new file mode 100644 index 0000000000..5e634fc170 --- /dev/null +++ b/src/cpu/kernels/cast/list.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CORE_NEON_KERNELS_CAST_LIST_H +#define SRC_CORE_NEON_KERNELS_CAST_LIST_H +namespace arm_compute +{ +namespace cpu +{ +#define DECLARE_CAST_KERNEL(func_name) \ + void func_name(const ITensor *_src, ITensor *_dst, const ThreadInfo &tensor, ConvertPolicy _policy, \ + const Window &window) + +DECLARE_CAST_KERNEL(neon_fp32_to_fp16_cast); +DECLARE_CAST_KERNEL(neon_u8_to_fp16_cast); +DECLARE_CAST_KERNEL(neon_fp16_to_other_dt_cast); +DECLARE_CAST_KERNEL(neon_s32_to_fp16_cast); +DECLARE_CAST_KERNEL(neon_qasymm8_signed_to_fp16_cast); +DECLARE_CAST_KERNEL(neon_fp32_to_bfloat16_cast); +DECLARE_CAST_KERNEL(neon_bfloat16_to_fp32_cast); + +#undef DECLARE_CAST_KERNEL +} // namespace cpu +} // namespace arm_compute +#endif //SRC_CORE_NEON_KERNELS_CAST_LIST_H diff --git a/src/cpu/kernels/conv3d/neon/list.h b/src/cpu/kernels/conv3d/neon/list.h new file mode 100644 index 0000000000..082c60be29 --- /dev/null +++ b/src/cpu/kernels/conv3d/neon/list.h @@ -0,0 +1,197 @@ +/* + * Copyright (c) 2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CORE_NEON_KERNELS_CONV3D_LIST_H +#define SRC_CORE_NEON_KERNELS_CONV3D_LIST_H + +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/Traits.h" +#include "arm_compute/runtime/FunctionDescriptors.h" + +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/kernels/conv3d/neon/quantized.h" + +namespace arm_compute +{ +namespace cpu +{ +template <typename T> +void directconv3d_float_neon_ndhwc(const ITensor *src0, + const ITensor *src1, + const ITensor *src2, + ITensor *dst, + const Conv3dInfo &conv_info, + const Window &window) +{ + const ITensor *src = src0; + const ITensor *weights = src1; + const ITensor *biases = src2; + + using vtype = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>; + using vector_type = typename vtype::type; + using tag_type = typename vtype::tag_type; + constexpr int num_elems_read_per_iteration = 16 / sizeof(T); + + // Scalar quantities (N D H W Cin) + const int element_size = src->info()->element_size(); + const int input_stride_w = src->info()->strides_in_bytes().y() / element_size; + const int input_stride_h = src->info()->strides_in_bytes().z() / element_size; + const int input_stride_d = src->info()->strides_in_bytes()[3] / element_size; + const int input_stride_n = src->info()->strides_in_bytes()[4] / element_size; + const int input_dim_w = src->info()->dimension(1); + const int input_dim_h = src->info()->dimension(2); + const int input_dim_d = src->info()->dimension(3); + + // Kernel info (D H W Cin Cout) + const unsigned int kernel_stride_w = weights->info()->strides_in_bytes()[2] / element_size; + const unsigned int kernel_stride_h = weights->info()->strides_in_bytes()[3] / element_size; + const unsigned int kernel_stride_d = weights->info()->strides_in_bytes()[4] / element_size; + const int kernel_dim_w = weights->info()->dimension(2); + const int kernel_dim_h = weights->info()->dimension(3); + const int kernel_dim_d = weights->info()->dimension(4); + + // Convolution padding and stride + const int conv_pad_top = conv_info.padding.top; + const int conv_pad_left = conv_info.padding.left; + const int conv_pad_front = conv_info.padding.front; + const int conv_stride_w = conv_info.stride.width; + const int conv_stride_h = conv_info.stride.height; + const int conv_stride_d = conv_info.stride.depth; + + // Setup input window for the output 
iterator + Window window_out = window; + window_out.set(Window::DimX, Window::Dimension(0, 1, 1)); + + // Setup input window for the weights iterator + Window window_w = calculate_max_window(*weights->info(), Steps()); + window_w.set(Window::DimY, Window::Dimension(0, 1, 1)); + window_w.set(Window::DimZ, Window::Dimension(0, 1, 1)); + window_w.set(Window::DimW, Window::Dimension(0, 1, 1)); + window_w.set(4, Window::Dimension(0, 1, 1)); + + Iterator out(dst, window_out); + Iterator wei(weights, window_w); + + const T *biases_ptr = nullptr; + if (biases != nullptr) + { + biases_ptr = reinterpret_cast<T *>(biases->buffer() + biases->info()->offset_first_element_in_bytes()); + } + execute_window_loop( + window_out, + [&](const Coordinates &id) + { + // We are computing the theoretical input starting points + const int in_w_start_t = static_cast<int>(id.y()) * conv_stride_w - conv_pad_left; + const int in_h_start_t = static_cast<int>(id.z()) * conv_stride_h - conv_pad_top; + const int in_d_start_t = static_cast<int>(id[3]) * conv_stride_d - conv_pad_front; + const int in_w_end_t = in_w_start_t + kernel_dim_w; + const int in_h_end_t = in_h_start_t + kernel_dim_h; + const int in_d_end_t = in_d_start_t + kernel_dim_d; + + // We are computing the valid initial and ending input points by checking the borders + const int in_w_start = std::max(in_w_start_t, 0); + const int in_h_start = std::max(in_h_start_t, 0); + const int in_d_start = std::max(in_d_start_t, 0); + const int in_w_end = std::min(in_w_end_t, input_dim_w); + const int in_h_end = std::min(in_h_end_t, input_dim_h); + const int in_d_end = std::min(in_d_end_t, input_dim_d); + + // We use the input points to select the valid weight points to use + const int wei_w_start = in_w_start - in_w_start_t; + const int wei_h_start = in_h_start - in_h_start_t; + const int wei_d_start = in_d_start - in_d_start_t; + const int wei_w_end = kernel_dim_w - (in_w_end_t - in_w_end); + const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end); + const int wei_d_end = kernel_dim_d - (in_d_end_t - in_d_end); + + const int index_c_out_end = weights->info()->dimension(0); + const int index_c_in_end = weights->info()->dimension(1); + const T *const in_ptr_start = + reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) + + id[4] * input_stride_n; + + execute_window_loop( + window_w, + [&](const Coordinates &id_w) + { + /* + * This is the loop in the weights, and it goes along OFM (output feature map) + */ + const auto weights_ptr_start = reinterpret_cast<const T *>(wei.ptr()); + T out_temp = static_cast<T>(0); + T *out_ptr = reinterpret_cast<T *>(out.ptr()); + for (int index_wei_d = wei_d_start, index_in_d = in_d_start; index_wei_d < wei_d_end; + ++index_wei_d, ++index_in_d) + { + const auto in_ptr_d = in_ptr_start + index_in_d * input_stride_d; + const auto weights_ptr_d = weights_ptr_start + index_wei_d * kernel_stride_d; + for (int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; + ++index_wei_h, ++index_in_h) + { + const T *const in_ptr_row = in_ptr_d + index_in_h * input_stride_h; + const T *const weights_ptr_row = weights_ptr_d + index_wei_h * kernel_stride_h; + for (int index_wei_w = wei_w_start, index_in_w = in_w_start; index_wei_w < wei_w_end; + ++index_wei_w, ++index_in_w) + { + const T *in_ptr_mover = in_ptr_row + index_in_w * input_stride_w; + const T *weights_ptr_mover = weights_ptr_row + index_wei_w * kernel_stride_w; + int index_c_in = 0; + vector_type out_temp_vec = 
wrapper::vdup_n(static_cast<T>(0), tag_type()); + vector_type w_vec = wrapper::vdup_n(static_cast<T>(0), tag_type()); + for (; index_c_in <= index_c_in_end - num_elems_read_per_iteration; + index_c_in += num_elems_read_per_iteration, + in_ptr_mover += num_elems_read_per_iteration) + { + const auto src_vec = wrapper::vloadq(in_ptr_mover); + //Load Cin weights + for (int k = 0; k < num_elems_read_per_iteration; + ++k, weights_ptr_mover += index_c_out_end) + { + w_vec = wrapper::vsetlane(*weights_ptr_mover, w_vec, k); + } + out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec); + } + out_temp += vreduce(out_temp_vec); + for (; index_c_in < index_c_in_end; + ++index_c_in, ++in_ptr_mover, weights_ptr_mover += index_c_out_end) + { + const auto src_val = *(in_ptr_mover); + const auto w_val = *(weights_ptr_mover); + out_temp += src_val * w_val; + } + } + } + } + *(reinterpret_cast<T *>(out_ptr + id_w[0])) = + (biases_ptr != nullptr) ? out_temp + biases_ptr[id_w[0]] : out_temp; + }, + wei); + }, + out); +} + +} // namespace cpu +} // namespace arm_compute +#endif // SRC_CORE_NEON_KERNELS_CONV3D_LIST_H diff --git a/src/cpu/kernels/conv3d/neon/quantized.h b/src/cpu/kernels/conv3d/neon/quantized.h new file mode 100644 index 0000000000..f0fc9b5a71 --- /dev/null +++ b/src/cpu/kernels/conv3d/neon/quantized.h @@ -0,0 +1,275 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef SRC_CORE_NEON_KERNELS_CONV3D_QUANTIZED_H +#define SRC_CORE_NEON_KERNELS_CONV3D_QUANTIZED_H + +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/Traits.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/runtime/FunctionDescriptors.h" + +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/wrapper/wrapper.h" + +namespace arm_compute +{ +namespace cpu +{ +template <typename T> +void directconv3d_quantized_neon_ndhwc(const ITensor *src0, + const ITensor *src1, + const ITensor *src2, + ITensor *dst, + const Conv3dInfo &conv_info, + const Window &window) +{ + const ITensor *src = src0; + const ITensor *weights = src1; + const ITensor *biases = src2; + + using vtype = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>; + using vector_type = typename vtype::type; + using tag_type = typename vtype::tag_type; + constexpr int num_elems_read_per_iteration = 16 / sizeof(T); + using q16_t = typename wrapper::traits::promote_t<T>; + using q32_t = typename wrapper::traits::promote_t<q16_t>; + using q32x4_t = typename wrapper::traits::neon_vector<q32_t, 4>::type; + + const int32_t input_offset = -src->info()->quantization_info().uniform().offset; + const float input_scale = src->info()->quantization_info().uniform().scale; + const int32_t weights_offset = -weights->info()->quantization_info().uniform().offset; + const float weights_scale = weights->info()->quantization_info().uniform().scale; + const int32_t output_offset = dst->info()->quantization_info().uniform().offset; + const float output_scale = dst->info()->quantization_info().uniform().scale; + + int32_t output_multiplier = 0; + int32_t output_shift = 0; + const float multiplier = input_scale * weights_scale / output_scale; + arm_compute::quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift); + + // Scalar quantities (N D H W Cin) + const int element_size = src->info()->element_size(); + const int input_stride_w = src->info()->strides_in_bytes().y() / element_size; + const int input_stride_h = src->info()->strides_in_bytes().z() / element_size; + const int input_stride_d = src->info()->strides_in_bytes()[3] / element_size; + const int input_stride_n = src->info()->strides_in_bytes()[4] / element_size; + const int input_dim_w = src->info()->dimension(1); + const int input_dim_h = src->info()->dimension(2); + const int input_dim_d = src->info()->dimension(3); + + // Kernel info (D H W Cin Cout) + const unsigned int kernel_stride_w = weights->info()->strides_in_bytes()[2] / element_size; + const unsigned int kernel_stride_h = weights->info()->strides_in_bytes()[3] / element_size; + const unsigned int kernel_stride_d = weights->info()->strides_in_bytes()[4] / element_size; + const int kernel_dim_w = weights->info()->dimension(2); + const int kernel_dim_h = weights->info()->dimension(3); + const int kernel_dim_d = weights->info()->dimension(4); + + // Convolution padding and stride + const int conv_pad_top = conv_info.padding.top; + const int conv_pad_left = conv_info.padding.left; + const int conv_pad_front = conv_info.padding.front; + const int conv_stride_w = conv_info.stride.width; + const int conv_stride_h = conv_info.stride.height; + const int conv_stride_d = conv_info.stride.depth; + + // Setup input window for the output iterator + Window window_out = window; + window_out.set(Window::DimX, Window::Dimension(0, 1, 1)); + + // Setup input window for the weights 
iterator + Window window_w = calculate_max_window(*weights->info(), Steps()); + window_w.set(Window::DimY, Window::Dimension(0, 1, 1)); + window_w.set(Window::DimZ, Window::Dimension(0, 1, 1)); + window_w.set(Window::DimW, Window::Dimension(0, 1, 1)); + window_w.set(4, Window::Dimension(0, 1, 1)); + + Iterator out(dst, window_out); + Iterator wei(weights, window_w); + + const int32_t *biases_ptr = nullptr; + if (biases != nullptr) + { + biases_ptr = reinterpret_cast<int32_t *>(biases->buffer() + biases->info()->offset_first_element_in_bytes()); + } + execute_window_loop( + window_out, + [&](const Coordinates &id) + { + // We are computing the theoretical input starting points + const int in_w_start_t = static_cast<int>(id.y()) * conv_stride_w - conv_pad_left; + const int in_h_start_t = static_cast<int>(id.z()) * conv_stride_h - conv_pad_top; + const int in_d_start_t = static_cast<int>(id[3]) * conv_stride_d - conv_pad_front; + const int in_w_end_t = in_w_start_t + kernel_dim_w; + const int in_h_end_t = in_h_start_t + kernel_dim_h; + const int in_d_end_t = in_d_start_t + kernel_dim_d; + + // We are computing the valid initial and ending input points by checking the borders + const int in_w_start = std::max(in_w_start_t, 0); + const int in_h_start = std::max(in_h_start_t, 0); + const int in_d_start = std::max(in_d_start_t, 0); + const int in_w_end = std::min(in_w_end_t, input_dim_w); + const int in_h_end = std::min(in_h_end_t, input_dim_h); + const int in_d_end = std::min(in_d_end_t, input_dim_d); + + // We use the input points to select the valid weight points to use + const int wei_w_start = in_w_start - in_w_start_t; + const int wei_h_start = in_h_start - in_h_start_t; + const int wei_d_start = in_d_start - in_d_start_t; + const int wei_w_end = kernel_dim_w - (in_w_end_t - in_w_end); + const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end); + const int wei_d_end = kernel_dim_d - (in_d_end_t - in_d_end); + + const int index_c_out_end = weights->info()->dimension(0); + const int index_c_in_end = weights->info()->dimension(1); + const T *const in_ptr_start = + reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) + + id[4] * input_stride_n; + + execute_window_loop( + window_w, + [&](const Coordinates &id_w) + { + /* + * This is the loop in the weights, and it goes along OFM (output feature map) + */ + const auto weights_ptr_start = reinterpret_cast<const T *>(wei.ptr()); + int32_t acc = static_cast<int32_t>(0); + T *out_ptr = reinterpret_cast<T *>(out.ptr()); + for (int index_wei_d = wei_d_start, index_in_d = in_d_start; index_wei_d < wei_d_end; + ++index_wei_d, ++index_in_d) + { + const auto in_ptr_d = in_ptr_start + index_in_d * input_stride_d; + const auto weights_ptr_d = weights_ptr_start + index_wei_d * kernel_stride_d; + for (int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; + ++index_wei_h, ++index_in_h) + { + const T *const in_ptr_row = in_ptr_d + index_in_h * input_stride_h; + const T *const weights_ptr_row = weights_ptr_d + index_wei_h * kernel_stride_h; + for (int index_wei_w = wei_w_start, index_in_w = in_w_start; index_wei_w < wei_w_end; + ++index_wei_w, ++index_in_w) + { + const T *in_ptr_mover = in_ptr_row + index_in_w * input_stride_w; + const T *weights_ptr_mover = weights_ptr_row + index_wei_w * kernel_stride_w; + int index_c_in = 0; + vector_type w_vec = wrapper::vdup_n(static_cast<T>(0), tag_type()); + + q32x4_t acc_q32_0 = wrapper::vdup_n(static_cast<q32_t>(0), tag_type()); + q32x4_t acc_q32_1 = 
wrapper::vdup_n(static_cast<q32_t>(0), tag_type()); + q32x4_t acc_q32_2 = wrapper::vdup_n(static_cast<q32_t>(0), tag_type()); + q32x4_t acc_q32_3 = wrapper::vdup_n(static_cast<q32_t>(0), tag_type()); + + for (; index_c_in <= index_c_in_end - num_elems_read_per_iteration; + index_c_in += num_elems_read_per_iteration, + in_ptr_mover += num_elems_read_per_iteration) + { + const auto src_vec = wrapper::vloadq(in_ptr_mover); + //Load Cin weights + for (int k = 0; k < num_elems_read_per_iteration; + ++k, weights_ptr_mover += index_c_out_end) + { + w_vec = wrapper::vsetlane(*weights_ptr_mover, w_vec, k); + } + q32x4_t src_q32_0 = wrapper::vdup_n(static_cast<q32_t>(input_offset), tag_type()); + q32x4_t src_q32_1 = wrapper::vdup_n(static_cast<q32_t>(input_offset), tag_type()); + q32x4_t src_q32_2 = wrapper::vdup_n(static_cast<q32_t>(input_offset), tag_type()); + q32x4_t src_q32_3 = wrapper::vdup_n(static_cast<q32_t>(input_offset), tag_type()); + + q32x4_t wei_q32_0 = wrapper::vdup_n(static_cast<q32_t>(weights_offset), tag_type()); + q32x4_t wei_q32_1 = wrapper::vdup_n(static_cast<q32_t>(weights_offset), tag_type()); + q32x4_t wei_q32_2 = wrapper::vdup_n(static_cast<q32_t>(weights_offset), tag_type()); + q32x4_t wei_q32_3 = wrapper::vdup_n(static_cast<q32_t>(weights_offset), tag_type()); + + const auto src_q16_0 = wrapper::vmovl(wrapper::vgetlow(src_vec)); + const auto src_q16_1 = wrapper::vmovl(wrapper::vgethigh(src_vec)); + const auto wei_q16_0 = wrapper::vmovl(wrapper::vgetlow(w_vec)); + const auto wei_q16_1 = wrapper::vmovl(wrapper::vgethigh(w_vec)); + + src_q32_0 = wrapper::vadd(src_q32_0, wrapper::vmovl(wrapper::vgetlow(src_q16_0))); + src_q32_1 = wrapper::vadd(src_q32_1, wrapper::vmovl(wrapper::vgethigh(src_q16_0))); + src_q32_2 = wrapper::vadd(src_q32_2, wrapper::vmovl(wrapper::vgetlow(src_q16_1))); + src_q32_3 = wrapper::vadd(src_q32_3, wrapper::vmovl(wrapper::vgethigh(src_q16_1))); + + wei_q32_0 = wrapper::vadd(wei_q32_0, wrapper::vmovl(wrapper::vgetlow(wei_q16_0))); + wei_q32_1 = wrapper::vadd(wei_q32_1, wrapper::vmovl(wrapper::vgethigh(wei_q16_0))); + wei_q32_2 = wrapper::vadd(wei_q32_2, wrapper::vmovl(wrapper::vgetlow(wei_q16_1))); + wei_q32_3 = wrapper::vadd(wei_q32_3, wrapper::vmovl(wrapper::vgethigh(wei_q16_1))); + + acc_q32_0 = wrapper::vmla(acc_q32_0, wei_q32_0, src_q32_0); + acc_q32_1 = wrapper::vmla(acc_q32_1, wei_q32_1, src_q32_1); + acc_q32_2 = wrapper::vmla(acc_q32_2, wei_q32_2, src_q32_2); + acc_q32_3 = wrapper::vmla(acc_q32_3, wei_q32_3, src_q32_3); + } +#if defined(__aarch64__) + acc += wrapper::vaddv(acc_q32_0); + acc += wrapper::vaddv(acc_q32_1); + acc += wrapper::vaddv(acc_q32_2); + acc += wrapper::vaddv(acc_q32_3); +#else // __aarch64__ + auto temp = wrapper::vpadd(wrapper::vgethigh(acc_q32_0), wrapper::vgetlow(acc_q32_0)); + temp = wrapper::vpadd(temp, temp); + acc += wrapper::vgetlane(temp, 0); + + temp = wrapper::vpadd(wrapper::vgethigh(acc_q32_1), wrapper::vgetlow(acc_q32_1)); + temp = wrapper::vpadd(temp, temp); + acc += wrapper::vgetlane(temp, 0); + + temp = wrapper::vpadd(wrapper::vgethigh(acc_q32_2), wrapper::vgetlow(acc_q32_2)); + temp = wrapper::vpadd(temp, temp); + acc += wrapper::vgetlane(temp, 0); + + temp = wrapper::vpadd(wrapper::vgethigh(acc_q32_3), wrapper::vgetlow(acc_q32_3)); + temp = wrapper::vpadd(temp, temp); + acc += wrapper::vgetlane(temp, 0); + +#endif // __aarch64__ + + for (; index_c_in < index_c_in_end; + ++index_c_in, ++in_ptr_mover, weights_ptr_mover += index_c_out_end) + { + const auto src_val = *(in_ptr_mover) + input_offset; + const auto 
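// (On AArch64 the vaddv reductions above collapse each q32x4 accumulator directly into the
//  scalar acc; on 32-bit Arm the same reduction is done with pairwise vpadd steps.)
// The per-channel loop that follows handles the leftover channels when the input channel count
// is not a multiple of the vector width, using plain widened integer arithmetic.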
w_val = *(weights_ptr_mover) + weights_offset; + acc += src_val * w_val; + } + } + } + } + + if (biases) + { + acc += *reinterpret_cast<const int32_t *>(biases_ptr + id_w[0]); + } + + T out_val = + finalize_quantization(acc, output_multiplier, output_shift, output_offset, T(0), T(0), false); + *(reinterpret_cast<T *>(out_ptr + id_w[0])) = out_val; + }, + wei); + }, + out); +} +} // namespace cpu +} // namespace arm_compute +#endif // SRC_CORE_NEON_KERNELS_CONV3D_QUANTIZED_H diff --git a/src/cpu/kernels/crop/generic/neon/crop_helper.h b/src/cpu/kernels/crop/generic/neon/crop_helper.h new file mode 100644 index 0000000000..8fb7ad2087 --- /dev/null +++ b/src/cpu/kernels/crop/generic/neon/crop_helper.h @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef SRC_CORE_NEON_KERNELS_CROP_CROP_HELPER_H +#define SRC_CORE_NEON_KERNELS_CROP_CROP_HELPER_H + +#include "src/core/NEON/wrapper/wrapper.h" + +namespace arm_compute +{ +namespace cpu +{ +template <typename T> +inline float32x4_t load_as_f32(T *ptr) +{ + ARM_COMPUTE_UNUSED(ptr); + ARM_COMPUTE_ERROR("Type not supported."); +} + +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) +template <> +inline float32x4_t load_as_f32(float16_t *ptr) +{ + return vcvt_f32_f16(wrapper::vload(ptr)); +} +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ + +template <> +inline float32x4_t load_as_f32(float *ptr) +{ + return wrapper::vloadq(ptr); +} + +template <> +inline float32x4_t load_as_f32(int32_t *ptr) +{ + return vcvtq_f32_s32(wrapper::vloadq(ptr)); +} + +template <> +inline float32x4_t load_as_f32(uint32_t *ptr) +{ + return vcvtq_f32_u32(wrapper::vloadq(ptr)); +} + +template <> +inline float32x4_t load_as_f32(int16_t *ptr) +{ + return vcvtq_f32_s32(vmovl_s16(wrapper::vload(ptr))); +} + +template <> +inline float32x4_t load_as_f32(uint16_t *ptr) +{ + return vcvtq_f32_u32(vmovl_u16(wrapper::vload(ptr))); +} + +template <> +inline float32x4_t load_as_f32(uint8_t *ptr) +{ + return vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(wrapper::vload(ptr))))); +} +} // namespace cpu +} // namespace arm_compute + +#endif //SRC_CORE_NEON_KERNELS_CROP_CROP_HELPER_H diff --git a/src/cpu/kernels/crop/generic/neon/fp16.cpp b/src/cpu/kernels/crop/generic/neon/fp16.cpp new file mode 100644 index 0000000000..3739c9d4e0 --- /dev/null +++ b/src/cpu/kernels/crop/generic/neon/fp16.cpp @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + +#include "src/cpu/kernels/crop/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void fp16_in_bounds_crop_window(const ITensor *input, + const ITensor *output, + float *output_ptr, + Coordinates input_offset, + int32_t window_step_x, + int32_t output_width_start, + int32_t output_width_limit, + bool input_has_single_channel, + bool is_width_flipped) +{ + return in_bounds_crop_window<float16_t>(input, output, output_ptr, input_offset, window_step_x, output_width_start, + output_width_limit, input_has_single_channel, is_width_flipped); +} +} // namespace cpu +} // namespace arm_compute +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/crop/generic/neon/fp32.cpp b/src/cpu/kernels/crop/generic/neon/fp32.cpp new file mode 100644 index 0000000000..f665c3652c --- /dev/null +++ b/src/cpu/kernels/crop/generic/neon/fp32.cpp @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/cpu/kernels/crop/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void fp32_in_bounds_crop_window(const ITensor *input, + const ITensor *output, + float *output_ptr, + Coordinates input_offset, + int32_t window_step_x, + int32_t output_width_start, + int32_t output_width_limit, + bool input_has_single_channel, + bool is_width_flipped) +{ + return in_bounds_crop_window<float32_t>(input, output, output_ptr, input_offset, window_step_x, output_width_start, + output_width_limit, input_has_single_channel, is_width_flipped); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/crop/generic/neon/impl.h b/src/cpu/kernels/crop/generic/neon/impl.h new file mode 100644 index 0000000000..b90ba9ddbf --- /dev/null +++ b/src/cpu/kernels/crop/generic/neon/impl.h @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2018-2021, 2023 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CORE_NEON_KERNELS_CROP_IMPL_H +#define SRC_CORE_NEON_KERNELS_CROP_IMPL_H + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" + +#include "src/core/common/Registrars.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/kernels/crop/generic/neon/crop_helper.h" + +namespace arm_compute +{ +namespace cpu +{ +template <typename T> +void in_bounds_crop_window(const ITensor *input, + const ITensor *output, + float *output_ptr, + Coordinates input_offset, + int32_t window_step_x, + int32_t output_width_start, + int32_t output_width_limit, + bool input_has_single_channel, + bool is_width_flipped) +{ + // Reverse elements if width flipped. + if (is_width_flipped) + { + // Collapse first dimension if possible. + if (input_has_single_channel) + { + int32_t x = output_width_start; + Coordinates negative_offset(input_offset); + negative_offset.set(1, negative_offset[1] - window_step_x + 1); + for (; x <= output_width_limit - window_step_x; x += window_step_x, negative_offset[1] -= window_step_x) + { + auto in = load_as_f32(reinterpret_cast<T *>(input->ptr_to_element(negative_offset))); + + in = wrapper::vrev64(in); + in = wrapper::vcombine(wrapper::vgethigh(in), wrapper::vgetlow(in)); + + wrapper::vstore(output_ptr + x, in); + } + input_offset[1] = negative_offset[1] + window_step_x - 1; + for (; x < output_width_limit; ++x, --input_offset[1]) + { + *(output_ptr + x) = static_cast<float>(*reinterpret_cast<T *>(input->ptr_to_element(input_offset))); + } + } + else + { + for (int32_t x = output_width_start; x < output_width_limit; ++x, --input_offset[1]) + { + input_offset.set(0, 0); + int32_t c = 0; + for (; c <= static_cast<int32_t>(input->info()->dimension(0)) - window_step_x; + c += window_step_x, input_offset[0] += window_step_x) + { + auto in = load_as_f32(reinterpret_cast<T *>(input->ptr_to_element(input_offset))); + wrapper::vstore(output_ptr + x * output->info()->dimension(0) + c, in); + } + for (; c < static_cast<int32_t>(input->info()->dimension(0)); ++c, ++input_offset[0]) + { + *(output_ptr + x * output->info()->dimension(0) + c) = + static_cast<float>(*reinterpret_cast<T *>(input->ptr_to_element(input_offset))); + } + } + } + } + else + { + // Use memcpy if the elements don't need converting to float. 
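// Every crop kernel writes FP32 output regardless of the input type: the typed load_as_f32()
// overloads in crop_helper.h widen and convert four lanes at a time (for example, uint8_t goes
// through vmovl_u8 / vmovl_u16 / vcvtq_f32_u32), and the scalar tails fall back to static_cast.
// When T is already float no conversion is needed, so the contiguous row range can be copied
// with a single memcpy; crop preserves the channel count, so the row is contiguous in both the
// input and the output tensor.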
+ if (std::is_same<T, float>::value) + { + memcpy(static_cast<void *>(output_ptr + output_width_start * output->info()->dimension(0)), + reinterpret_cast<const void *>(input->ptr_to_element(input_offset)), + (output_width_limit - output_width_start) * output->info()->dimension(0) * + output->info()->element_size()); + } + else + { + int32_t x = 0; + int32_t limit = + (output_width_limit - output_width_start) * static_cast<int32_t>(output->info()->dimension(0)); + float *output_start_ptr = output_ptr + output_width_start * output->info()->dimension(0); + for (; x <= limit - window_step_x; x += window_step_x, input_offset[0] += window_step_x) + { + auto in = load_as_f32(reinterpret_cast<T *>(input->ptr_to_element(input_offset))); + wrapper::vstore(output_start_ptr + x, in); + } + for (; x < limit; ++x, ++input_offset[0]) + { + *(output_start_ptr + x) = + static_cast<float>(*reinterpret_cast<T *>(input->ptr_to_element(input_offset))); + } + } + } +} +} // namespace cpu +} // namespace arm_compute +#endif //SRC_CORE_NEON_KERNELS_CROP_IMPL_H diff --git a/src/cpu/kernels/crop/generic/neon/integer.cpp b/src/cpu/kernels/crop/generic/neon/integer.cpp new file mode 100644 index 0000000000..602434f54f --- /dev/null +++ b/src/cpu/kernels/crop/generic/neon/integer.cpp @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2021-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "src/cpu/kernels/crop/generic/neon/impl.h" +#include "src/cpu/kernels/crop/list.h" + +namespace arm_compute +{ +namespace cpu +{ +void u8_in_bounds_crop_window(const ITensor *input, + const ITensor *output, + float *output_ptr, + Coordinates input_offset, + int32_t window_step_x, + int32_t output_width_start, + int32_t output_width_limit, + bool input_has_single_channel, + bool is_width_flipped) +{ + return in_bounds_crop_window<uint8_t>(input, output, output_ptr, input_offset, window_step_x, output_width_start, + output_width_limit, input_has_single_channel, is_width_flipped); +} + +void u16_in_bounds_crop_window(const ITensor *input, + const ITensor *output, + float *output_ptr, + Coordinates input_offset, + int32_t window_step_x, + int32_t output_width_start, + int32_t output_width_limit, + bool input_has_single_channel, + bool is_width_flipped) +{ + return in_bounds_crop_window<uint16_t>(input, output, output_ptr, input_offset, window_step_x, output_width_start, + output_width_limit, input_has_single_channel, is_width_flipped); +} + +void u32_in_bounds_crop_window(const ITensor *input, + const ITensor *output, + float *output_ptr, + Coordinates input_offset, + int32_t window_step_x, + int32_t output_width_start, + int32_t output_width_limit, + bool input_has_single_channel, + bool is_width_flipped) +{ + return in_bounds_crop_window<uint32_t>(input, output, output_ptr, input_offset, window_step_x, output_width_start, + output_width_limit, input_has_single_channel, is_width_flipped); +} + +void s8_in_bounds_crop_window(const ITensor *input, + const ITensor *output, + float *output_ptr, + Coordinates input_offset, + int32_t window_step_x, + int32_t output_width_start, + int32_t output_width_limit, + bool input_has_single_channel, + bool is_width_flipped) +{ + return in_bounds_crop_window<int8_t>(input, output, output_ptr, input_offset, window_step_x, output_width_start, + output_width_limit, input_has_single_channel, is_width_flipped); +} + +void s16_in_bounds_crop_window(const ITensor *input, + const ITensor *output, + float *output_ptr, + Coordinates input_offset, + int32_t window_step_x, + int32_t output_width_start, + int32_t output_width_limit, + bool input_has_single_channel, + bool is_width_flipped) +{ + return in_bounds_crop_window<int16_t>(input, output, output_ptr, input_offset, window_step_x, output_width_start, + output_width_limit, input_has_single_channel, is_width_flipped); +} + +void s32_in_bounds_crop_window(const ITensor *input, + const ITensor *output, + float *output_ptr, + Coordinates input_offset, + int32_t window_step_x, + int32_t output_width_start, + int32_t output_width_limit, + bool input_has_single_channel, + bool is_width_flipped) +{ + return in_bounds_crop_window<int32_t>(input, output, output_ptr, input_offset, window_step_x, output_width_start, + output_width_limit, input_has_single_channel, is_width_flipped); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/crop/list.h b/src/cpu/kernels/crop/list.h new file mode 100644 index 0000000000..9cb7726203 --- /dev/null +++ b/src/cpu/kernels/crop/list.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2021-2022 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CORE_NEON_KERNELS_CROP_LIST_H +#define SRC_CORE_NEON_KERNELS_CROP_LIST_H + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" + +#include "src/core/common/Registrars.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/kernels/crop/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +#define DECLARE_CROP_KERNEL(func_name) \ + void func_name(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset, \ + int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, \ + bool input_has_single_channel, bool is_width_flipped) + +DECLARE_CROP_KERNEL(fp16_in_bounds_crop_window); +DECLARE_CROP_KERNEL(fp32_in_bounds_crop_window); +DECLARE_CROP_KERNEL(s8_in_bounds_crop_window); +DECLARE_CROP_KERNEL(s16_in_bounds_crop_window); +DECLARE_CROP_KERNEL(s32_in_bounds_crop_window); +DECLARE_CROP_KERNEL(u8_in_bounds_crop_window); +DECLARE_CROP_KERNEL(u16_in_bounds_crop_window); +DECLARE_CROP_KERNEL(u32_in_bounds_crop_window); + +#undef DECLARE_CROP_KERNEL + +} // namespace cpu +} // namespace arm_compute +#endif //SRC_CORE_NEON_KERNELS_CROP_LIST_H diff --git a/src/cpu/kernels/depth_to_space/list.h b/src/cpu/kernels/depth_to_space/list.h new file mode 100644 index 0000000000..9d0cd1e740 --- /dev/null +++ b/src/cpu/kernels/depth_to_space/list.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef ACL_SRC_CPU_KERNELS_DEPTH_TO_SPACE_LIST_H +#define ACL_SRC_CPU_KERNELS_DEPTH_TO_SPACE_LIST_H + +#include <cstdint> + +namespace arm_compute +{ +namespace cpu +{ + +#define DECLARE_DEPTHTOSPACE_KERNEL(func_name) \ + void func_name(const uint8_t *src, uint8_t *dst, const uintptr_t src_shape[4], const uintptr_t src_strides[4], \ + const uintptr_t dst_strides[4], uintptr_t element_size, uintptr_t block_size) + +DECLARE_DEPTHTOSPACE_KERNEL(depth_to_space_nhwc_any); +DECLARE_DEPTHTOSPACE_KERNEL(depth_to_space_nchw_any); + +#undef DECLARE_DEPTHTOSPACE_KERNEL + +} // namespace cpu +} // namespace arm_compute + +#endif // ACL_SRC_CPU_KERNELS_DEPTH_TO_SPACE_LIST_H diff --git a/src/cpu/kernels/depth_to_space/nchw/any/impl.cpp b/src/cpu/kernels/depth_to_space/nchw/any/impl.cpp new file mode 100644 index 0000000000..0277690112 --- /dev/null +++ b/src/cpu/kernels/depth_to_space/nchw/any/impl.cpp @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/core/Error.h" + +#include <cstdint> +#include <cstring> + +namespace arm_compute +{ +namespace cpu +{ + +void depth_to_space_nchw_any( // + const uint8_t *src, + uint8_t *dst, + const uintptr_t src_shape[4], + const uintptr_t src_strides[4], + const uintptr_t dst_strides[4], + uintptr_t element_size, + uintptr_t block_size) +{ + ARM_COMPUTE_ERROR_ON(src_strides[0] != element_size); + ARM_COMPUTE_ERROR_ON(dst_strides[0] != element_size); + + const auto dst_channels = src_shape[2] / (block_size * block_size); + const auto src_block_col_stride = dst_channels * src_strides[2]; + const auto src_block_row_stride = block_size * dst_channels * src_strides[2]; + + auto *src_batch_ptr = src; + auto *dst_batch_ptr = dst; + + for (uintptr_t batch = 0; batch < src_shape[3]; ++batch) + { + auto *src_channel_ptr = src_batch_ptr; + auto *dst_channel_ptr = dst_batch_ptr; + + for (uintptr_t channel = 0; channel < dst_channels; ++channel) + { + auto *src_height_block_ptr = src_channel_ptr; + auto *dst_row_ptr = dst_channel_ptr; + + for (uintptr_t height_block = 0; height_block < src_shape[1]; ++height_block) + { + auto *src_block_row_ptr = src_height_block_ptr; + + for (uintptr_t block_row = 0; block_row < block_size; ++block_row) + { + auto *src_width_block_ptr = src_block_row_ptr; + auto *dst_col_ptr = dst_row_ptr; + + for (uintptr_t width_block = 0; width_block < src_shape[0]; ++width_block) + { + auto *src_block_col_ptr = src_width_block_ptr; + + for (uintptr_t block_col = 0; block_col < block_size; ++block_col) + { + // The source pointer is accumulated as: + // + // src_block_col_ptr = + // src + + // batch * dst_strides[3] + + // (channel + (block_row * block_size + block_col) * dst_channels) * src_strides[2] + + // height_block * src_strides[1] + + // width_block * element_size; + // + // The destination pointer is accumuated as: + // + // dst_col_ptr = + // dst + + // batch * dst_strides[3] + + // channel * dst_strides[2] + + // (height_block * block_size + block_row) * dst_strides[1] + + // (width_block * block_size + block_col) * element_size + + std::memcpy(dst_col_ptr, src_block_col_ptr, element_size); + + src_block_col_ptr += src_block_col_stride; + dst_col_ptr += element_size; + } + + src_width_block_ptr += element_size; + } + + src_block_row_ptr += src_block_row_stride; + dst_row_ptr += dst_strides[1]; + } + + src_height_block_ptr += src_strides[1]; + } + + src_channel_ptr += src_strides[2]; + dst_channel_ptr += dst_strides[2]; + } + + src_batch_ptr += src_strides[3]; + dst_batch_ptr += dst_strides[3]; + } +} + +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/depth_to_space/nhwc/any/impl.cpp b/src/cpu/kernels/depth_to_space/nhwc/any/impl.cpp new file mode 100644 index 0000000000..b1c84599dc --- /dev/null +++ b/src/cpu/kernels/depth_to_space/nhwc/any/impl.cpp @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2023 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/Error.h" + +#include <cstdint> +#include <cstring> + +namespace arm_compute +{ +namespace cpu +{ + +void depth_to_space_nhwc_any( // + const uint8_t *src, + uint8_t *dst, + const uintptr_t src_shape[4], + const uintptr_t src_strides[4], + const uintptr_t dst_strides[4], + uintptr_t element_size, + uintptr_t block_size) +{ + ARM_COMPUTE_ERROR_ON(src_strides[0] != element_size); + ARM_COMPUTE_ERROR_ON(dst_strides[0] != element_size); + + const auto src_block_row_stride = (src_shape[0] / block_size) * element_size; + const auto dst_width_block_stride = block_size * dst_strides[1]; + + auto *src_batch_ptr = src; + auto *dst_batch_ptr = dst; + + for (uintptr_t batch = 0; batch < src_shape[3]; ++batch) + { + auto *src_height_block_ptr = src_batch_ptr; + auto *dst_row_ptr = dst_batch_ptr; + + for (uintptr_t height_block = 0; height_block < src_shape[2]; ++height_block) + { + auto *src_block_row_ptr = src_height_block_ptr; + + for (uintptr_t block_row = 0; block_row < block_size; ++block_row) + { + auto *src_width_block_ptr = src_block_row_ptr; + auto *dst_width_block_ptr = dst_row_ptr; + + for (uintptr_t width_block = 0; width_block < src_shape[1]; ++width_block) + { + // The source pointer is accumulated as: + // + // src_width_block_ptr = + // src + + // batch * src_strides[3] + + // height_block * src_strides[2] + + // width_block * src_strides[1] + + // block_row * (src_shape[0] / block_size) * element_size; + // + // The destination pointer is accumulated as: + // + // dst_width_block_ptr = + // dst + + // batch * dst_strides[3] + + // (height_block * block_size + block_row) * dst_strides[2] + + // width_block * block_size * dst_strides[1]; + + std::memcpy(dst_width_block_ptr, src_width_block_ptr, src_block_row_stride); + + src_width_block_ptr += src_strides[1]; + dst_width_block_ptr += dst_width_block_stride; + } + + src_block_row_ptr += src_block_row_stride; + dst_row_ptr += dst_strides[2]; + } + + src_height_block_ptr += src_strides[2]; + } + + src_batch_ptr += src_strides[3]; + dst_batch_ptr += dst_strides[3]; + } +} + +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/depthwiseconv2d/generic/neon/fp16.cpp b/src/cpu/kernels/depthwiseconv2d/generic/neon/fp16.cpp new file mode 100644 index 0000000000..293e606d81 --- /dev/null +++ b/src/cpu/kernels/depthwiseconv2d/generic/neon/fp16.cpp @@ -0,0 +1,44 @@ +/* + * Copyright 
(c) 2022-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#include "src/cpu/CpuTypes.h" +#include "src/cpu/kernels/depthwiseconv2d/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void neon_fp16_deptwiseconv2dnative(const ITensor *src, + const ITensor *weights, + const ITensor *bias, + ITensor *dst, + const Window &window, + bool has_biases, + const ConvolutionInfo &info) +{ + return run_depthwise_float<float16_t, float16_t>(src, weights, bias, dst, window, has_biases, info); +} +} // namespace cpu +} // namespace arm_compute +#endif //__ARM_FEATURE_FP16_VECTOR_ARITHMETIC diff --git a/src/cpu/kernels/depthwiseconv2d/generic/neon/fp32.cpp b/src/cpu/kernels/depthwiseconv2d/generic/neon/fp32.cpp new file mode 100644 index 0000000000..c6fa4790b7 --- /dev/null +++ b/src/cpu/kernels/depthwiseconv2d/generic/neon/fp32.cpp @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/cpu/kernels/depthwiseconv2d/generic/neon/impl.h" +namespace arm_compute +{ +namespace cpu +{ +void neon_fp32_deptwiseconv2dnative(const ITensor *src, + const ITensor *weights, + const ITensor *bias, + ITensor *dst, + const Window &window, + bool has_biases, + const ConvolutionInfo &info) +{ + return run_depthwise_float<float, float>(src, weights, bias, dst, window, has_biases, info); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp b/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp new file mode 100644 index 0000000000..d08e973968 --- /dev/null +++ b/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp @@ -0,0 +1,641 @@ +/* + * Copyright (c) 2019-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/cpu/kernels/depthwiseconv2d/generic/neon/impl.h" + +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/function_info/ConvolutionInfo.h" + +#include "src/core/NEON/wrapper/wrapper.h" + +namespace arm_compute +{ +namespace cpu +{ +inline int32x4_t saturating_doubling_high_mul(const int32x4_t &a, const int32_t &b) +{ + return vqrdmulhq_n_s32(a, b); +} + +inline int32_t saturating_doubling_high_mul(const int32_t &a, const int32_t &b) +{ + return vget_lane_s32(vqrdmulh_n_s32(vdup_n_s32(a), b), 0); +} + +inline int32x4_t rounding_divide_by_exp2(const int32x4_t &x, const int exponent) +{ + const int32x4_t shift = vdupq_n_s32(-exponent); + const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift), 31); + const int32x4_t fixed = vqaddq_s32(x, fixup); + return vrshlq_s32(fixed, shift); +} + +inline int32x2_t rounding_divide_by_exp2(const int32x2_t &x, const int exponent) +{ + const int32x2_t shift = vdup_n_s32(-exponent); + const int32x2_t fixup = vshr_n_s32(vand_s32(x, shift), 31); + const int32x2_t fixed = vqadd_s32(x, fixup); + return vrshl_s32(fixed, shift); +} + +inline int32_t rounding_divide_by_exp2(const int32_t &x, const int exponent) +{ + const int32x2_t xs = vdup_n_s32(x); + return vget_lane_s32(rounding_divide_by_exp2(xs, exponent), 0); +} + +namespace +{ +template <typename T, typename TW> +void depthwise_loop_multiplier1_quantized(const ITensor *src, + const ITensor *weights, + const ITensor *biases, + ITensor *dst, + const PadStrideInfo &conv_info, + const Size2D &dilation, + std::vector<int> output_multiplier, + std::vector<int> output_shift, + const Window &window, + bool has_biases) // NOLINT +{ + ARM_COMPUTE_UNUSED(output_multiplier, output_shift); + constexpr auto element_per_vector = vector_size / sizeof(T); + using VectorType = typename wrapper::traits::neon_vector<T, element_per_vector>::type; + using TagType = typename wrapper::traits::neon_vector<T, element_per_vector>::tag_type; + using AccType = int32_t; + using AccArrayType = std::array<AccType, element_per_vector>; + + const auto out_of_bound_value = + PixelValue(static_cast<uint64_t>(0), src->info()->data_type(), src->info()->quantization_info()).get<T>(); + const auto out_of_bound_vector = wrapper::vdup_n(static_cast<T>(out_of_bound_value), TagType{}); + + const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window); + + const int32_t input_qoffset = src->info()->quantization_info().uniform().offset; + const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset; + const int32_t output_qoffset = dst->info()->quantization_info().uniform().offset; + const int32_t k_offset = run_info.weights_width * run_info.weights_height * input_qoffset * weights_qoffset; + + Window execution_window = window; + execution_window.set(Window::DimX, dim_single_unit_step); + + Window win_input = window; + win_input.set(Window::DimX, dim_manual_loop); + win_input.set(Window::DimY, dim_manual_loop); + win_input.set(Window::DimZ, dim_manual_loop); + + Window win_weights = win_input; + win_weights.set(Window::DimW, dim_manual_loop); + + Window win_output = window; + win_output.set(Window::DimX, dim_manual_loop); + + Iterator input_it(src, win_input); + Iterator weights_it(weights, win_weights); + Iterator output_it(dst, win_output); + Iterator biases_it{}; + + if (has_biases) + { + biases_it = Iterator(biases, win_weights); + } + + execute_window_loop( + execution_window, + [&](const Coordinates &id) + { + const int32_t input_y = 
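// Zero-point handling in this kernel: the raw products of the quantized values are summed first
// and the offsets are corrected afterwards, using the identity
//   sum((x - z_in) * (w - z_w)) = sum(x * w) - z_w * sum(x) - z_in * sum(w) + N * z_in * z_w
// which is why the loops below also track in_sum and we_sum per lane and finally add
// k_offset (= weights_width * weights_height * z_in * z_w).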
id.y() * run_info.conv_stride_x - run_info.conv_pad_left; + const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; + const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; + auto const base_weights_ptr = weights_it.ptr(); + size_t x = run_info.x_start; + + for (; x < run_info.x_leftover_start; x += run_info.x_step) + { + AccArrayType acc{}; + AccArrayType in_sum{}; + AccArrayType we_sum{}; + + auto weights_ptr = base_weights_ptr; + auto input_offset = base_input_offset; + + for (size_t h = 0; h < run_info.weights_height; ++h) + { + int64_t offs = input_offset + x * sizeof(T); + for (size_t w = 0; w < run_info.weights_width; ++w) + { + const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); + const auto input_vals = + is_valid_region + ? wrapper::vload(reinterpret_cast<T *>( + input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) + : out_of_bound_vector; + const auto weights_vals = + wrapper::vload(reinterpret_cast<TW *>(weights_ptr + w * run_info.weights_stride_y) + x); + + for (size_t i = 0; i < element_per_vector; ++i) + { + acc.at(i) += input_vals[i] * weights_vals[i]; + in_sum.at(i) += input_vals[i]; + we_sum.at(i) += weights_vals[i]; + } + + offs += dilation.x() * run_info.input_stride_y; + } + + weights_ptr += run_info.weights_stride_z; + input_offset += dilation.y() * run_info.input_stride_z; + } + + VectorType out_vals = wrapper::vdup_n(static_cast<T>(0), TagType{}); + for (size_t i = 0; i < element_per_vector; ++i) + { + acc.at(i) -= in_sum.at(i) * weights_qoffset; + acc.at(i) -= we_sum.at(i) * input_qoffset; + acc.at(i) += k_offset; + + if (has_biases) + { + acc.at(i) += *(reinterpret_cast<int32_t *>(biases_it.ptr() + i * sizeof(int32_t)) + x); + } + + const int32_t out_mul = output_multiplier.at(x + i); + const int32_t out_shift = output_shift.at(x + i); + if (out_shift < 0) + { + acc.at(i) = + saturating_doubling_high_mul(acc.at(i) * (1 << (-out_shift)), out_mul) + output_qoffset; + } + else + { + acc.at(i) = + rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(i), out_mul), out_shift) + + output_qoffset; + } + out_vals[i] = static_cast<T>(utility::clamp<AccType, T>(acc.at(i))); + } + + wrapper::vstore(reinterpret_cast<T *>(output_it.ptr()) + x, out_vals); + } + + // left-over + for (; x < run_info.x_end; ++x) + { + AccType acc = 0; + AccType in_sum = 0; + AccType we_sum = 0; + + auto weights_ptr = base_weights_ptr; + auto input_offset = base_input_offset; + + for (size_t h = 0; h < run_info.weights_height; ++h) + { + int64_t offs = input_offset + x * sizeof(T); + for (size_t w = 0; w < run_info.weights_width; ++w) + { + const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); + const auto input_val = + is_valid_region + ? 
*reinterpret_cast<T *>(input_it.ptr() + + std::min(static_cast<size_t>(offs), run_info.input_max_offset)) + : out_of_bound_value; + const auto weights_val = + *(reinterpret_cast<TW *>(weights_ptr + w * run_info.weights_stride_y) + x); + + acc += input_val * weights_val; + in_sum += input_val; + we_sum += weights_val; + + offs += dilation.x() * run_info.input_stride_y; + } + + weights_ptr += run_info.weights_stride_z; + input_offset += dilation.y() * run_info.input_stride_z; + } + + T out_vals{0}; + + acc -= in_sum * weights_qoffset; + acc -= we_sum * input_qoffset; + acc += k_offset; + + if (has_biases) + { + acc += *(reinterpret_cast<int32_t *>(biases_it.ptr()) + x); + } + + const int32_t out_mul = output_multiplier.at(x); + const int32_t out_shift = output_shift.at(x); + + if (out_shift < 0) + { + acc = saturating_doubling_high_mul(acc * (1 << (-out_shift)), out_mul) + output_qoffset; + } + else + { + acc = + rounding_divide_by_exp2(saturating_doubling_high_mul(acc, out_mul), out_shift) + output_qoffset; + } + + out_vals = static_cast<T>(utility::clamp<AccType, T>(acc)); + *(reinterpret_cast<T *>(output_it.ptr()) + x) = out_vals; + } + }, + input_it, weights_it, biases_it, output_it); +} + +template <typename T, typename TW> +void depthwise_loop_generic_quantized(const ITensor *src, + const ITensor *weights, + const ITensor *biases, + ITensor *dst, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier, + std::vector<int> output_multiplier, + std::vector<int> output_shift, + const Window &window, + bool has_biases) // NOLINT +{ + using AccType = int32_t; + + const auto run_info = + DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier); + + const auto out_of_bound_value = + PixelValue(static_cast<uint64_t>(0), src->info()->data_type(), src->info()->quantization_info()).get<T>(); + + const int32_t input_qoffset = src->info()->quantization_info().uniform().offset; + const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset; + const int32_t output_qoffset = dst->info()->quantization_info().uniform().offset; + const int32_t k_offset = run_info.weights_width * run_info.weights_height * input_qoffset * weights_qoffset; + + Window execution_window = window; + execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1)); + + Window win_input = execution_window; + win_input.set(Window::DimY, dim_manual_loop); + win_input.set(Window::DimZ, dim_manual_loop); + + Window win_weights = window; + win_weights.set_dimension_step(Window::DimX, run_info.x_step); + win_weights.set(Window::DimY, dim_manual_loop); + win_weights.set(Window::DimZ, dim_manual_loop); + win_weights.set(Window::DimW, dim_manual_loop); + + Window win_output = window; + win_output.set_dimension_step(Window::DimX, run_info.x_step); + + Iterator input_it(src, win_input); + Iterator weights_it(weights, win_weights); + Iterator output_it(dst, win_output); + Iterator biases_it{}; + + if (has_biases) + { + biases_it = Iterator(biases, win_weights); + } + + execute_window_loop( + execution_window, + [&](const Coordinates &id) + { + std::vector<AccType> acc(depth_multiplier, 0); + std::vector<AccType> we_sum(depth_multiplier, 0); + AccType in_sum = 0; + + const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; + const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; + int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; + + 
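// Generic path for depth_multiplier > 1: each input channel feeds depth_multiplier consecutive
// output channels, so the accumulators and the per-output weight sums are kept in small vectors
// indexed by the multiplier, while a single in_sum is shared because the same input value drives
// every one of those outputs.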
auto weights_ptr = weights_it.ptr(); + for (size_t h = 0; h < run_info.weights_height; ++h) + { + int offs = input_offset; + for (size_t w = 0; w < run_info.weights_width; ++w) + { + const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); + const auto input_val = + is_valid_region ? *(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), + run_info.input_max_offset))) + : out_of_bound_value; + + for (size_t m = 0; m < depth_multiplier; ++m) + { + const auto weights_val = + *(reinterpret_cast<TW *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y)); + acc.at(m) += input_val * weights_val; + + we_sum.at(m) += weights_val; + } + + offs += dilation.x() * run_info.input_stride_y; + in_sum += input_val; + } + + weights_ptr += run_info.weights_stride_z; + input_offset += dilation.y() * run_info.input_stride_z; + } + + for (size_t m = 0; m < depth_multiplier; ++m) + { + acc.at(m) -= in_sum * weights_qoffset; + acc.at(m) -= we_sum.at(m) * input_qoffset; + acc.at(m) += k_offset; + + if (has_biases) + { + acc.at(m) += *(reinterpret_cast<int32_t *>(biases_it.ptr() + m * sizeof(int32_t))); + } + + const int32_t out_mul = output_multiplier.at(id.x() * depth_multiplier + m); + const int32_t out_shift = output_shift.at(id.x() * depth_multiplier + m); + if (out_shift < 0) + { + acc.at(m) = saturating_doubling_high_mul(acc.at(m) * (1 << (-out_shift)), out_mul) + output_qoffset; + } + else + { + acc.at(m) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(m), out_mul), out_shift) + + output_qoffset; + } + *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = + static_cast<T>(utility::clamp<AccType, T>(acc.at(m))); + } + }, + input_it, weights_it, biases_it, output_it); +} + +template <typename T, typename TW> +void depthwise_loop_pow2_quantized_per_tensor(const ITensor *src, + const ITensor *weights, + const ITensor *biases, + ITensor *dst, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier, + std::vector<int> output_multiplier, + std::vector<int> output_shift, + const Window &window, + bool has_biases) // NOLINT +{ + constexpr int half_vec = vector_size / 2; + + using AccType = int32_t; + using AccVectorType = typename wrapper::traits::neon_vector<AccType, half_vec>::type; + using AccVectorTagType = typename wrapper::traits::neon_vector<AccType, half_vec>::tag_type; + using TagType = typename wrapper::traits::neon_vector<T, vector_size>::tag_type; + + const auto run_info = + DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier); + + const auto input_qoffset_vec = wrapper::vreinterpret( + wrapper::vmovl(wrapper::vdup_n(static_cast<T>(src->info()->quantization_info().uniform().offset), TagType{}))); + const auto weights_qoffset_vec = wrapper::vreinterpret(wrapper::vmovl( + wrapper::vdup_n(static_cast<TW>(weights->info()->quantization_info().uniform().offset), TagType{}))); + const auto output_qoffset_vec = wrapper::vdup_n(dst->info()->quantization_info().uniform().offset, + arm_compute::wrapper::traits::vector_128_tag{}); + + const auto lower = wrapper::vdup_n(static_cast<AccType>(std::numeric_limits<T>::lowest()), AccVectorTagType{}); + const auto upper = wrapper::vdup_n(static_cast<AccType>(std::numeric_limits<T>::max()), AccVectorTagType{}); + const auto zero = wrapper::vdup_n(static_cast<AccType>(0), AccVectorTagType{}); + + const auto out_mul = output_multiplier.at(0); + const auto out_shift = output_shift.at(0); + + Window 
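// This specialised path is only taken for per-tensor quantization (see
// run_depthwise_quanitized8bit below), so the single output_multiplier / output_shift pair read
// at index 0 above applies to every output channel, and the requantization constants can be
// hoisted out of the channel loop instead of being looked up per channel as in the generic path.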
execution_window = window; + execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1)); + + Window win_input = execution_window; + win_input.set(Window::DimY, dim_manual_loop); + win_input.set(Window::DimZ, dim_manual_loop); + + Window win_weights = window; + win_weights.set_dimension_step(Window::DimX, run_info.x_step); + win_weights.set(Window::DimY, dim_manual_loop); + win_weights.set(Window::DimZ, dim_manual_loop); + win_weights.set(Window::DimW, dim_manual_loop); + + Window win_output = window; + win_output.set_dimension_step(Window::DimX, run_info.x_step); + + Iterator input_it(src, win_input); + Iterator weights_it(weights, win_weights); + Iterator output_it(dst, win_output); + Iterator biases_it{}; + + if (has_biases) + { + biases_it = Iterator(biases, win_weights); + } + + std::vector<AccVectorType> acc0(depth_multiplier / vector_size); + std::vector<AccVectorType> acc1(depth_multiplier / vector_size); + + execute_window_loop( + execution_window, + [&](const Coordinates &id) + { + std::fill(begin(acc0), end(acc0), zero); + std::fill(begin(acc1), end(acc1), zero); + + const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; + const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; + int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; + + auto weights_ptr = weights_it.ptr(); + for (size_t h = 0; h < run_info.weights_height; ++h) + { + const int32_t current_h = input_z + h * dilation.y(); + if (current_h >= 0 && current_h < static_cast<int32_t>(run_info.input_height)) + { + int offs = input_offset; + for (size_t w = 0; w < run_info.weights_width; ++w) + { + const int32_t current_w = input_y + w * dilation.x(); + if (current_w >= 0 && current_w < static_cast<int32_t>(run_info.input_width)) + { + const auto input_8x8 = wrapper::vdup_n( + *(reinterpret_cast<T *>( + input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))), + TagType{}); + const auto input_s16x8 = wrapper::vreinterpret(wrapper::vmovl(input_8x8)); + const auto input_no_offs = wrapper::vsub(input_s16x8, input_qoffset_vec); + + for (size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i) + { + const auto weights_8x8 = wrapper::vload(reinterpret_cast<TW *>( + weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y)); + const auto weights_s16x8 = wrapper::vreinterpret(wrapper::vmovl(weights_8x8)); + const auto weights_no_offs = wrapper::vsub(weights_s16x8, weights_qoffset_vec); + + acc0.at(i) = wrapper::vmlal(acc0.at(i), wrapper::vgetlow(input_no_offs), + wrapper::vgetlow(weights_no_offs)); + acc1.at(i) = wrapper::vmlal(acc1.at(i), wrapper::vgethigh(input_no_offs), + wrapper::vgethigh(weights_no_offs)); + } + } + + offs += dilation.x() * run_info.input_stride_y; + } + } + + weights_ptr += run_info.weights_stride_z; + input_offset += dilation.y() * run_info.input_stride_z; + } + + for (size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i) + { + if (has_biases) + { + const auto bias_val0 = + wrapper::vloadq(reinterpret_cast<int32_t *>(biases_it.ptr() + m * sizeof(int32_t))); + const auto bias_val1 = wrapper::vloadq( + reinterpret_cast<int32_t *>(biases_it.ptr() + (m + half_vec) * sizeof(int32_t))); + + acc0.at(i) = wrapper::vadd(acc0.at(i), bias_val0); + acc1.at(i) = wrapper::vadd(acc1.at(i), bias_val1); + } + + if (out_shift < 0) + { + acc0.at(i) = wrapper::vadd(saturating_doubling_high_mul(acc0.at(i) * (1 << (-out_shift)), out_mul), + output_qoffset_vec); + 
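// saturating_doubling_high_mul(a, m) is a saturating vqrdmulh, i.e. roughly (a * m) / 2^31 with
// rounding, and rounding_divide_by_exp2 is a round-to-nearest right shift, so both branches of
// this if/else apply acc * M with M = out_mul / 2^31 * 2^(-out_shift); the negative-shift branch
// simply folds the corresponding left shift into the accumulator before the high multiply.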
acc1.at(i) = wrapper::vadd(saturating_doubling_high_mul(acc1.at(i) * (1 << (-out_shift)), out_mul), + output_qoffset_vec); + } + else + { + acc0.at(i) = wrapper::vadd( + rounding_divide_by_exp2(saturating_doubling_high_mul(acc0.at(i), out_mul), out_shift), + output_qoffset_vec); + acc1.at(i) = wrapper::vadd( + rounding_divide_by_exp2(saturating_doubling_high_mul(acc1.at(i), out_mul), out_shift), + output_qoffset_vec); + } + + acc0.at(i) = wrapper::vmin(wrapper::vmax(acc0.at(i), lower), upper); + acc1.at(i) = wrapper::vmin(wrapper::vmax(acc1.at(i), lower), upper); + + const auto out_val = wrapper::vcombine(wrapper::vmovn(acc0.at(i)), wrapper::vmovn(acc1.at(i))); + + if (std::is_same<T, uint8_t>::value) + { + wrapper::vstore(reinterpret_cast<uint8_t *>(output_it.ptr() + m * sizeof(uint8_t)), + wrapper::vqmovn(vreinterpretq_u16_s16(out_val))); + } + else + { + wrapper::vstore(reinterpret_cast<int8_t *>(output_it.ptr() + m * sizeof(int8_t)), + wrapper::vqmovn(out_val)); + } + } + }, + input_it, weights_it, biases_it, output_it); +} +} // namespace + +template <typename T, typename TW> +void run_depthwise_quanitized8bit(const ITensor *src, + const ITensor *weights, + const ITensor *biases, + ITensor *dst, + const Window &window, + bool has_biases, + const ConvolutionInfo &info) +{ + PadStrideInfo conv_info = info.pad_stride_info; + unsigned int depth_multiplier = info.depth_multiplier; + Size2D dilation = info.dilation; + std::vector<int> output_multiplier; + std::vector<int> output_shift; + + const auto input_scale = src->info()->quantization_info().uniform().scale; + const auto output_scale = dst->info()->quantization_info().uniform().scale; + auto weights_scale = weights->info()->quantization_info().scale(); + + if (!is_data_type_quantized_per_channel(weights->info()->data_type())) + { + for (size_t i = 1; i < weights->info()->dimension(channel_idx); ++i) + { + weights_scale.push_back(weights_scale.front()); + } + } + + for (const auto &s : weights_scale) + { + int32_t out_mult = 0; + int32_t out_shift = 0; + const float multiplier = input_scale * s / output_scale; + arm_compute::quantization::calculate_quantized_multiplier(multiplier, &out_mult, &out_shift); + + output_multiplier.push_back(out_mult); + output_shift.push_back(out_shift); + } + + if (depth_multiplier == 1) + { + depthwise_loop_multiplier1_quantized<T, TW>(src, weights, biases, dst, conv_info, dilation, output_multiplier, + output_shift, window, has_biases); + } + else + { + const bool is_pow2 = ((depth_multiplier & (depth_multiplier - 1)) == 0); + const bool is_quantized_per_tensor = !(is_data_type_quantized_per_channel(weights->info()->data_type())); + + if (is_pow2 && is_quantized_per_tensor && depth_multiplier >= 8) + { + depthwise_loop_pow2_quantized_per_tensor<T, TW>(src, weights, biases, dst, conv_info, dilation, + depth_multiplier, output_multiplier, output_shift, window, + has_biases); + } + else + { + depthwise_loop_generic_quantized<T, TW>(src, weights, biases, dst, conv_info, dilation, depth_multiplier, + output_multiplier, output_shift, window, has_biases); + } + } +} +template void run_depthwise_quanitized8bit<uint8_t, uint8_t>(const ITensor *src, + const ITensor *weights, + const ITensor *biases, + ITensor *dst, + const Window &window, + bool has_biases, + const ConvolutionInfo &info); +template void run_depthwise_quanitized8bit<int8_t, int8_t>(const ITensor *src, + const ITensor *weights, + const ITensor *biases, + ITensor *dst, + const Window &window, + bool has_biases, + const ConvolutionInfo &info); 
+template void run_depthwise_quanitized8bit<uint8_t, int8_t>(const ITensor *src, + const ITensor *weights, + const ITensor *biases, + ITensor *dst, + const Window &window, + bool has_biases, + const ConvolutionInfo &info); +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.h b/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.h new file mode 100644 index 0000000000..3fa5c58c3c --- /dev/null +++ b/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.h @@ -0,0 +1,374 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CORE_KERNELS_DEPTWISECONV2DNATIVE_IMPL_H +#define SRC_CORE_KERNELS_DEPTWISECONV2DNATIVE_IMPL_H +#include "arm_compute/core/Helpers.h" + +#include "src/core/NEON/wrapper/wrapper.h" + +namespace arm_compute +{ +struct ConvolutionInfo; + +namespace cpu +{ +constexpr auto data_layout = DataLayout::NHWC; +const size_t width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); +const size_t height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); +const size_t channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + +constexpr auto dim_manual_loop = Window::Dimension(0, 0, 0); +constexpr auto dim_single_unit_step = Window::Dimension(0, 1, 1); +constexpr size_t vector_size = 8; + +struct DepthwiseConvolutionRunInfo +{ + const size_t num_read_elements_per_iteration; + const uint32_t x_start; + const uint32_t x_end; + const uint32_t x_step; + const uint32_t x_leftover_start; + const size_t input_stride_y; + const size_t input_stride_z; + const size_t input_max_offset; + const size_t weights_width; + const size_t weights_height; + const size_t weights_stride_y; + const size_t weights_stride_z; + const size_t conv_stride_x; + const size_t conv_stride_y; + const size_t conv_pad_left; + const size_t conv_pad_top; + const size_t input_height; + const size_t input_width; + const size_t input_depth; + + DepthwiseConvolutionRunInfo(const ITensorInfo &input, + const ITensorInfo &weights, + const PadStrideInfo &conv_info, + const Window &w, + uint32_t depth_multiplier = 1) // NOLINT + : num_read_elements_per_iteration( + (depth_multiplier == 1 ? 
(vector_size / element_size_from_data_type(input.data_type())) : 1)), + x_start(w.x().start()), + x_end(w.x().end()), + x_step(static_cast<uint32_t>(num_read_elements_per_iteration * depth_multiplier)), + x_leftover_start(std::max(static_cast<int32_t>(w.x().end() + 1) - static_cast<int32_t>(x_step), int32_t(0))), + input_stride_y(input.strides_in_bytes().y()), + input_stride_z(input.strides_in_bytes().z()), + input_max_offset(input.strides_in_bytes().z() * input.dimension(height_idx) - + (input.padding().bottom + input.padding().top) * input.strides_in_bytes().y()), + weights_width(weights.dimension(width_idx)), + weights_height(weights.dimension(height_idx)), + weights_stride_y(weights.strides_in_bytes().y()), + weights_stride_z(weights.strides_in_bytes().z()), + conv_stride_x(conv_info.stride().first), + conv_stride_y(conv_info.stride().second), + conv_pad_left(conv_info.pad_left()), + conv_pad_top(conv_info.pad_top()), + input_height(input.dimension(height_idx)), + input_width(input.dimension(width_idx)), + input_depth(input.dimension(channel_idx)) + { + } +}; + +inline bool is_valid_input_region(int32_t base_w, + uint32_t base_h, + uint32_t w, + uint32_t h, + const DepthwiseConvolutionRunInfo &run_info, + const Size2D &dilation) +{ + const int32_t current_h = base_h + h * dilation.y(); + const bool is_valid_h = current_h >= 0 && current_h < static_cast<int32_t>(run_info.input_height); + + const int32_t current_w = base_w + w * dilation.x(); + const bool is_valid_w = current_w >= 0 && current_w < static_cast<int32_t>(run_info.input_width); + + return is_valid_h && is_valid_w; +} + +template <typename T> +void depthwise_loop_multiplier1_fp(const ITensor *src, + const ITensor *weights, + const ITensor *biases, + ITensor *dst, + const PadStrideInfo &conv_info, + const Size2D &dilation, + const Window &window, + bool has_biases) +{ + constexpr auto element_per_vector = vector_size / sizeof(T); + using VectorType = typename wrapper::traits::neon_vector<T, element_per_vector>::type; + using TagType = typename wrapper::traits::neon_vector<T, element_per_vector>::tag_type; + + const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window); + + const VectorType zero_vector = wrapper::vdup_n(static_cast<T>(0), TagType{}); + + Window execution_window = window; + execution_window.set(Window::DimX, dim_single_unit_step); + + Window win_input = window; + win_input.set(Window::DimX, dim_manual_loop); + win_input.set(Window::DimY, dim_manual_loop); + win_input.set(Window::DimZ, dim_manual_loop); + + Window win_weights = win_input; + win_weights.set(Window::DimW, dim_manual_loop); + + Window win_output = window; + win_output.set(Window::DimX, dim_manual_loop); + + Iterator input_it(src, win_input); + Iterator weights_it(weights, win_weights); + Iterator output_it(dst, win_output); + Iterator biases_it{}; + + if (has_biases) + { + biases_it = Iterator(biases, win_weights); + } + + execute_window_loop( + execution_window, + [&](const Coordinates &id) + { + const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; + const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; + const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; + + auto const base_weights_ptr = weights_it.ptr(); + uint32_t x = run_info.x_start; + + for (; x < run_info.x_leftover_start; x += run_info.x_step) + { + VectorType acc = zero_vector; + auto weights_ptr = base_weights_ptr; + int64_t input_offset = 
base_input_offset; + + for (uint32_t h = 0; h < run_info.weights_height; ++h) + { + int64_t offs = input_offset + x * sizeof(T); + for (uint32_t w = 0; w < run_info.weights_width; ++w) + { + const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); + const auto input_vals = + is_valid_region + ? wrapper::vload(reinterpret_cast<T *>( + input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) + : zero_vector; + const auto weights_vals = + wrapper::vload(reinterpret_cast<T *>(weights_ptr + w * run_info.weights_stride_y) + x); + acc = wrapper::vmla(acc, weights_vals, input_vals); + + offs += dilation.x() * run_info.input_stride_y; + } + + weights_ptr += run_info.weights_stride_z; + input_offset += dilation.y() * run_info.input_stride_z; + } + + if (has_biases) + { + const auto biases_vals = wrapper::vload(reinterpret_cast<T *>(biases_it.ptr()) + x); + acc = wrapper::vadd(acc, biases_vals); + } + + wrapper::vstore(reinterpret_cast<T *>(output_it.ptr()) + x, acc); + } + + for (; x < run_info.x_end; ++x) + { + auto acc_scalar = T{0}; + auto weights_ptr = base_weights_ptr; + int64_t input_offset = base_input_offset; + + for (size_t h = 0; h < run_info.weights_height; ++h) + { + int64_t offs = input_offset + x * sizeof(T); + for (size_t w = 0; w < run_info.weights_width; ++w) + { + const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); + const auto input_vals = + is_valid_region + ? *reinterpret_cast<T *>(input_it.ptr() + + std::min(static_cast<size_t>(offs), run_info.input_max_offset)) + : 0; + const auto weights_vals = + *(reinterpret_cast<T *>(weights_ptr + w * run_info.weights_stride_y) + x); + + acc_scalar += (input_vals * weights_vals); + + offs += dilation.x() * run_info.input_stride_y; + } + + weights_ptr += run_info.weights_stride_z; + input_offset += dilation.y() * run_info.input_stride_z; + } + + if (has_biases) + { + const auto biases_vals = *(reinterpret_cast<T *>(biases_it.ptr()) + x); + acc_scalar += biases_vals; + } + *(reinterpret_cast<T *>(output_it.ptr()) + x) = acc_scalar; + } + }, + input_it, weights_it, biases_it, output_it); +} + +template <typename T> +void depthwise_loop_generic_fp(const ITensor *src, + const ITensor *weights, + const ITensor *biases, + ITensor *dst, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier, + const Window &window, + bool has_biases) +{ + const auto run_info = + DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier); + + Window execution_window = window; + execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1)); + + Window win_input = execution_window; + win_input.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1)); + win_input.set(Window::DimY, dim_manual_loop); + win_input.set(Window::DimZ, dim_manual_loop); + + Window win_weights = window; + win_weights.set_dimension_step(Window::DimX, run_info.x_step); + win_weights.set(Window::DimY, dim_manual_loop); + win_weights.set(Window::DimZ, dim_manual_loop); + win_weights.set(Window::DimW, dim_manual_loop); + + Window win_output = window; + win_output.set_dimension_step(Window::DimX, run_info.x_step); + + Iterator input_it(src, win_input); + Iterator weights_it(weights, win_weights); + Iterator output_it(dst, win_output); + Iterator biases_it{}; + + if (has_biases) + { + biases_it = Iterator(biases, win_weights); + } + + execute_window_loop( + execution_window, + [&](const 
Coordinates &id) + { + std::vector<T> acc(depth_multiplier, static_cast<T>(0)); + + const int input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; + const int input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; + int input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; + + auto weights_ptr = weights_it.ptr(); + for (size_t h = 0; h < run_info.weights_height; ++h) + { + int offs = input_offset; + for (size_t w = 0; w < run_info.weights_width; ++w) + { + const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); + const auto input_val = + is_valid_region ? *(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), + run_info.input_max_offset))) + : T(0); + + for (size_t m = 0; m < depth_multiplier; ++m) + { + const auto weights_val = + *(reinterpret_cast<T *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y)); + acc.at(m) = support::cpp11::fma(weights_val, input_val, acc.at(m)); + } + + offs += dilation.x() * run_info.input_stride_y; + } + + weights_ptr += run_info.weights_stride_z; + input_offset += dilation.y() * run_info.input_stride_z; + } + + if (has_biases) + { + for (size_t m = 0; m < depth_multiplier; ++m) + { + const auto biases_val = *(reinterpret_cast<T *>(biases_it.ptr() + m * sizeof(T))); + *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = acc.at(m) + biases_val; + } + } + else + { + for (size_t m = 0; m < depth_multiplier; ++m) + { + *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = acc.at(m); + } + } + }, + input_it, weights_it, biases_it, output_it); +} + +template <typename T, typename TW> +void run_depthwise_float(const ITensor *src, + const ITensor *weights, + const ITensor *biases, + ITensor *dst, + const Window &window, + bool has_biases, + const ConvolutionInfo &info) +{ + PadStrideInfo conv_info = info.pad_stride_info; + unsigned int depth_multiplier = info.depth_multiplier; + Size2D dilation = info.dilation; + + if (depth_multiplier == 1) + { + depthwise_loop_multiplier1_fp<T>(src, weights, biases, dst, conv_info, dilation, window, has_biases); + } + else + { + depthwise_loop_generic_fp<T>(src, weights, biases, dst, conv_info, dilation, depth_multiplier, window, + has_biases); + } +} + +template <typename T, typename TW> +void run_depthwise_quanitized8bit(const ITensor *src, + const ITensor *weights, + const ITensor *biases, + ITensor *dst, + const Window &window, + bool has_biases, + const ConvolutionInfo &info); + +} // namespace cpu +} // namespace arm_compute +#endif //define SRC_CORE_KERNELS_DEPTWISECONV2DNATIVE_IMPL_H diff --git a/src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8.cpp b/src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8.cpp new file mode 100644 index 0000000000..d32847c1e8 --- /dev/null +++ b/src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8.cpp @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2022 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/depthwiseconv2d/generic/neon/impl.h" +namespace arm_compute +{ +namespace cpu +{ +void neon_qu8_deptwiseconv2dnative(const ITensor *src, + const ITensor *weights, + const ITensor *bias, + ITensor *dst, + const Window &window, + bool has_biases, + const ConvolutionInfo &info) +{ + return run_depthwise_quanitized8bit<uint8_t, uint8_t>(src, weights, bias, dst, window, has_biases, info); +} + +void neon_qp8_qu8_deptwiseconv2dnative(const ITensor *src, + const ITensor *weights, + const ITensor *bias, + ITensor *dst, + const Window &window, + bool has_biases, + const ConvolutionInfo &info) +{ + return run_depthwise_quanitized8bit<uint8_t, int8_t>(src, weights, bias, dst, window, has_biases, info); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8_signed.cpp new file mode 100644 index 0000000000..682fad0bda --- /dev/null +++ b/src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8_signed.cpp @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/cpu/kernels/depthwiseconv2d/generic/neon/impl.h" +namespace arm_compute +{ +namespace cpu +{ +void neon_qs8_deptwiseconv2dnative(const ITensor *src, + const ITensor *weights, + const ITensor *bias, + ITensor *dst, + const Window &window, + bool has_biases, + const ConvolutionInfo &info) +{ + return run_depthwise_quanitized8bit<int8_t, int8_t>(src, weights, bias, dst, window, has_biases, info); +} + +void neon_qp8_qs8_deptwiseconv2dnative(const ITensor *src, + const ITensor *weights, + const ITensor *bias, + ITensor *dst, + const Window &window, + bool has_biases, + const ConvolutionInfo &info) +{ + return run_depthwise_quanitized8bit<int8_t, int8_t>(src, weights, bias, dst, window, has_biases, info); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/depthwiseconv2d/list.h b/src/cpu/kernels/depthwiseconv2d/list.h new file mode 100644 index 0000000000..cf80608f4f --- /dev/null +++ b/src/cpu/kernels/depthwiseconv2d/list.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CORE_NEON_KERNELS_DEPTHWISECONV2D_LIST_H +#define SRC_CORE_NEON_KERNELS_DEPTHWISECONV2D_LIST_H +namespace arm_compute +{ +namespace cpu +{ +#define DECLARE_DEPTHWISECONV2D_KERNEL(func_name) \ + void func_name(const ITensor *src, const ITensor *weights, const ITensor *bias, ITensor *dst, \ + const Window &window, bool has_biases, const ConvolutionInfo &info) +DECLARE_DEPTHWISECONV2D_KERNEL(neon_qu8_deptwiseconv2dnative); +DECLARE_DEPTHWISECONV2D_KERNEL(neon_qs8_deptwiseconv2dnative); +DECLARE_DEPTHWISECONV2D_KERNEL(neon_fp16_deptwiseconv2dnative); +DECLARE_DEPTHWISECONV2D_KERNEL(neon_fp32_deptwiseconv2dnative); +DECLARE_DEPTHWISECONV2D_KERNEL(neon_qp8_qu8_deptwiseconv2dnative); +DECLARE_DEPTHWISECONV2D_KERNEL(neon_qp8_qs8_deptwiseconv2dnative); +#undef DECLARE_DEPTHWISECONV2D_KERNEL +} // namespace cpu +} // namespace arm_compute +#endif //SRC_CORE_NEON_KERNELS_DEPTHWISECONV2D_LIST_H diff --git a/src/cpu/kernels/dequantize/generic/neon/fp16.cpp b/src/cpu/kernels/dequantize/generic/neon/fp16.cpp new file mode 100644 index 0000000000..caffdf53e1 --- /dev/null +++ b/src/cpu/kernels/dequantize/generic/neon/fp16.cpp @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2024 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) +#include "src/cpu/kernels/dequantize/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void fp16_run_dequantization_core(const ITensor *input, ITensor *output, const Window &window) +{ + run_dequantization_core<float16_t>(input, output, window); +} +} // namespace cpu +} // namespace arm_compute +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/dequantize/generic/neon/fp32.cpp b/src/cpu/kernels/dequantize/generic/neon/fp32.cpp new file mode 100644 index 0000000000..58e987b450 --- /dev/null +++ b/src/cpu/kernels/dequantize/generic/neon/fp32.cpp @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/cpu/kernels/dequantize/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void fp32_run_dequantization_core(const ITensor *input, ITensor *output, const Window &window) +{ + run_dequantization_core<float>(input, output, window); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/dequantize/generic/neon/impl.h b/src/cpu/kernels/dequantize/generic/neon/impl.h new file mode 100644 index 0000000000..7197d4dff6 --- /dev/null +++ b/src/cpu/kernels/dequantize/generic/neon/impl.h @@ -0,0 +1,340 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_CPU_KERNELS_DEQUANTIZE_GENERIC_NEON_IMPL_H +#define ACL_SRC_CPU_KERNELS_DEQUANTIZE_GENERIC_NEON_IMPL_H + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Window.h" + +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/NESymm.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/kernels/dequantize/generic/neon/list.h" + +#include <arm_neon.h> + +namespace arm_compute +{ +namespace cpu +{ + +template <typename T> +inline void store_result(T *ptr, const float32x4x4_t &v) +{ + ARM_COMPUTE_UNUSED(ptr, v); +} + +template <> +inline void store_result<float>(float *ptr, const float32x4x4_t &v) +{ + wrapper::vstore(ptr, v.val[0]); + wrapper::vstore(ptr + 4, v.val[1]); + wrapper::vstore(ptr + 8, v.val[2]); + wrapper::vstore(ptr + 12, v.val[3]); +} + +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +template <> +inline void store_result<float16_t>(float16_t *ptr, const float32x4x4_t &v) +{ + wrapper::vstore(ptr, vcombine_f16(vcvt_f16_f32(v.val[0]), vcvt_f16_f32(v.val[1]))); + wrapper::vstore(ptr + 8, vcombine_f16(vcvt_f16_f32(v.val[2]), vcvt_f16_f32(v.val[3]))); +} +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + +template <typename T> +inline void store_result(T *ptr, const float32x4x2_t &v) +{ + ARM_COMPUTE_UNUSED(ptr, v); +} + +template <> +inline void store_result<float>(float *ptr, const float32x4x2_t &v) +{ + wrapper::vstore(ptr, v.val[0]); + wrapper::vstore(ptr + 4, v.val[1]); +} + +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +template <> +inline void store_result<float16_t>(float16_t *ptr, const float32x4x2_t &v) +{ + wrapper::vstore(ptr, vcombine_f16(vcvt_f16_f32(v.val[0]), vcvt_f16_f32(v.val[1]))); +} +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + +template <typename TOut, typename TIn> +void run_dequantization_qasymm8(const 
ITensor *input, ITensor *output, const Window &window) +{ + const UniformQuantizationInfo &qinfo = input->info()->quantization_info().uniform(); + const float scale = qinfo.scale; + const int32_t offset = qinfo.offset; + + const int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + // Collapse window and reset first dimension to handle tail calculations manually + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + // Create iterators + Iterator in(input, win_collapsed); + Iterator out(output, win_collapsed); + + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + const auto in_ptr = reinterpret_cast<const TIn *>(in.ptr()); + const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vin = wrapper::vloadq(in_ptr + x); + const auto vdeq = vdequantize(vin, scale, offset); + + store_result(reinterpret_cast<TOut *>(out_ptr + x), vdeq); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + auto val = *(in_ptr + x); + *(out_ptr + x) = static_cast<TOut>(Qasymm8QuantizationHelper<TIn>::dequantize(val, qinfo)); + } + }, + in, out); +} + +template <typename T> +void run_dequantization_qsymm8_per_channel_nchw(const ITensor *input, ITensor *output, const Window &window) +{ + const auto scale = input->info()->quantization_info().scale(); + + const int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + // Reset first dimension to handle tail calculations manually + Window win(window); + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + // Create iterators + Iterator in(input, win); + Iterator out(output, win); + + execute_window_loop( + win, + [&](const Coordinates &id) + { + const auto in_ptr = reinterpret_cast<const int8_t *>(in.ptr()); + const auto out_ptr = reinterpret_cast<T *>(out.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vin = wrapper::vloadq(in_ptr + x); + const auto vdeq = vdequantize(vin, scale[id.z()]); + + store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + int8_t val = *(in_ptr + x); + *(out_ptr + x) = static_cast<T>(dequantize(val, scale[id.z()])); + } + }, + in, out); +} + +template <typename T> +void run_dequantization_qsymm8_per_channel_nhwc(const ITensor *input, ITensor *output, const Window &window) +{ + const auto scale = input->info()->quantization_info().scale(); + + const int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + // Reset first dimension to handle tail calculations manually + Window win(window); + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + // Create iterators + Iterator in(input, win); + Iterator out(output, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto in_ptr = reinterpret_cast<const int8_t *>(in.ptr()); + const auto out_ptr = reinterpret_cast<T *>(out.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const float32x4x4_t vscale = {{scale[x + 0], scale[x + 
1], scale[x + 2], scale[x + 3], scale[x + 4], + scale[x + 5], scale[x + 6], scale[x + 7], scale[x + 8], scale[x + 9], + scale[x + 10], scale[x + 11], scale[x + 12], scale[x + 13], + scale[x + 14], scale[x + 15]}}; + const auto vin = wrapper::vloadq(in_ptr + x); + const auto vdeq = vdequantize(vin, vscale); + + store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + int8_t val = *(in_ptr + x); + *(out_ptr + x) = static_cast<T>(dequantize(val, scale[x])); + } + }, + in, out); +} + +template <typename T> +void run_dequantization_qsymm8(const ITensor *input, ITensor *output, const Window &window) +{ + const UniformQuantizationInfo &qinfo = input->info()->quantization_info().uniform(); + const float scale = qinfo.scale; + + const int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + // Collapse window and reset first dimension to handle tail calculations manually + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + // Create iterators + Iterator in(input, win_collapsed); + Iterator out(output, win_collapsed); + + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + const auto in_ptr = reinterpret_cast<const int8_t *>(in.ptr()); + const auto out_ptr = reinterpret_cast<T *>(out.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vin = wrapper::vloadq(in_ptr + x); + const auto vdeq = vdequantize(vin, scale); + + store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + int8_t val = *(in_ptr + x); + *(out_ptr + x) = static_cast<T>(dequantize(val, scale)); + } + }, + in, out); +} + +template <typename T> +void run_dequantization_qsymm16(const ITensor *input, ITensor *output, const Window &window) +{ + const UniformQuantizationInfo &qinfo = input->info()->quantization_info().uniform(); + const float scale = qinfo.scale; + + const int window_step_x = 8; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + // Collapse window and reset first dimension to handle tail calculations manually + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + // Create iterators + Iterator in(input, win_collapsed); + Iterator out(output, win_collapsed); + + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + const auto in_ptr = reinterpret_cast<const int16_t *>(in.ptr()); + const auto out_ptr = reinterpret_cast<T *>(out.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vin = wrapper::vloadq(in_ptr + x); + const auto vdeq = vdequantize_int16(vin, scale); + + store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + int16_t val = *(in_ptr + x); + *(out_ptr + x) = static_cast<T>(dequantize_qsymm16(val, scale)); + } + }, + in, out); +} + +template <typename T> +void run_dequantization_core(const ITensor *input, ITensor *output, const Window &window) +{ + switch (input->info()->data_type()) + { + case DataType::QASYMM8: + run_dequantization_qasymm8<T, uint8_t>(input, 
output, window); + break; + case DataType::QASYMM8_SIGNED: + run_dequantization_qasymm8<T, int8_t>(input, output, window); + break; + case DataType::QSYMM8_PER_CHANNEL: + input->info()->data_layout() == DataLayout::NHWC + ? run_dequantization_qsymm8_per_channel_nhwc<T>(input, output, window) + : run_dequantization_qsymm8_per_channel_nchw<T>(input, output, window); + break; + case DataType::QSYMM8: + run_dequantization_qsymm8<T>(input, output, window); + break; + case DataType::QSYMM16: + run_dequantization_qsymm16<T>(input, output, window); + break; + default: + ARM_COMPUTE_ERROR("Unsupported data type."); + } +} + +} // namespace cpu +} // namespace arm_compute + +#endif // ACL_SRC_CPU_KERNELS_DEQUANTIZE_GENERIC_NEON_IMPL_H diff --git a/src/cpu/kernels/dequantize/generic/neon/list.h b/src/cpu/kernels/dequantize/generic/neon/list.h new file mode 100644 index 0000000000..678eb2c01a --- /dev/null +++ b/src/cpu/kernels/dequantize/generic/neon/list.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_CPU_KERNELS_DEQUANTIZE_GENERIC_NEON_LIST_H +#define ACL_SRC_CPU_KERNELS_DEQUANTIZE_GENERIC_NEON_LIST_H + +#include "arm_compute/core/Helpers.h" + +namespace arm_compute +{ +namespace cpu +{ + +#define DECLARE_DEQUANTIZE_KERNEL(func_name) void func_name(const ITensor *input, ITensor *output, const Window &window) + +DECLARE_DEQUANTIZE_KERNEL(fp32_run_dequantization_core); +DECLARE_DEQUANTIZE_KERNEL(fp16_run_dequantization_core); + +#undef DECLARE_DEQUANTIZE_KERNEL + +} // namespace cpu +} // namespace arm_compute +#endif // ACL_SRC_CPU_KERNELS_DEQUANTIZE_GENERIC_NEON_LIST_H diff --git a/src/cpu/kernels/directconv2d/impl.h b/src/cpu/kernels/directconv2d/impl.h new file mode 100644 index 0000000000..d3965326cd --- /dev/null +++ b/src/cpu/kernels/directconv2d/impl.h @@ -0,0 +1,389 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_CPU_KERNELS_DIRECTCONV2D_IMPL_H +#define ACL_SRC_CPU_KERNELS_DIRECTCONV2D_IMPL_H + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" + +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h" +#include "src/core/NEON/wrapper/wrapper.h" + +#include <algorithm> + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +template <typename T, bool has_pads> +void linearize_volume_nchw(const uint8_t *const in_ptr, + T *out_ptr, + bool has_bias, + int top_left_x, + int top_left_y, + int kernel_width, + int kernel_height, + int kernel_depth, + int input_w, + int input_h, + int input_stride_x, + int input_stride_y, + int input_stride_z, + int pad_value, + int dilation_x, + int dilation_y) +{ + const int kernel_size2 = kernel_width * kernel_height; + const int x_e = top_left_x + kernel_width * dilation_x; + const int y_e = top_left_y + kernel_height * dilation_y; + + // Linearize volume + int d = 0; + // This for loop linearize a volume with 3 slices. 
This allows: + // 1) to reduce the iterations of the outer for loop "d" + // 2) to have an optimized im2col for the first convolution layer where usually we have 3 IFMs + for (; d <= (kernel_depth - 3); d += 3) + { + for (int y = top_left_y; y < y_e; y += dilation_y) + { + if ((y < 0 || y >= input_h) && has_pads) + { + // All the values will be the offset (will be zeros when not quantized) + for (int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr) + { + *(out_ptr + 0 * kernel_size2) = pad_value; + *(out_ptr + 1 * kernel_size2) = pad_value; + *(out_ptr + 2 * kernel_size2) = pad_value; + } + } + else + { + for (int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr) + { + if ((x < 0 || x >= input_w) && has_pads) + { + *(out_ptr + 0 * kernel_size2) = pad_value; + *(out_ptr + 1 * kernel_size2) = pad_value; + *(out_ptr + 2 * kernel_size2) = pad_value; + } + else + { + *(out_ptr + 0 * kernel_size2) = *(reinterpret_cast<const T *>( + in_ptr + ((d + 0) * input_stride_z + y * input_stride_y + x * input_stride_x))); + *(out_ptr + 1 * kernel_size2) = *(reinterpret_cast<const T *>( + in_ptr + ((d + 1) * input_stride_z + y * input_stride_y + x * input_stride_x))); + *(out_ptr + 2 * kernel_size2) = *(reinterpret_cast<const T *>( + in_ptr + ((d + 2) * input_stride_z + y * input_stride_y + x * input_stride_x))); + } + } + } + } + out_ptr += 2 * kernel_size2; + } + + // Left over + for (; d < kernel_depth; d++) + { + for (int y = top_left_y; y < y_e; y += dilation_y) + { + if ((y < 0 || y >= input_h) && has_pads) + { + // All the values will be the offset (will be zeros when not quantized) + memset(static_cast<void *>(out_ptr), pad_value, kernel_width * sizeof(T)); + out_ptr += kernel_width; + } + else + { + for (int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr) + { + if ((x < 0 || x >= input_w) && has_pads) + { + *out_ptr = pad_value; + } + else + { + *out_ptr = *(reinterpret_cast<const T *>( + in_ptr + (d * input_stride_z + y * input_stride_y + x * input_stride_x))); + } + } + } + } + } + + // Append 1 if the convolution layer has biases + if (has_bias) + { + *out_ptr = static_cast<T>(1); + } +} + +template <typename T, bool has_pads> +void linearize_volume_nhwc(const uint8_t *const in_ptr, + T *out_ptr, + bool has_bias, + int start_x, + int start_y, + int kernel_width, + int kernel_height, + int input_w, + int input_h, + int input_c, + int input_stride_y, + int input_stride_z, + int pad_value, + int dilation_x, + int dilation_y) +{ + const int end_x = start_x + kernel_width * dilation_x; + const int end_y = start_y + kernel_height * dilation_y; + const int pad_quant = kernel_width * input_c; + const int element_size = static_cast<int>(sizeof(T)); + if ((start_y >= 0) && (end_y < input_h) && (start_x >= 0) && (end_x < input_w) && (dilation_x == 1) && + (input_stride_y == input_c * element_size)) + { + for (int y = start_y; y < end_y; y += dilation_y) + { + //optimized for no dilation and no boundary pixels + memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + start_x * input_stride_y)), + input_c * kernel_width * element_size); + out_ptr += input_c * kernel_width; + } + } + else + { + for (int y = start_y; y < end_y; y += dilation_y) + { + if (y < 0 || y >= input_h) + { + memset(static_cast<void *>(out_ptr), pad_value, pad_quant * element_size); + out_ptr += pad_quant; + } + else if (dilation_x > 1 || start_x < 0 || end_x >= input_w || input_stride_y != input_c * element_size) + { + for (int x = start_x; x < end_x; x += dilation_x) + { + if (x < 0 || x >= input_w) + { 
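+ // This column lies outside the input width, so the whole channel chunk is filled with the pad value.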
+ memset(static_cast<void *>(out_ptr), pad_value, input_c * element_size); + out_ptr += input_c; + } + else + { + memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + x * input_stride_y)), + input_c * element_size); + out_ptr += input_c; + } + } + } + else + { + //optimized for no dilation and no boundary pixels + memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + start_x * input_stride_y)), + input_c * kernel_width * element_size); + out_ptr += input_c * kernel_width; + } + } + } + // Append 1 if the convolution layer has biases + if (has_bias) + { + *out_ptr = static_cast<T>(1); + } +} + +template <typename T, bool has_pads> +void linearize_volume_nhwc(const uint8_t *const in_ptr, + T *out_ptr, + bool has_bias, + int start_x, + int start_y, + int kernel_width, + int kernel_height, + int input_w, + int input_h, + int input_c, + int input_stride_y, + int input_stride_z, + int pad_value, + int dilation_x, + int dilation_y, + int pad_right) +{ + const int end_x = start_x + kernel_width * dilation_x; + const int end_y = start_y + kernel_height * dilation_y; + const int pad_quant = kernel_width * (input_c + pad_right); + const int element_size = static_cast<int>(sizeof(T)); + const int channel_chunk_size = input_c * element_size; + + if ((start_y >= 0) && (end_y < input_h) && (start_x >= 0) && (end_x < input_w) && (dilation_x == 1) && + (input_stride_y == channel_chunk_size)) + { + for (int y = start_y; y < end_y; y += dilation_y) + { + const uint8_t *offset_ptr = in_ptr + (y * input_stride_z + start_x * input_stride_y); + for (int e = 0; e < kernel_width; e++) + { + memcpy(out_ptr, reinterpret_cast<const T *>(offset_ptr + e * channel_chunk_size), channel_chunk_size); + out_ptr += input_c + pad_right; + } + } + } + else + { + for (int y = start_y; y < end_y; y += dilation_y) + { + if (y < 0 || y >= input_h) + { + memset(static_cast<void *>(out_ptr), pad_value, pad_quant * element_size); + out_ptr += pad_quant; + } + else if (dilation_x > 1 || start_x < 0 || end_x >= input_w || input_stride_y != channel_chunk_size) + { + for (int x = start_x; x < end_x; x += dilation_x) + { + if (x < 0 || x >= input_w) + { + memset(static_cast<void *>(out_ptr), pad_value, (input_c + pad_right) * element_size); + out_ptr += input_c + pad_right; + } + else + { + memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + x * input_stride_y)), + channel_chunk_size); + out_ptr += input_c + pad_right; + } + } + } + else + { + const uint8_t *offset_ptr = in_ptr + (y * input_stride_z + start_x * input_stride_y); + for (int e = 0; e < kernel_width; e++) + { + memcpy(out_ptr, reinterpret_cast<const T *>(offset_ptr + e * channel_chunk_size), + channel_chunk_size); + out_ptr += input_c + pad_right; + } + } + } + } + // Append 1 if the convolution layer has biases + if (has_bias) + { + *out_ptr = static_cast<T>(1); + } +} + +template <typename T, bool has_pads, bool is_nchw> +void run_im2col(const ITensor *src, + ITensor *dst, + const Window &window, + DataLayout data_layout, + const PadStrideInfo &conv_info, + std::pair<unsigned int, unsigned int> convolved_dims, + const Size2D &kernel_dims, + const Size2D &dilation, + uint32_t input_pad_right, + bool has_bias) +{ + const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const unsigned int channel_idx = get_data_layout_dimension_index(data_layout, 
DataLayoutDimension::CHANNEL); + + const int input_w = src->info()->dimension(width_idx); + const int input_h = src->info()->dimension(height_idx); + const int input_c = src->info()->dimension(channel_idx); + const int input_stride_x = src->info()->strides_in_bytes().x(); + const int input_stride_y = src->info()->strides_in_bytes().y(); + const int input_stride_z = src->info()->strides_in_bytes().z(); + const int pad_left = conv_info.pad_left(); + const int pad_top = conv_info.pad_top(); + const int stride_x = conv_info.stride().first; + const int stride_y = conv_info.stride().second; + const int pad_value = + is_data_type_quantized(src->info()->data_type()) ? src->info()->quantization_info().uniform().offset : 0; + + const auto kernel_width = kernel_dims.width; + const auto kernel_height = kernel_dims.height; + + Window window_in_out(window); + // The first three dimensions of the input and output are increased by the inner loops + window_in_out.set(Window::DimX, Window::Dimension(0, 0, 0)); + window_in_out.set(Window::DimY, Window::Dimension(0, 0, 0)); + window_in_out.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + // Create iterators + Iterator in(src, window_in_out); + Iterator out(dst, window_in_out); + + execute_window_loop( + window, + [&](const Coordinates &id) + { + const int start_w = id[width_idx] * stride_x - pad_left; + const int start_h = id[height_idx] * stride_y - pad_top; + + // Get pointers + const uint8_t *const input_ptr = in.ptr(); + auto output_ptr = + reinterpret_cast<T *>(out.ptr() + (id[width_idx] + id[height_idx] * convolved_dims.first) * + dst->info()->strides_in_bytes().y()); + + // Linearize volume + if (is_nchw) + { + linearize_volume_nchw<T, has_pads>( + input_ptr, output_ptr, has_bias, start_w, start_h, kernel_width, kernel_height, input_c, input_w, + input_h, input_stride_x, input_stride_y, input_stride_z, pad_value, dilation.x(), dilation.y()); + } + else + { + if (input_pad_right > 0) + { + linearize_volume_nhwc<T, has_pads>(input_ptr, output_ptr, has_bias, start_w, start_h, kernel_width, + kernel_height, input_w, input_h, input_c, input_stride_y, + input_stride_z, pad_value, dilation.x(), dilation.y(), + input_pad_right); + } + else + { + linearize_volume_nhwc<T, has_pads>(input_ptr, output_ptr, has_bias, start_w, start_h, kernel_width, + kernel_height, input_w, input_h, input_c, input_stride_y, + input_stride_z, pad_value, dilation.x(), dilation.y()); + } + } + }, + in, out); +} + +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif // ACL_SRC_CPU_KERNELS_DIRECTCONV2D_IMPL_H diff --git a/src/cpu/kernels/directconv2d/list.h b/src/cpu/kernels/directconv2d/list.h new file mode 100644 index 0000000000..e3ff46b148 --- /dev/null +++ b/src/cpu/kernels/directconv2d/list.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_CPU_KERNELS_DIRECTCONV2D_LIST_H +#define ACL_SRC_CPU_KERNELS_DIRECTCONV2D_LIST_H + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" + +#include "src/core/common/Registrars.h" + +#include <algorithm> + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +#define DECLARE_DIRECT_CONV2D_KERNEL(func_name) \ + void func_name(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, \ + const PadStrideInfo &conv_info) + +DECLARE_DIRECT_CONV2D_KERNEL(neon_fp32_nhwc_directconv2d); +DECLARE_DIRECT_CONV2D_KERNEL(neon_fp16_nchw_directconv2d); +DECLARE_DIRECT_CONV2D_KERNEL(neon_fp32_nchw_directconv2d); + +#define DECLARE_IM2COL_KERNEL(func_name) \ + void func_name(const ITensor *src, ITensor *dst, const Window &window, DataLayout data_layout, \ + const PadStrideInfo &conv_info, std::pair<unsigned int, unsigned int> convolved_dims, \ + const Size2D &kernel_dims, const Size2D &dilation, uint32_t input_pad_right, bool has_bias) + +DECLARE_IM2COL_KERNEL(run_im2col_fp32_nchw_pad); +DECLARE_IM2COL_KERNEL(run_im2col_fp32_nchw_nopad); +DECLARE_IM2COL_KERNEL(run_im2col_fp16_nchw_pad); +DECLARE_IM2COL_KERNEL(run_im2col_fp16_nchw_nopad); +DECLARE_IM2COL_KERNEL(run_im2col_bf16_nchw_pad); +DECLARE_IM2COL_KERNEL(run_im2col_bf16_nchw_nopad); +DECLARE_IM2COL_KERNEL(run_im2col_qasymm8_nchw_pad); +DECLARE_IM2COL_KERNEL(run_im2col_qasymm8_nchw_nopad); + +DECLARE_IM2COL_KERNEL(run_im2col_fp32_pad); +DECLARE_IM2COL_KERNEL(run_im2col_fp32_nopad); +DECLARE_IM2COL_KERNEL(run_im2col_fp16_pad); +DECLARE_IM2COL_KERNEL(run_im2col_fp16_nopad); +DECLARE_IM2COL_KERNEL(run_im2col_bf16_pad); +DECLARE_IM2COL_KERNEL(run_im2col_bf16_nopad); + +#undef DECLARE_DIRECT_CONV2D_KERNEL +#undef DECLARE_IM2COL_KERNEL + +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif // ACL_SRC_CPU_KERNELS_DIRECTCONV2D_LIST_H diff --git a/src/cpu/kernels/directconv2d/nchw/all.cpp b/src/cpu/kernels/directconv2d/nchw/all.cpp new file mode 100644 index 0000000000..84f5eeff5a --- /dev/null +++ b/src/cpu/kernels/directconv2d/nchw/all.cpp @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" + +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/kernels/directconv2d/impl.h" +#include "src/cpu/kernels/directconv2d/list.h" +#include "src/cpu/kernels/directconv2d/nchw/impl.h" +#include "src/cpu/kernels/directconv2d/nhwc/neon/impl.h" + +#include <algorithm> + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +void neon_fp32_nchw_directconv2d( + const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) +{ + convolve_nchw<float>(window, src, weights, dst, conv_info); +} + +void run_im2col_fp32_nchw_pad(const ITensor *src, + ITensor *dst, + const Window &window, + DataLayout data_layout, + const PadStrideInfo &conv_info, + std::pair<unsigned int, unsigned int> convolved_dims, + const Size2D &kernel_dims, + const Size2D &dilation, + uint32_t input_pad_right, + bool has_bias) +{ + arm_compute::cpu::kernels::run_im2col<float, true, true>(src, dst, window, data_layout, conv_info, convolved_dims, + kernel_dims, dilation, input_pad_right, has_bias); +} + +void run_im2col_fp32_nchw_nopad(const ITensor *src, + ITensor *dst, + const Window &window, + DataLayout data_layout, + const PadStrideInfo &conv_info, + std::pair<unsigned int, unsigned int> convolved_dims, + const Size2D &kernel_dims, + const Size2D &dilation, + uint32_t input_pad_right, + bool has_bias) +{ + arm_compute::cpu::kernels::run_im2col<float, false, true>(src, dst, window, data_layout, conv_info, convolved_dims, + kernel_dims, dilation, input_pad_right, has_bias); +} + +void run_im2col_qasymm8_nchw_pad(const ITensor *src, + ITensor *dst, + const Window &window, + DataLayout data_layout, + const PadStrideInfo &conv_info, + std::pair<unsigned int, unsigned int> convolved_dims, + const Size2D &kernel_dims, + const Size2D &dilation, + uint32_t input_pad_right, + bool has_bias) +{ + arm_compute::cpu::kernels::run_im2col<qasymm8_t, true, true>( + src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, has_bias); +} + +void run_im2col_qasymm8_nchw_nopad(const ITensor *src, + ITensor *dst, + const Window &window, + DataLayout data_layout, + const PadStrideInfo &conv_info, + std::pair<unsigned int, unsigned int> convolved_dims, + const Size2D &kernel_dims, + const Size2D &dilation, + uint32_t input_pad_right, + bool has_bias) +{ + arm_compute::cpu::kernels::run_im2col<qasymm8_t, false, true>( + src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, has_bias); +} +#if defined(ARM_COMPUTE_ENABLE_BF16) +void run_im2col_bf16_nchw_pad(const ITensor *src, + ITensor *dst, + const Window &window, + DataLayout data_layout, + const PadStrideInfo 
&conv_info, + std::pair<unsigned int, unsigned int> convolved_dims, + const Size2D &kernel_dims, + const Size2D &dilation, + uint32_t input_pad_right, + bool has_bias) +{ + arm_compute::cpu::kernels::run_im2col<bfloat16, true, true>( + src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, has_bias); +} + +void run_im2col_bf16_nchw_nopad(const ITensor *src, + ITensor *dst, + const Window &window, + DataLayout data_layout, + const PadStrideInfo &conv_info, + std::pair<unsigned int, unsigned int> convolved_dims, + const Size2D &kernel_dims, + const Size2D &dilation, + uint32_t input_pad_right, + bool has_bias) +{ + arm_compute::cpu::kernels::run_im2col<bfloat16, false, true>( + src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, has_bias); +} +#endif /* defined(ARM_COMPUTE_ENABLE_BF16) */ + +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/directconv2d/nchw/fp16.cpp b/src/cpu/kernels/directconv2d/nchw/fp16.cpp new file mode 100644 index 0000000000..a9cab42f56 --- /dev/null +++ b/src/cpu/kernels/directconv2d/nchw/fp16.cpp @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "src/cpu/kernels/directconv2d/impl.h" +#include "src/cpu/kernels/directconv2d/nchw/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) +void neon_fp16_nchw_directconv2d( + const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) +{ + convolve_nchw<float16_t>(window, src, weights, dst, conv_info); +} +#endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + +void run_im2col_fp16_nchw_pad(const ITensor *src, + ITensor *dst, + const Window &window, + DataLayout data_layout, + const PadStrideInfo &conv_info, + std::pair<unsigned int, unsigned int> convolved_dims, + const Size2D &kernel_dims, + const Size2D &dilation, + uint32_t input_pad_right, + bool has_bias) +{ +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + arm_compute::cpu::kernels::run_im2col<float16_t, true, true>( + src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, has_bias); +#else // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + ARM_COMPUTE_UNUSED(src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, + has_bias); +#endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) +} + +void run_im2col_fp16_nchw_nopad(const ITensor *src, + ITensor *dst, + const Window &window, + DataLayout data_layout, + const PadStrideInfo &conv_info, + std::pair<unsigned int, unsigned int> convolved_dims, + const Size2D &kernel_dims, + const Size2D &dilation, + uint32_t input_pad_right, + bool has_bias) +{ +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + arm_compute::cpu::kernels::run_im2col<float16_t, false, true>( + src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, has_bias); +#else // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + ARM_COMPUTE_UNUSED(src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, + has_bias); +#endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) +} + +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/directconv2d/nchw/impl.h b/src/cpu/kernels/directconv2d/nchw/impl.h new file mode 100644 index 0000000000..6a5b175d98 --- /dev/null +++ b/src/cpu/kernels/directconv2d/nchw/impl.h @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
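Editor's note: the FP16 file above illustrates the guard-and-stub pattern used for the half-precision entry points: the functions are always defined so callers can link against them, but their bodies only do work when both __ARM_FEATURE_FP16_VECTOR_ARITHMETIC and ENABLE_FP16_KERNELS are set, and otherwise the arguments are routed to ARM_COMPUTE_UNUSED to keep the build warning-free. A minimal standalone sketch of the same idea (function name and body are hypothetical):

#include <cstddef>

// Always defined; only does real work when the toolchain provides FP16 support.
void scale_fp16_buffer(void *data, std::size_t count, float factor)
{
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
    auto *p = static_cast<__fp16 *>(data);
    for (std::size_t i = 0; i < count; ++i)
    {
        p[i] = static_cast<__fp16>(static_cast<float>(p[i]) * factor);
    }
#else
    // No FP16 support in this build: silence unused-parameter warnings,
    // mirroring what ARM_COMPUTE_UNUSED does in the kernel above.
    (void)data;
    (void)count;
    (void)factor;
#endif
}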
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_CPU_KERNELS_DIRECTCONV2D_NCHW_IMPL_H +#define ACL_SRC_CPU_KERNELS_DIRECTCONV2D_NCHW_IMPL_H + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" + +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h" +#include "src/core/NEON/wrapper/wrapper.h" + +#include <algorithm> + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +template <typename T> +void convolve_nchw( + const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) +{ + ARM_COMPUTE_UNUSED(conv_info); + + // Declare useful types + using vtype = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>; + using vector_type = typename vtype::type; + using tag_type = typename vtype::tag_type; + + // Scalar quantities + const int element_size = src->info()->element_size(); + const int input_stride_w = src->info()->strides_in_bytes()[0] / element_size; + const int input_stride_h = src->info()->strides_in_bytes()[1] / element_size; + const int input_stride_c = src->info()->strides_in_bytes()[2] / element_size; + const int input_stride_n = src->info()->strides_in_bytes()[3] / element_size; + + const int input_dim_w = src->info()->dimension(0); + const int input_dim_h = src->info()->dimension(1); + + const int output_stride_c = dst->info()->strides_in_bytes()[2]; + + const unsigned int kernel_stride_w = weights->info()->strides_in_bytes().x() / element_size; + const unsigned int kernel_stride_h = weights->info()->strides_in_bytes().y() / element_size; + const unsigned int kernel_stride_c = weights->info()->strides_in_bytes().z() / element_size; + + const int kernel_dim_w = weights->info()->dimension(0); + const int kernel_dim_h = weights->info()->dimension(1); + + const int conv_pad_top = conv_info.pad_top(); + const int conv_pad_left = conv_info.pad_left(); + const int conv_stride_w = std::get<0>(conv_info.stride()); + const int conv_stride_h = std::get<1>(conv_info.stride()); + + // Setup input window for the output iterator + Window window_out = window; + window_out.set(Window::DimZ, Window::Dimension(0, 1, 1)); + + // Setup input window for the weights iterator + Window window_w = calculate_max_window(*weights->info(), Steps()); + window_w.set(Window::DimX, Window::Dimension(0, 1, 1)); + window_w.set(Window::DimY, Window::Dimension(0, 1, 1)); + window_w.set(Window::DimZ, Window::Dimension(0, 1, 1)); + + Iterator out(dst, window_out); + Iterator wei(weights, window_w); + + constexpr int num_elems_read_per_iteration = 16 / sizeof(T); + + execute_window_loop( + window_out, + [&](const Coordinates &id) + { + // We are computing the theoretical starting input starting points + const int in_w_start_t = static_cast<int>(id.x()) * conv_stride_w - conv_pad_left; + const int in_h_start_t = static_cast<int>(id.y()) * conv_stride_h - conv_pad_top; + const int in_w_end_t = in_w_start_t + kernel_dim_w; + const int in_h_end_t = in_h_start_t + kernel_dim_h; + + // We are computing the valid initial and ending input points by checking the borders + 
const int in_w_start = std::max(in_w_start_t, 0); + const int in_h_start = std::max(in_h_start_t, 0); + const int in_w_end = std::min(in_w_end_t, input_dim_w); + const int in_h_end = std::min(in_h_end_t, input_dim_h); + + // We use the input points to select the valid weight points to use + const int wei_w_start = in_w_start - in_w_start_t; + const int wei_h_start = in_h_start - in_h_start_t; + const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end); + + const int index_c_end = weights->info()->dimension(2); + const T *const in_ptr_start = + reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) + + id[3] * input_stride_n; + execute_window_loop( + window_w, + [&](const Coordinates &id_w) + { + const T *const weights_ptr_start = reinterpret_cast<const T *>(wei.ptr()); + uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c; + T out_temp = static_cast<T>(0); + + for (int index_wei_c = 0, index_in_c = 0; index_wei_c < index_c_end; ++index_wei_c, ++index_in_c) + { + const T *const in_ptr_row_0 = in_ptr_start + index_in_c * input_stride_c; + const T *const weights_ptr_row_0 = weights_ptr_start + index_wei_c * kernel_stride_c; + for (int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; + ++index_wei_h, ++index_in_h) + { + const T *in_ptr_row = in_ptr_row_0 + index_in_h * input_stride_h; + const T *weights_ptr_row = weights_ptr_row_0 + index_wei_h * kernel_stride_h; + int index_w = in_w_start; + int index_wei_w = wei_w_start; + vector_type out_temp_vec = wrapper::vdup_n(static_cast<T>(0), tag_type()); + for (; index_w <= ((in_w_end - num_elems_read_per_iteration)); + index_w += num_elems_read_per_iteration, index_wei_w += num_elems_read_per_iteration) + { + const auto src_vec = wrapper::vloadq(in_ptr_row + index_w * input_stride_w); + const auto w_vec = wrapper::vloadq(weights_ptr_row + index_wei_w * kernel_stride_w); + out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec); + } + out_temp += vreduce(out_temp_vec); + for (; index_w < in_w_end; ++index_w, ++index_wei_w) + { + const auto src_val = *(in_ptr_row + index_w * input_stride_w); + const auto w_val = *(weights_ptr_row + index_wei_w * kernel_stride_w); + out_temp += src_val * w_val; + } + } + } + *(reinterpret_cast<T *>(out_ptr)) = out_temp; + }, + wei); + }, + out); +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif // ACL_SRC_CPU_KERNELS_DIRECTCONV2D_NCHW_IMPL_H diff --git a/src/cpu/kernels/directconv2d/nhwc/neon/fp16.cpp b/src/cpu/kernels/directconv2d/nhwc/neon/fp16.cpp new file mode 100644 index 0000000000..f78601544f --- /dev/null +++ b/src/cpu/kernels/directconv2d/nhwc/neon/fp16.cpp @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
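Editor's note: convolve_nchw above runs without an input border. For every output point it first computes the theoretical patch origin, which may be negative because of padding, clamps it to the valid input range, and derives the matching offset into the weights so that only in-bounds elements are ever read. A scalar sketch of just that index arithmetic, under assumed names:

#include <algorithm>

// For one output coordinate, derive the in-bounds input range and the
// corresponding starting offsets inside the kernel window.
struct PatchRange
{
    int in_w_start, in_w_end;     // clamped input columns [start, end)
    int in_h_start, in_h_end;     // clamped input rows    [start, end)
    int wei_w_start, wei_h_start; // first weight column/row overlapping the input
};

PatchRange make_patch_range(int out_x, int out_y, int stride_w, int stride_h,
                            int pad_left, int pad_top, int kernel_w, int kernel_h,
                            int input_w, int input_h)
{
    const int in_w_start_t = out_x * stride_w - pad_left; // may be negative
    const int in_h_start_t = out_y * stride_h - pad_top;
    PatchRange r{};
    r.in_w_start  = std::max(in_w_start_t, 0);
    r.in_h_start  = std::max(in_h_start_t, 0);
    r.in_w_end    = std::min(in_w_start_t + kernel_w, input_w);
    r.in_h_end    = std::min(in_h_start_t + kernel_h, input_h);
    r.wei_w_start = r.in_w_start - in_w_start_t; // columns that fell into the padding
    r.wei_h_start = r.in_h_start - in_h_start_t;
    return r;
}

Kernel elements that fall over the padding region are simply skipped, which is equivalent to multiplying them by the implicit zeros of the border.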
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/cpu/kernels/directconv2d/impl.h" +#include "src/cpu/kernels/directconv2d/nchw/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +void run_im2col_fp16_pad(const ITensor *src, + ITensor *dst, + const Window &window, + DataLayout data_layout, + const PadStrideInfo &conv_info, + std::pair<unsigned int, unsigned int> convolved_dims, + const Size2D &kernel_dims, + const Size2D &dilation, + uint32_t input_pad_right, + bool has_bias) +{ +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + arm_compute::cpu::kernels::run_im2col<float16_t, true, false>( + src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, has_bias); +#else // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + ARM_COMPUTE_UNUSED(src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, + has_bias); +#endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) +} + +void run_im2col_fp16_nopad(const ITensor *src, + ITensor *dst, + const Window &window, + DataLayout data_layout, + const PadStrideInfo &conv_info, + std::pair<unsigned int, unsigned int> convolved_dims, + const Size2D &kernel_dims, + const Size2D &dilation, + uint32_t input_pad_right, + bool has_bias) +{ +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + arm_compute::cpu::kernels::run_im2col<float16_t, false, false>( + src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, has_bias); +#else // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + ARM_COMPUTE_UNUSED(src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, + has_bias); +#endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/directconv2d/nhwc/neon/fp32.cpp b/src/cpu/kernels/directconv2d/nhwc/neon/fp32.cpp new file mode 100644 index 0000000000..17d9212248 --- /dev/null +++ b/src/cpu/kernels/directconv2d/nhwc/neon/fp32.cpp @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
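Editor's note: the innermost loops of the direct-convolution kernels above, and of the NHWC variant that follows, share one structure: a 128-bit vector multiply-accumulate loop advancing 16 / sizeof(T) elements per iteration, a single horizontal reduction of the vector accumulator, then a scalar tail for the leftover elements. A reduced standalone version of that pattern, assuming an AArch64 target with NEON available:

#include <arm_neon.h>

// Dot product with the same shape as the kernels' inner loops:
// 4-wide NEON main loop, one horizontal add, then a scalar tail.
float dot_neon(const float *a, const float *b, int n)
{
    float32x4_t acc = vdupq_n_f32(0.0f);
    int i = 0;
    for (; i <= n - 4; i += 4)
    {
        acc = vmlaq_f32(acc, vld1q_f32(a + i), vld1q_f32(b + i));
    }
    float sum = vaddvq_f32(acc); // horizontal add (AArch64)
    for (; i < n; ++i)
    {
        sum += a[i] * b[i];
    }
    return sum;
}

The vreduce helper used in the kernels plays the role of the horizontal add here.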
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/cpu/kernels/directconv2d/nhwc/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +void neon_fp32_nhwc_directconv2d( + const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) +{ + convolve_nhwc<float>(window, src, weights, dst, conv_info); +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/directconv2d/nhwc/neon/impl.cpp b/src/cpu/kernels/directconv2d/nhwc/neon/impl.cpp new file mode 100644 index 0000000000..f235167e28 --- /dev/null +++ b/src/cpu/kernels/directconv2d/nhwc/neon/impl.cpp @@ -0,0 +1,270 @@ +/* + * Copyright (c) 2018-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
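Editor's note: neon_fp32_nhwc_directconv2d above is the NHWC counterpart of the NCHW entry point from earlier in this series; the two layouts differ only in which dimension is contiguous in memory. Two index helpers make the difference concrete (illustrative only):

#include <cstddef>

// In NCHW the stride-1 dimension is W; in NHWC it is C.
inline std::size_t idx_nchw(int n, int c, int y, int x, int C, int H, int W)
{
    return ((static_cast<std::size_t>(n) * C + c) * H + y) * W + x;
}
inline std::size_t idx_nhwc(int n, int y, int x, int c, int C, int H, int W)
{
    return ((static_cast<std::size_t>(n) * H + y) * W + x) * C + c;
}

Because C is the stride-1 dimension in NHWC, the kernel that follows can load neighbouring channels with a single vector load, whereas the NCHW kernel vectorises along W instead.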
+ */ + +#include "src/cpu/kernels/directconv2d/nhwc/neon/impl.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" + +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h" +#include "src/core/NEON/wrapper/wrapper.h" + +#include <algorithm> + +using namespace arm_compute::detail; + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +bool have_zero_x_internal_padding(ITensorInfo *src, const ITensorInfo *weights) +{ + return (src->padding().left == 0 && weights->padding().left == 0 && src->padding().right == 0 && + weights->padding().right == 0); +} +} // namespace + +template <typename T> +void convolve_nhwc( + const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) +{ + // Declare useful types + using vtype = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>; + using vector_type = typename vtype::type; + using tag_type = typename vtype::tag_type; + + // Scalar quantities + const int element_size = src->info()->element_size(); + const int input_stride_w = src->info()->strides_in_bytes().y() / element_size; + const int input_stride_h = src->info()->strides_in_bytes().z() / element_size; + const int input_stride_n = src->info()->strides_in_bytes()[3] / element_size; + const int input_dim_w = src->info()->dimension(1); + const int input_dim_h = src->info()->dimension(2); + + const int output_stride_c = dst->info()->strides_in_bytes().x(); + + const unsigned int kernel_stride_w = weights->info()->strides_in_bytes().y() / element_size; + const unsigned int kernel_stride_h = weights->info()->strides_in_bytes().z() / element_size; + const int kernel_dim_w = weights->info()->dimension(1); + const int kernel_dim_h = weights->info()->dimension(2); + + const int conv_pad_top = conv_info.pad_top(); + const int conv_pad_left = conv_info.pad_left(); + const int conv_stride_w = std::get<0>(conv_info.stride()); + const int conv_stride_h = std::get<1>(conv_info.stride()); + + // Setup input window for the output iterator + Window window_out = window; + window_out.set(Window::DimX, Window::Dimension(0, 1, 1)); + + // Setup input window for the weights iterator + Window window_w = calculate_max_window(*weights->info(), Steps()); + window_w.set(Window::DimX, Window::Dimension(0, 1, 1)); + window_w.set(Window::DimY, Window::Dimension(0, 1, 1)); + window_w.set(Window::DimZ, Window::Dimension(0, 1, 1)); + + Iterator out(dst, window_out); + Iterator wei(weights, window_w); + + constexpr int num_elems_read_per_iteration = 16 / sizeof(T); + + // nhwc optimized + if (have_zero_x_internal_padding(src->info(), weights->info())) + { + // This function assumes that input and weights have not padding in channel + + /* + * This implementation parallelize the full WC plane of input and weights by + * treating them as series of elements. So for example, a 3x3 weights and + * floating point vector operations of 4 elements per time, the first 3 + * channel elements of the first row would be taken and additionally the first + * element of the second row. The 9 elements in each single WC weight plane + * would require 2 4-element vector operations and a last single element operation. 
+ * + * This works since when we create the input vector to multiply with the weights, + * the exact required elements are loaded in the same order. Therefore the + * multiplication works on the correct input/weight elements. + */ + execute_window_loop( + window_out, + [&](const Coordinates &id) + { + /* + * In here we create theoretical indexes which then we validate for both + * inputs and weights. + * As a reminder, this loop take each output point in NHW, C is treated + * in the weights loop. + */ + // We are computing the theoretical starting input starting points + const int in_w_start_t = static_cast<int>(id.y()) * conv_stride_w - conv_pad_left; + const int in_h_start_t = static_cast<int>(id.z()) * conv_stride_h - conv_pad_top; + const int in_w_end_t = in_w_start_t + kernel_dim_w; + const int in_h_end_t = in_h_start_t + kernel_dim_h; + + // We are computing the valid initial and ending input points by checking the borders + const int in_w_start = std::max(in_w_start_t, 0); + const int in_h_start = std::max(in_h_start_t, 0); + const int in_w_end = std::min(in_w_end_t, input_dim_w); + const int in_h_end = std::min(in_h_end_t, input_dim_h); + + // We use the input points to select the valid weight points to use + const int index_wc_start = (in_w_start - in_w_start_t) * kernel_stride_w; + const int index_h_start = in_h_start - in_h_start_t; + const int index_wc_end = (kernel_dim_w - (in_w_end_t - in_w_end)) * kernel_stride_w; + const int index_h_end = kernel_dim_h - (in_h_end_t - in_h_end); + + execute_window_loop( + window_w, + [&](const Coordinates &id_w) + { + /* + * This is the loop in the weights, and it goes along N (the batches) + * As a reminder, the batches of the weights are translated into the + * channels of the output + */ + const T *in_ptr_row = + reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) + + id[3] * input_stride_n + in_w_start * input_stride_w + in_h_start * input_stride_h; + const T *weights_ptr_row = + reinterpret_cast<const T *>(wei.ptr()) + index_h_start * kernel_stride_h; + uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c; + + T out_temp = static_cast<T>(0); + for (int index_h = index_h_start; index_h < index_h_end; + ++index_h, in_ptr_row += input_stride_h, weights_ptr_row += kernel_stride_h) + { + const T *in_ptr_mover = in_ptr_row; + int index_wc = index_wc_start; + vector_type out_temp_vec = wrapper::vdup_n(static_cast<T>(0), tag_type()); + for (; index_wc <= index_wc_end - num_elems_read_per_iteration; + index_wc += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration) + { + const auto src_vec = wrapper::vloadq(in_ptr_mover); + const auto w_vec = wrapper::vloadq(weights_ptr_row + index_wc); + out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec); + } + out_temp += vreduce(out_temp_vec); + for (; index_wc < index_wc_end; ++index_wc, ++in_ptr_mover) + { + const auto src_val = *(in_ptr_mover); + const auto w_val = *(weights_ptr_row + index_wc); + out_temp += src_val * w_val; + } + } + *(reinterpret_cast<T *>(out_ptr)) = out_temp; + }, + wei); + }, + out); + } + else // nhwc non optimized + { + execute_window_loop( + window_out, + [&](const Coordinates &id) + { + // We are computing the theoretical starting input starting points + const int in_w_start_t = static_cast<int>(id.y()) * conv_stride_w - conv_pad_left; + const int in_h_start_t = static_cast<int>(id.z()) * conv_stride_h - conv_pad_top; + const int in_w_end_t = in_w_start_t + kernel_dim_w; + const int in_h_end_t = in_h_start_t 
+ kernel_dim_h; + + // We are computing the valid initial and ending input points by checking the borders + const int in_w_start = std::max(in_w_start_t, 0); + const int in_h_start = std::max(in_h_start_t, 0); + const int in_w_end = std::min(in_w_end_t, input_dim_w); + const int in_h_end = std::min(in_h_end_t, input_dim_h); + + // We use the input points to select the valid weight points to use + const int wei_w_start = in_w_start - in_w_start_t; + const int wei_h_start = in_h_start - in_h_start_t; + const int wei_w_end = kernel_dim_w - (in_w_end_t - in_w_end); + const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end); + + const int index_c_end = weights->info()->dimension(0); + const T *const in_ptr_start = + reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) + + id[3] * input_stride_n; + + execute_window_loop( + window_w, + [&](const Coordinates &id_w) + { + const T *const weights_ptr_start = reinterpret_cast<const T *>(wei.ptr()); + uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c; + + T out_temp = static_cast<T>(0); + for (int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; + ++index_wei_h, ++index_in_h) + { + const T *const in_ptr_row = in_ptr_start + index_in_h * input_stride_h; + const T *const weights_ptr_row = weights_ptr_start + index_wei_h * kernel_stride_h; + for (int index_wei_w = wei_w_start, index_in_w = in_w_start; index_wei_w < wei_w_end; + ++index_wei_w, ++index_in_w) + { + const T *in_ptr_mover = in_ptr_row + index_in_w * input_stride_w; + const T *weights_ptr_mover = weights_ptr_row + index_wei_w * kernel_stride_w; + int index_c = 0; + vector_type out_temp_vec = wrapper::vdup_n(static_cast<T>(0), tag_type()); + for (; index_c <= index_c_end - num_elems_read_per_iteration; + index_c += num_elems_read_per_iteration, + in_ptr_mover += num_elems_read_per_iteration, + weights_ptr_mover += num_elems_read_per_iteration) + { + const auto src_vec = wrapper::vloadq(in_ptr_mover); + const auto w_vec = wrapper::vloadq(weights_ptr_mover); + out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec); + } + out_temp += vreduce(out_temp_vec); + for (; index_c < index_c_end; ++index_c, ++in_ptr_mover, ++weights_ptr_mover) + { + const auto src_val = *(in_ptr_mover); + const auto w_val = *(weights_ptr_mover); + out_temp += src_val * w_val; + } + } + } + *(reinterpret_cast<T *>(out_ptr)) = out_temp; + }, + wei); + }, + out); + } +} + +template void convolve_nhwc<float>( + const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info); + +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/directconv2d/nhwc/neon/impl.h b/src/cpu/kernels/directconv2d/nhwc/neon/impl.h new file mode 100644 index 0000000000..efb9ce8e2a --- /dev/null +++ b/src/cpu/kernels/directconv2d/nhwc/neon/impl.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. 
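Editor's note: the optimized branch of convolve_nhwc above hinges on have_zero_x_internal_padding. When neither the input nor the weights carry left/right padding in memory, each kernel row of kernel_w * channels elements is contiguous in both tensors and in the same order, so the W and C loops collapse into one linear walk per kernel row. A scalar sketch of that collapsed accumulation (names are assumptions):

#include <cstddef>

// One output point of the NHWC fast path: per kernel row, walk
// kernel_w * channels contiguous elements of input and weights together.
float conv_point_nhwc_fast(const float *in_row0, int in_row_stride,
                           const float *w_row0, int w_row_stride,
                           int kernel_h, int wc_len /* kernel_w * channels */)
{
    float acc = 0.0f;
    for (int ky = 0; ky < kernel_h; ++ky)
    {
        const float *in = in_row0 + static_cast<std::size_t>(ky) * in_row_stride;
        const float *w  = w_row0  + static_cast<std::size_t>(ky) * w_row_stride;
        for (int i = 0; i < wc_len; ++i)
        {
            acc += in[i] * w[i]; // same element order in input and weights
        }
    }
    return acc;
}

The non-optimized branch keeps separate W and C loops and applies an explicit stride between width positions, which is what the else path above does.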
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef SRC_CORE_NEON_KERNELS_CONV2D_IMPL_H +#define SRC_CORE_NEON_KERNELS_CONV2D_IMPL_H + +#include "arm_compute/core/ITensor.h" + +#include "src/core/helpers/WindowHelpers.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +template <typename T> +void convolve_nhwc( + const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info); +} // namespace kernels +} // namespace cpu +} // namespace arm_compute + +#endif //SRC_CORE_NEON_KERNELS_CONV2D_IMPL_H diff --git a/src/cpu/kernels/directconv2d/nhwc/neon/qasymm8.cpp b/src/cpu/kernels/directconv2d/nhwc/neon/qasymm8.cpp new file mode 100644 index 0000000000..4c6fbec63a --- /dev/null +++ b/src/cpu/kernels/directconv2d/nhwc/neon/qasymm8.cpp @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
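Editor's note: impl.h above only declares the convolve_nhwc template; the definition lives in impl.cpp together with an explicit `template void convolve_nhwc<float>(...)` instantiation, so only the instantiations that are actually shipped get compiled and the template body stays out of every includer. A minimal sketch of that declaration-plus-explicit-instantiation split (hypothetical names):

// sum.h: declaration only; the template body is not visible to includers.
template <typename T>
T sum(const T *data, int n);

// sum.cpp: definition plus explicit instantiations for the shipped types.
template <typename T>
T sum(const T *data, int n)
{
    T acc = T(0);
    for (int i = 0; i < n; ++i)
    {
        acc += data[i];
    }
    return acc;
}

template float  sum<float>(const float *, int);
template double sum<double>(const double *, int);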
+ */ + +#include "src/cpu/kernels/directconv2d/impl.h" +#include "src/cpu/kernels/directconv2d/nhwc/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +void run_im2col_qasymm8_pad(const ITensor *src, + ITensor *dst, + const Window &window, + DataLayout data_layout, + const PadStrideInfo &conv_info, + std::pair<unsigned int, unsigned int> convolved_dims, + const Size2D &kernel_dims, + const Size2D &dilation, + uint32_t input_pad_right, + bool has_bias) +{ + arm_compute::cpu::kernels::run_im2col<qasymm8_t, true, false>( + src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, has_bias); +} + +void run_im2col_qasymm8_nopad(const ITensor *src, + ITensor *dst, + const Window &window, + DataLayout data_layout, + const PadStrideInfo &conv_info, + std::pair<unsigned int, unsigned int> convolved_dims, + const Size2D &kernel_dims, + const Size2D &dilation, + uint32_t input_pad_right, + bool has_bias) +{ + arm_compute::cpu::kernels::run_im2col<qasymm8_t, false, false>( + src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, has_bias); +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/fp16.cpp b/src/cpu/kernels/elementwise_binary/generic/neon/fp16.cpp new file mode 100644 index 0000000000..9b4375f17c --- /dev/null +++ b/src/cpu/kernels/elementwise_binary/generic/neon/fp16.cpp @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
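Editor's note: the qasymm8 wrappers above, and the quantized load/store helpers further down in the elementwise impl.h, work with the same affine representation: a stored uint8 value q maps to the real value scale * (q - offset). Minimal scalar helpers for that mapping (illustrative, not the library's quantize_qasymm8):

#include <algorithm>
#include <cmath>
#include <cstdint>

// Affine QASYMM8 mapping: real = scale * (q - offset).
inline std::uint8_t quantize_u8(float x, float scale, int offset)
{
    const int q = static_cast<int>(std::lround(x / scale)) + offset;
    return static_cast<std::uint8_t>(std::clamp(q, 0, 255));
}
inline float dequantize_u8(std::uint8_t q, float scale, int offset)
{
    return scale * (static_cast<int>(q) - offset);
}

The vectorised load_quantized / store_quantized helpers shown later do the same thing per lane: widen to int32, subtract the offset and scale to float on the way in, and the reverse on the way out.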
+ */ +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) +#include "arm_compute/core/Helpers.h" + +#include "src/cpu/kernels/elementwise_binary/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +template <ArithmeticOperation op> +void neon_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_arithm_op<op, typename wrapper::traits::neon_vector<float16_t, 8>>(in1, in2, out, window); +} + +template void neon_fp16_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp16_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp16_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp16_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp16_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp16_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp16_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp16_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); + +template <ComparisonOperation op> +void neon_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_comp_op_16<op, float16_t, float16x8_t>(in1, in2, out, window); +} + +template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +} // namespace cpu +} // namespace arm_compute +#endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/fp32.cpp b/src/cpu/kernels/elementwise_binary/generic/neon/fp32.cpp new file mode 100644 index 0000000000..53ccd89dcc --- /dev/null +++ b/src/cpu/kernels/elementwise_binary/generic/neon/fp32.cpp @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2022 Arm Limited. 
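Editor's note: neon_fp16_elementwise_binary above takes the ArithmeticOperation as a non-type template parameter and is explicitly instantiated once per supported operation, so the per-element switch inside the shared implementation is resolved at compile time and each instantiation carries only the code for its own operation. A reduced sketch of that dispatch style (types and names are hypothetical):

#include <algorithm>

enum class BinOp { Add, Sub, Min, Max };

// The operation is a template parameter, so this switch is resolved at
// compile time and dead branches are removed per instantiation.
template <BinOp op>
float apply(float a, float b)
{
    switch (op)
    {
        case BinOp::Add: return a + b;
        case BinOp::Sub: return a - b;
        case BinOp::Min: return std::min(a, b);
        case BinOp::Max: return std::max(a, b);
    }
    return 0.0f; // unreachable
}

// One explicit instantiation per shipped operation, mirroring the
// `template void neon_fp16_elementwise_binary<...>` lines above.
template float apply<BinOp::Add>(float, float);
template float apply<BinOp::Max>(float, float);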
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/Helpers.h" + +#include "src/cpu/kernels/elementwise_binary/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +template <ArithmeticOperation op> +void neon_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_arithm_op<op, typename wrapper::traits::neon_vector<float, 4>>(in1, in2, out, window); +} + +template void neon_fp32_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp32_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp32_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp32_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp32_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp32_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp32_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp32_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); + +template <ComparisonOperation op> +void neon_fp32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_comp_op_32<op, float, float32x4_t>(in1, in2, out, window); +} +template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void 
neon_fp32_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/impl.h b/src/cpu/kernels/elementwise_binary/generic/neon/impl.h new file mode 100644 index 0000000000..78e3baf74b --- /dev/null +++ b/src/cpu/kernels/elementwise_binary/generic/neon/impl.h @@ -0,0 +1,1316 @@ +/* + * Copyright (c) 2021-2022, 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
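Editor's note: the comparison instantiations above produce a U8 output in which an element is 0xFF when the predicate holds and 0x00 otherwise, matching the scalar fallback further down that returns ~uint8_t(0) or 0. A standalone scalar sketch of that mask convention (hypothetical function name):

#include <cstdint>

// Elementwise "greater" producing the 0xFF / 0x00 mask used by the
// comparison kernels (illustrative only).
void greater_mask_u8(const float *a, const float *b, std::uint8_t *out, int n)
{
    for (int i = 0; i < n; ++i)
    {
        out[i] = (a[i] > b[i]) ? static_cast<std::uint8_t>(0xFF)
                               : static_cast<std::uint8_t>(0x00);
    }
}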
+ */ +#ifndef ACL_SRC_CPU_KERNELS_ELEMENTWISE_BINARY_GENERIC_NEON_IMPL_H +#define ACL_SRC_CPU_KERNELS_ELEMENTWISE_BINARY_GENERIC_NEON_IMPL_H + +#include "src/core/NEON/NEAsymm.h" + +namespace arm_compute +{ +namespace cpu +{ +template <ArithmeticOperation op, typename VectorType> +typename VectorType::type elementwise_arithm_op(const typename VectorType::type &a, const typename VectorType::type &b) +{ + using vec_type = typename VectorType::type; + using scalar_type = typename VectorType::scalar_type; + using tag_type = typename VectorType::tag_type; + + vec_type res = wrapper::vdup_n(static_cast<scalar_type>(0), tag_type{}); + + switch (op) + { + case ArithmeticOperation::MAX: + res = wrapper::vmax(a, b); + break; + case ArithmeticOperation::MIN: + res = wrapper::vmin(a, b); + break; + case ArithmeticOperation::SQUARED_DIFF: + { + const vec_type tmp = wrapper::vsub(a, b); + res = wrapper::vmul(tmp, tmp); + break; + } + case ArithmeticOperation::PRELU: + { + const vec_type zero = wrapper::vdup_n(static_cast<scalar_type>(0), tag_type{}); + const vec_type tmp = wrapper::vmul(a, b); + const auto gt = wrapper::vcgt(a, zero); + + res = wrapper::vbsl(gt, a, tmp); + break; + } + + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } + + return res; +} + +template <ArithmeticOperation op, typename ScalarType, typename VectorType> +typename VectorType::type elementwise_arithm_op_broadcast(const typename VectorType::type &a, + const ScalarType &broadcast_value, + const bool reorder) +{ + using tag_type = typename VectorType::tag_type; + using vec_type = typename VectorType::type; + + vec_type broadcast_vector = wrapper::vdup_n(broadcast_value, tag_type{}); + return elementwise_arithm_op<op, VectorType>(reorder ? broadcast_vector : a, reorder ? a : broadcast_vector); +} + +template <typename InputScalarType, typename OutputScalarType, typename InputVectorType> +void elementwise_op( + const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window, + OutputScalarType (*scalar_func)(const InputScalarType &, const InputScalarType &), + int (*broadcast_func)( + int, int, int, const InputScalarType *, const InputScalarType &, OutputScalarType *, const bool), + int (*neon_func)(int, int, int, const InputScalarType *, const InputScalarType *, OutputScalarType *)) +{ + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const int window_step_x = std::min(16 / static_cast<int>(sizeof(OutputScalarType)), 8); + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); + + if (is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? 
in2 : in1; + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(out, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr()); + const auto non_broadcast_input_ptr = + reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr()); + const InputScalarType broadcast_value = + *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr()); + + int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, + broadcast_value, output_ptr, !is_broadcast_input_2); + for (; x < window_end_x; ++x) + { + const auto a = *(non_broadcast_input_ptr + x); + *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? broadcast_value : a, + !is_broadcast_input_2 ? a : broadcast_value); + } + }, + broadcast_input, non_broadcast_input, output); + } + else + { + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(in1, input1_win); + Iterator input2(in2, input2_win); + Iterator output(out, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr()); + const auto input1_ptr = reinterpret_cast<const InputScalarType *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const InputScalarType *>(input2.ptr()); + + int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr); + for (; x < window_end_x; ++x) + { + const auto a = *(input1_ptr + x); + const auto b = *(input2_ptr + x); + *(output_ptr + x) = (*scalar_func)(a, b); + } + }, + input1, input2, output); + } +} + +template <ArithmeticOperation op, typename ScalarType> +inline ScalarType elementwise_arithm_op_scalar(const ScalarType &a, const ScalarType &b) +{ + auto res = ScalarType(0); + + switch (op) + { + case ArithmeticOperation::MAX: + res = std::max(a, b); + break; + case ArithmeticOperation::MIN: + res = std::min(a, b); + break; + case ArithmeticOperation::SQUARED_DIFF: + { + res = (a - b) * (a - b); + break; + } + case ArithmeticOperation::PRELU: + { + res = (a > 0 ? 
a : a * b); + break; + } + case ArithmeticOperation::DIV: + { + res = a / b; + break; + } + case ArithmeticOperation::POWER: + { + res = std::pow(a, b); + break; + } + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } + return res; +} + +template <> +inline int32x4_t +elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<int32_t, 4>>(const int32x4_t &a, + const int32x4_t &b) +{ + int32x4_t result; + + // Neon(TM) does not have vector integer division + result[0] = a[0] / b[0]; + result[1] = a[1] / b[1]; + result[2] = a[2] / b[2]; + result[3] = a[3] / b[3]; + + return result; +} + +template <> +inline float32x4_t +elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<float, 4>>(const float32x4_t &a, + const float32x4_t &b) +{ + return wrapper::vdiv(a, b); +} + +template <> +inline float32x4_t +elementwise_arithm_op<ArithmeticOperation::POWER, typename wrapper::traits::neon_vector<float, 4>>(const float32x4_t &a, + const float32x4_t &b) +{ + return wrapper::vpow(a, b); +} + +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +template <> +inline float16x8_t elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<float16_t, 8>>( + const float16x8_t &a, const float16x8_t &b) +{ + return wrapper::vdiv(a, b); +} + +template <> +inline float16x8_t +elementwise_arithm_op<ArithmeticOperation::POWER, typename wrapper::traits::neon_vector<float16_t, 8>>( + const float16x8_t &a, const float16x8_t &b) +{ + return wrapper::vpow(a, b); +} +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +template <ArithmeticOperation op, typename ScalarType, typename VectorType> +inline int elementwise_arithm_op_loop(int window_start_x, + int window_end_x, + int window_step_x, + const ScalarType *input1_ptr, + const ScalarType *input2_ptr, + ScalarType *output_ptr) +{ + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto a = wrapper::vloadq(input1_ptr + x); + const auto b = wrapper::vloadq(input2_ptr + x); + wrapper::vstore(output_ptr + x, elementwise_arithm_op<op, VectorType>(a, b)); + } + return x; +} + +template <ArithmeticOperation op, typename ScalarType, typename VectorType> +inline int elementwise_arithm_op_broadcast_loop(int window_start_x, + int window_end_x, + int window_step_x, + const ScalarType *non_broadcast_input_ptr, + const ScalarType &broadcast_value, + ScalarType *output_ptr, + const bool reorder) +{ + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto a = wrapper::vloadq((non_broadcast_input_ptr + x)); + wrapper::vstore(output_ptr + x, + elementwise_arithm_op_broadcast<op, ScalarType, VectorType>(a, broadcast_value, reorder)); + } + return x; +} + +template <ArithmeticOperation op, typename VectorType> +void elementwise_arithm_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + using scalar_type = typename VectorType::scalar_type; + + elementwise_op<scalar_type, scalar_type, VectorType>( + in1, in2, out, window, &elementwise_arithm_op_scalar<op, scalar_type>, + &elementwise_arithm_op_broadcast_loop<op, scalar_type, VectorType>, + &elementwise_arithm_op_loop<op, scalar_type, VectorType>); +} + +template <ComparisonOperation op, typename InputScalarType> +inline uint8_t elementwise_comp_op_scalar(const InputScalarType &a, const InputScalarType &b) +{ + bool res = false; + + switch (op) + { + case ComparisonOperation::Equal: + res = (a == b); + break; + case 
ComparisonOperation::NotEqual: + res = (a != b); + break; + case ComparisonOperation::Greater: + res = (a > b); + break; + case ComparisonOperation::GreaterEqual: + res = (a >= b); + break; + case ComparisonOperation::Less: + res = (a < b); + break; + case ComparisonOperation::LessEqual: + res = (a <= b); + break; + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } + return res ? ~static_cast<uint8_t>(0) : static_cast<uint8_t>(0); +} + +template <ComparisonOperation op, typename InputVectorType, typename OutputVectorType> +inline OutputVectorType elementwise_comp_op(const InputVectorType &a, const InputVectorType &b) +{ + OutputVectorType res = {0, 0, 0, 0}; + + switch (op) + { + case ComparisonOperation::Equal: + res = wrapper::vceq(a, b); + break; + case ComparisonOperation::NotEqual: + res = wrapper::vnot(wrapper::vceq(a, b)); + break; + case ComparisonOperation::Greater: + res = wrapper::vcgt(a, b); + break; + case ComparisonOperation::GreaterEqual: + res = wrapper::vcge(a, b); + break; + case ComparisonOperation::Less: + res = wrapper::vcgt(b, a); + break; + case ComparisonOperation::LessEqual: + res = wrapper::vcge(b, a); + break; + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } + + return res; +} + +template <ComparisonOperation op, typename InputScalarType, typename InputVectorType, typename OutputVectorType> +inline OutputVectorType +elementwise_comp_op_broadcast(const InputVectorType &a, const InputScalarType &broadcast_value, const bool reorder) +{ + InputVectorType broadcast_vector = wrapper::vdup_n(broadcast_value, wrapper::traits::vector_128_tag()); + return elementwise_comp_op<op, InputVectorType, OutputVectorType>(reorder ? broadcast_vector : a, + reorder ? a : broadcast_vector); +} + +template <ComparisonOperation op, typename InputScalarType, typename InputVectorType> +inline int elementwise_comp_op_broadcast_8_loop(int window_start_x, + int window_end_x, + int window_step_x, + const InputScalarType *non_broadcast_input_ptr, + const InputScalarType &broadcast_value, + uint8_t *output_ptr, + const bool reorder) +{ + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint8x16_t>( + wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder); + wrapper::vstore(output_ptr + x, a); + } + return x; +} + +template <ComparisonOperation op, typename InputScalarType, typename InputVectorType> +inline int elementwise_comp_op_broadcast_16_loop(int window_start_x, + int window_end_x, + int window_step_x, + const InputScalarType *non_broadcast_input_ptr, + const InputScalarType &broadcast_value, + uint8_t *output_ptr, + const bool reorder) +{ + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint16x8_t>( + wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder); + wrapper::vstore(output_ptr + x, wrapper::vmovn(a)); + } + return x; +} + +template <ComparisonOperation op, typename InputScalarType, typename InputVectorType> +inline int elementwise_comp_op_broadcast_32_loop(int window_start_x, + int window_end_x, + int window_step_x, + const InputScalarType *non_broadcast_input_ptr, + const InputScalarType &broadcast_value, + uint8_t *output_ptr, + const bool reorder) +{ + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto a = 
elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>( + wrapper::vloadq(non_broadcast_input_ptr + x), broadcast_value, reorder); + const auto b = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>( + wrapper::vloadq(non_broadcast_input_ptr + x + 4), broadcast_value, reorder); + wrapper::vstore(output_ptr + x, wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(a), wrapper::vmovn(b)))); + } + if (x <= window_end_x - 4) + { + const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>( + wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder); + for (int i = 0; i < 4; i++) + { + *(output_ptr + x + i) = wrapper::vgetlane(a, i); + } + x = +4; + } + return x; +} + +template <ComparisonOperation op, typename InputScalarType, typename InputVectorType> +inline int elementwise_comp_op_8_loop(int window_start_x, + int window_end_x, + int window_step_x, + const InputScalarType *input1_ptr, + const InputScalarType *input2_ptr, + uint8_t *output_ptr) +{ + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto a = wrapper::vloadq(input1_ptr + x); + const auto b = wrapper::vloadq(input2_ptr + x); + const auto res = elementwise_comp_op<op, InputVectorType, uint8x16_t>(a, b); + wrapper::vstore(output_ptr + x, res); + } + return x; +} + +template <ComparisonOperation op, typename InputScalarType, typename InputVectorType> +inline int elementwise_comp_op_16_loop(int window_start_x, + int window_end_x, + int window_step_x, + const InputScalarType *input1_ptr, + const InputScalarType *input2_ptr, + uint8_t *output_ptr) +{ + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto a = wrapper::vloadq(input1_ptr + x); + const auto b = wrapper::vloadq(input2_ptr + x); + const auto res = elementwise_comp_op<op, InputVectorType, uint16x8_t>(a, b); + wrapper::vstore(output_ptr + x, wrapper::vmovn(res)); + } + return x; +} + +template <ComparisonOperation op, typename InputScalarType, typename InputVectorType> +inline int elementwise_comp_op_32_loop(int window_start_x, + int window_end_x, + int window_step_x, + const InputScalarType *input1_ptr, + const InputScalarType *input2_ptr, + uint8_t *output_ptr) +{ + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + auto a = wrapper::vloadq(input1_ptr + x); + auto b = wrapper::vloadq(input2_ptr + x); + const auto res = elementwise_comp_op<op, InputVectorType, uint32x4_t>(a, b); + a = wrapper::vloadq(input1_ptr + x + 4); + b = wrapper::vloadq(input2_ptr + x + 4); + const auto res2 = elementwise_comp_op<op, InputVectorType, uint32x4_t>(a, b); + wrapper::vstore(output_ptr + x, wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(res), wrapper::vmovn(res2)))); + } + if (x <= window_end_x - 4) + { + const auto a = wrapper::vloadq(input1_ptr + x); + const auto b = wrapper::vloadq(input2_ptr + x); + const auto res = elementwise_comp_op<op, InputVectorType, uint32x4_t>(a, b); + for (int i = 0; i < 4; i++) + { + *(output_ptr + x + i) = wrapper::vgetlane(res, i); + } + x = +4; + } + return x; +} + +template <ComparisonOperation op, typename InputScalarType, typename InputVectorType> +void elementwise_comp_op_8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + elementwise_op<InputScalarType, uint8_t, InputVectorType>( + in1, in2, out, window, &elementwise_comp_op_scalar<op, InputScalarType>, + 
&elementwise_comp_op_broadcast_8_loop<op, InputScalarType, InputVectorType>, + &elementwise_comp_op_8_loop<op, InputScalarType, InputVectorType>); +} + +template <ComparisonOperation op, typename InputScalarType, typename InputVectorType> +void elementwise_comp_op_16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + elementwise_op<InputScalarType, uint8_t, InputVectorType>( + in1, in2, out, window, &elementwise_comp_op_scalar<op, InputScalarType>, + &elementwise_comp_op_broadcast_16_loop<op, InputScalarType, InputVectorType>, + &elementwise_comp_op_16_loop<op, InputScalarType, InputVectorType>); +} + +template <ComparisonOperation op, typename InputScalarType, typename InputVectorType> +void elementwise_comp_op_32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + elementwise_op<InputScalarType, uint8_t, InputVectorType>( + in1, in2, out, window, &elementwise_comp_op_scalar<op, InputScalarType>, + &elementwise_comp_op_broadcast_32_loop<op, InputScalarType, InputVectorType>, + &elementwise_comp_op_32_loop<op, InputScalarType, InputVectorType>); +} + +inline float32x4x4_t load_quantized(const uint8_t *input1_ptr, const int32x4_t &offset, const float32x4_t &scale) +{ + qasymm8x16_t x = vld1q_u8(input1_ptr); + const float32x4x4_t out = {{ + vmulq_f32( + vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(x))))), offset)), + scale), + vmulq_f32( + vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(x))))), offset)), + scale), + vmulq_f32( + vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(x))))), offset)), + scale), + vmulq_f32(vcvtq_f32_s32( + vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(x))))), offset)), + scale), + }}; + return out; +} + +inline float32x4x4_t load_quantized_signed(const int8_t *input1_ptr, const int32x4_t &offset, const float32x4_t &scale) +{ + qasymm8x16_signed_t x = vld1q_s8(input1_ptr); + const float32x4x4_t out = {{ + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(x)))), offset)), scale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(x)))), offset)), scale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(x)))), offset)), scale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(x)))), offset)), scale), + }}; + return out; +} + +inline void store_quantized(uint8_t *output_ptr, const uint32x4x4_t &out) +{ + const uint8x8_t pa = vqmovn_u16(vcombine_u16(vqmovn_u32(out.val[0]), vqmovn_u32(out.val[1]))); + const uint8x8_t pb = vqmovn_u16(vcombine_u16(vqmovn_u32(out.val[2]), vqmovn_u32(out.val[3]))); + vst1q_u8(output_ptr, vcombine_u8(pa, pb)); +} + +inline void store_quantized(uint8_t *output_ptr, const int32x4x4_t &out) +{ + const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(out.val[0]), vqmovn_s32(out.val[1]))); + const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(out.val[2]), vqmovn_s32(out.val[3]))); + vst1q_u8(output_ptr, vcombine_u8(pa, pb)); +} + +inline void +store_quantized(uint8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale) +{ + int32x4x4_t out = {{ + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[0], invscale)), + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[1], invscale)), + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[2], invscale)), + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[3], 
invscale)), + }}; + store_quantized(output_ptr, out); +} + +inline void store_quantized_signed(int8_t *output_ptr, const int32x4x4_t &out) +{ + const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(out.val[0]), vqmovn_s32(out.val[1]))); + const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(out.val[2]), vqmovn_s32(out.val[3]))); + vst1q_s8(output_ptr, vcombine_s8(pa, pb)); +} + +inline void store_quantized_signed(int8_t *output_ptr, + const float32x4x4_t &rf, + const float32x4_t &offset, + const float32x4_t &invscale) +{ + int32x4x4_t out = {{ + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[0], invscale)), + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[1], invscale)), + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[2], invscale)), + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[3], invscale)), + }}; + store_quantized_signed(output_ptr, out); +} + +template <ArithmeticOperation op> +inline uint8_t elementwise_arithm_op_quantized_scalar(const float &a, const float &b, UniformQuantizationInfo qinfo) +{ + return quantize_qasymm8(elementwise_arithm_op_scalar<op>(a, b), qinfo); +} + +template <ArithmeticOperation op> +inline int8_t +elementwise_arithm_op_quantized_signed_scalar(const float &a, const float &b, UniformQuantizationInfo qinfo) +{ + return quantize_qasymm8_signed(elementwise_arithm_op_scalar<op>(a, b), qinfo); +} + +template <ArithmeticOperation op> +float32x4x4_t elementwise_arithm_op(const float32x4x4_t &a, const float32x4x4_t &b) +{ + using neon_vector_float = wrapper::traits::neon_vector<float, 4>; + float32x4x4_t out = {{ + elementwise_arithm_op<op, neon_vector_float>(a.val[0], b.val[0]), + elementwise_arithm_op<op, neon_vector_float>(a.val[1], b.val[1]), + elementwise_arithm_op<op, neon_vector_float>(a.val[2], b.val[2]), + elementwise_arithm_op<op, neon_vector_float>(a.val[3], b.val[3]), + }}; + return out; +} + +template <ComparisonOperation op> +inline uint8_t elementwise_comp_op_quantized_scalar(const float &a, const float &b, UniformQuantizationInfo qinfo) +{ + ARM_COMPUTE_UNUSED(qinfo); + return elementwise_comp_op_scalar<op>(a, b); +} + +template <ComparisonOperation op> +inline uint32x4x4_t elementwise_comp_op(const float32x4x4_t &a, const float32x4x4_t &b) +{ + uint32x4x4_t out = {{elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[0], b.val[0]), + elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[1], b.val[1]), + elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[2], b.val[2]), + elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[3], b.val[3])}}; + return out; +} + +template <ArithmeticOperation op> +inline int elementwise_arithm_op_quantized_loop(int window_start_x, + int window_end_x, + int window_step_x, + const uint8_t *input1_ptr, + const uint8_t *input2_ptr, + uint8_t *output_ptr, + int32x4_t voffset1, + int32x4_t voffset2, + float32x4_t vscale1, + float32x4_t vscale2, + float32x4_t voffseto, + float32x4_t invvscaleo) +{ + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + // Get inputs and compute output + const float32x4x4_t af = load_quantized(input1_ptr + x, voffset1, vscale1); + const float32x4x4_t bf = load_quantized(input2_ptr + x, voffset2, vscale2); + const float32x4x4_t rf = elementwise_arithm_op<op>(af, bf); + store_quantized(output_ptr + x, rf, voffseto, invvscaleo); + } + return x; +} + +template <ArithmeticOperation op> +inline int elementwise_arithm_op_quantized_singed_loop(int window_start_x, + int window_end_x, + int window_step_x, + const int8_t *input1_ptr, + const int8_t *input2_ptr, + int8_t *output_ptr, 
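All of the quantized paths follow the same shape: dequantize each input with f = scale * (q - offset), do the arithmetic in float, then requantize the result. A scalar sketch of that round trip, under the same rounding as the unsigned driver (which folds +0.5 into the output offset so the float-to-int truncation rounds to nearest for non-negative results); the helper names here are illustrative, not the library's quantize/dequantize API:

#include <algorithm>
#include <cstdint>

struct QInfo { float scale; int32_t offset; };

inline float dequant(uint8_t q, QInfo qi) { return qi.scale * (static_cast<int32_t>(q) - qi.offset); }

inline uint8_t requant(float f, QInfo qi)
{
    // Multiply by 1/scale, add offset (+0.5 for rounding), then saturate like vqmovn/vqmovun.
    const int32_t q = static_cast<int32_t>(f / qi.scale + static_cast<float>(qi.offset) + 0.5f);
    return static_cast<uint8_t>(std::min(255, std::max(0, q)));
}

// One element of a quantized ADD, the scalar analogue of elementwise_arithm_op_quantized_loop.
inline uint8_t qadd(uint8_t a, uint8_t b, QInfo qa, QInfo qb, QInfo qo)
{
    return requant(dequant(a, qa) + dequant(b, qb), qo);
}
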
+ int32x4_t voffset1, + int32x4_t voffset2, + float32x4_t vscale1, + float32x4_t vscale2, + float32x4_t voffseto, + float32x4_t invvscaleo) +{ + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + // Get inputs and compute output + const float32x4x4_t af = load_quantized_signed(input1_ptr + x, voffset1, vscale1); + const float32x4x4_t bf = load_quantized_signed(input2_ptr + x, voffset2, vscale2); + const float32x4x4_t rf = elementwise_arithm_op<op>(af, bf); + store_quantized_signed(output_ptr + x, rf, voffseto, invvscaleo); + } + return x; +} + +template <ArithmeticOperation op> +inline int elementwise_arithm_op_quantized_broadcast_loop(int window_start_x, + int window_end_x, + int window_step_x, + const uint8_t *non_broadcast_input_ptr, + float32x4x4_t broadcast_vector, + uint8_t *output_ptr, + int32x4_t voffset_non_broadcast, + float32x4_t vscale_non_broadcast, + float32x4_t voffseto, + float32x4_t invvscaleo, + bool reorder) +{ + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const float32x4x4_t af = + load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast); + const float32x4x4_t rf = + elementwise_arithm_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector); + store_quantized(output_ptr + x, rf, voffseto, invvscaleo); + } + return x; +} +template <ArithmeticOperation op> +inline int elementwise_arithm_op_quantized_signed_broadcast_loop(int window_start_x, + int window_end_x, + int window_step_x, + const int8_t *non_broadcast_input_ptr, + float32x4x4_t broadcast_vector, + int8_t *output_ptr, + int32x4_t voffset_non_broadcast, + float32x4_t vscale_non_broadcast, + float32x4_t voffseto, + float32x4_t invvscaleo, + bool reorder) +{ + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const float32x4x4_t af = + load_quantized_signed(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast); + const float32x4x4_t rf = + elementwise_arithm_op<op>(reorder ? broadcast_vector : af, reorder ? 
af : broadcast_vector); + store_quantized_signed(output_ptr + x, rf, voffseto, invvscaleo); + } + return x; +} + +template <ComparisonOperation op> +inline int elementwise_comp_op_quantized_loop(int window_start_x, + int window_end_x, + int window_step_x, + const uint8_t *input1_ptr, + const uint8_t *input2_ptr, + uint8_t *output_ptr, + int32x4_t voffset1, + int32x4_t voffset2, + float32x4_t vscale1, + float32x4_t vscale2, + float32x4_t voffseto, + float32x4_t invvscaleo) +{ + ARM_COMPUTE_UNUSED(voffseto, invvscaleo); + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const float32x4x4_t af = load_quantized(input1_ptr + x, voffset1, vscale1); + const float32x4x4_t bf = load_quantized(input2_ptr + x, voffset2, vscale2); + const uint32x4x4_t rf = elementwise_comp_op<op>(af, bf); + store_quantized(output_ptr + x, rf); + } + return x; +} + +template <ComparisonOperation op> +inline int elementwise_comp_op_quantized_signed_loop(int window_start_x, + int window_end_x, + int window_step_x, + const int8_t *input1_ptr, + const int8_t *input2_ptr, + uint8_t *output_ptr, + int32x4_t voffset1, + int32x4_t voffset2, + float32x4_t vscale1, + float32x4_t vscale2, + float32x4_t voffseto, + float32x4_t invvscaleo) +{ + ARM_COMPUTE_UNUSED(voffseto, invvscaleo); + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const float32x4x4_t af = load_quantized_signed(input1_ptr + x, voffset1, vscale1); + const float32x4x4_t bf = load_quantized_signed(input2_ptr + x, voffset2, vscale2); + const uint32x4x4_t rf = elementwise_comp_op<op>(af, bf); + store_quantized(output_ptr + x, rf); + } + return x; +} + +template <ComparisonOperation op> +inline int elementwise_comp_op_quantized_broadcast_loop(int window_start_x, + int window_end_x, + int window_step_x, + const uint8_t *non_broadcast_input_ptr, + float32x4x4_t broadcast_vector, + uint8_t *output_ptr, + int32x4_t voffset_non_broadcast, + float32x4_t vscale_non_broadcast, + float32x4_t voffseto, + float32x4_t invvscaleo, + bool reorder) +{ + ARM_COMPUTE_UNUSED(voffseto, invvscaleo); + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const float32x4x4_t af = + load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast); + const uint32x4x4_t rf = + elementwise_comp_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector); + store_quantized(output_ptr + x, rf); + } + return x; +} + +template <ComparisonOperation op> +inline int elementwise_comp_op_quantized_signed_broadcast_loop(int window_start_x, + int window_end_x, + int window_step_x, + const int8_t *non_broadcast_input_ptr, + float32x4x4_t broadcast_vector, + uint8_t *output_ptr, + int32x4_t voffset_non_broadcast, + float32x4_t vscale_non_broadcast, + float32x4_t voffseto, + float32x4_t invvscaleo, + bool reorder) +{ + ARM_COMPUTE_UNUSED(voffseto, invvscaleo); + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const float32x4x4_t af = + load_quantized_signed(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast); + const uint32x4x4_t rf = + elementwise_comp_op<op>(reorder ? broadcast_vector : af, reorder ? 
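In the *_broadcast_loop helpers, one operand is a single dequantized value reused across the whole row, and the reorder flag restores operand order when input1 (rather than input2) is the broadcast tensor, which matters for non-commutative operations such as SUB and DIV. A scalar sketch of that logic, with an illustrative function name:

#include <cstddef>

// 'reorder' is true when the broadcast value is logically the left-hand operand.
void broadcast_sub(const float *non_broadcast, float broadcast_value, float *dst, size_t n, bool reorder)
{
    for (size_t i = 0; i < n; ++i)
    {
        const float lhs = reorder ? broadcast_value : non_broadcast[i];
        const float rhs = reorder ? non_broadcast[i] : broadcast_value;
        dst[i] = lhs - rhs;
    }
}
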
af : broadcast_vector); + store_quantized(output_ptr + x, rf); + } + return x; +} + +inline void elementwise_op_quantized(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window, + uint8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo), + int (*broadcast_func)(int, + int, + int, + const uint8_t *, + float32x4x4_t, + uint8_t *, + int32x4_t, + float32x4_t, + float32x4_t, + float32x4_t, + const bool), + int (*neon_func)(int, + int, + int, + const uint8_t *, + const uint8_t *, + uint8_t *, + int32x4_t, + int32x4_t, + float32x4_t, + float32x4_t, + float32x4_t, + float32x4_t)) +{ + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); + + const UniformQuantizationInfo output_qinfo = out->info()->quantization_info().uniform(); + + // Output quantization info (add 0.5 to round toward the nearest integer - 0.5 rounds away from zero) + const float32x4_t voffseto = vdupq_n_f32(output_qinfo.offset + 0.5f); + const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_qinfo.scale); + + if (is_broadcast_across_x) + { + // Select the broadcast input on the X axis + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? 
in2 : in1; + + const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform(); + const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); + + const int32x4_t voffset_non_broadcast = vdupq_n_s32(non_broadcast_qinfo.offset); + const float32x4_t vscale_non_broadcast = vdupq_n_f32(non_broadcast_qinfo.scale); + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(out, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr()); + + const uint8_t broadcast_value = *reinterpret_cast<const uint8_t *>(broadcast_input.ptr()); + const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_u8(broadcast_value), broadcast_qinfo); + + int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, + broadcast_vector, output_ptr, voffset_non_broadcast, vscale_non_broadcast, + voffseto, invvscaleo, !is_broadcast_input_2); + for (; x < window_end_x; ++x) + { + const float afs = dequantize_qasymm8(*(non_broadcast_input_ptr + x), non_broadcast_qinfo); + const float bfs = dequantize_qasymm8(broadcast_value, broadcast_qinfo); + *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, + !is_broadcast_input_2 ? afs : bfs, output_qinfo); + } + }, + broadcast_input, non_broadcast_input, output); + } + else + { + const UniformQuantizationInfo input1_qinfo = in1->info()->quantization_info().uniform(); + const UniformQuantizationInfo input2_qinfo = in2->info()->quantization_info().uniform(); + + // Input1 quantization info + const int32x4_t voffset1 = vdupq_n_s32(input1_qinfo.offset); + const float32x4_t vscale1 = vdupq_n_f32(input1_qinfo.scale); + + // Input2 quantization info + const int32x4_t voffset2 = vdupq_n_s32(input2_qinfo.offset); + const float32x4_t vscale2 = vdupq_n_f32(input2_qinfo.scale); + + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(in1, input1_win); + Iterator input2(in2, input2_win); + Iterator output(out, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr()); + + int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr, + voffset1, voffset2, vscale1, vscale2, voffseto, invvscaleo); + for (; x < window_end_x; ++x) + { + const float afs = dequantize_qasymm8(*(input1_ptr + x), input1_qinfo); + const float bfs = dequantize_qasymm8(*(input2_ptr + x), input2_qinfo); + *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo); + } + }, + input1, input2, output); + } +} + +inline void +elementwise_comp_quantized_signed(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window, + uint8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo), + int (*broadcast_func)(int, + int, + int, + const int8_t *, 
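The quantized drivers all share the same control flow: a vectorized body callback processes window_step_x elements at a time and reports how far it got, and the driver finishes the remainder with the scalar callback. A minimal model of that wiring, with the Window/Iterator machinery omitted and illustrative names:

// The vector body is a callback that returns the first index it did not process.
using BodyFn   = int (*)(int start, int end, int step, const float *a, const float *b, float *dst);
using ScalarFn = float (*)(float a, float b);

inline void run_elementwise(const float *a, const float *b, float *dst, int n, BodyFn body, ScalarFn scalar)
{
    const int step = 16;                    // same step the quantized drivers use
    int x = body(0, n, step, a, b, dst);    // vectorized main loop
    for (; x < n; ++x)                      // scalar tail, as in 'for (; x < window_end_x; ++x)'
        dst[x] = scalar(a[x], b[x]);
}
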
+ float32x4x4_t, + uint8_t *, + int32x4_t, + float32x4_t, + float32x4_t, + float32x4_t, + const bool), + int (*neon_func)(int, + int, + int, + const int8_t *, + const int8_t *, + uint8_t *, + int32x4_t, + int32x4_t, + float32x4_t, + float32x4_t, + float32x4_t, + float32x4_t)) +{ + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); + + const UniformQuantizationInfo output_qinfo = out->info()->quantization_info().uniform(); + + const float32x4_t voffseto = vdupq_n_f32(output_qinfo.offset); + const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_qinfo.scale); + + if (is_broadcast_across_x) + { + // Select the broadcast input on the X axis + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1; + + const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform(); + const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); + + const int32x4_t voffset_non_broadcast = vdupq_n_s32(non_broadcast_qinfo.offset); + const float32x4_t vscale_non_broadcast = vdupq_n_f32(non_broadcast_qinfo.scale); + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(out, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr()); + + const int8_t broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr()); + const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_s8(broadcast_value), broadcast_qinfo); + + int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, + broadcast_vector, output_ptr, voffset_non_broadcast, vscale_non_broadcast, + voffseto, invvscaleo, !is_broadcast_input_2); + for (; x < window_end_x; ++x) + { + const float afs = dequantize_qasymm8_signed(*(non_broadcast_input_ptr + x), non_broadcast_qinfo); + const float bfs = dequantize_qasymm8_signed(broadcast_value, broadcast_qinfo); + *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, + !is_broadcast_input_2 ? 
afs : bfs, output_qinfo); + } + }, + broadcast_input, non_broadcast_input, output); + } + else + { + const UniformQuantizationInfo input1_qinfo = in1->info()->quantization_info().uniform(); + const UniformQuantizationInfo input2_qinfo = in2->info()->quantization_info().uniform(); + + // Input1 quantization info + const int32x4_t voffset1 = vdupq_n_s32(input1_qinfo.offset); + const float32x4_t vscale1 = vdupq_n_f32(input1_qinfo.scale); + + // Input2 quantization info + const int32x4_t voffset2 = vdupq_n_s32(input2_qinfo.offset); + const float32x4_t vscale2 = vdupq_n_f32(input2_qinfo.scale); + + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(in1, input1_win); + Iterator input2(in2, input2_win); + Iterator output(out, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr()); + + int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr, + voffset1, voffset2, vscale1, vscale2, voffseto, invvscaleo); + for (; x < window_end_x; ++x) + { + const float afs = dequantize_qasymm8_signed(*(input1_ptr + x), input1_qinfo); + const float bfs = dequantize_qasymm8_signed(*(input2_ptr + x), input2_qinfo); + *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo); + } + }, + input1, input2, output); + } +} + +inline void +elementwise_op_quantized_signed(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window, + int8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo), + int (*broadcast_func)(int, + int, + int, + const int8_t *, + float32x4x4_t, + int8_t *, + int32x4_t, + float32x4_t, + float32x4_t, + float32x4_t, + const bool), + int (*neon_func)(int, + int, + int, + const int8_t *, + const int8_t *, + int8_t *, + int32x4_t, + int32x4_t, + float32x4_t, + float32x4_t, + float32x4_t, + float32x4_t)) +{ + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); + + const UniformQuantizationInfo output_qinfo = out->info()->quantization_info().uniform(); + + const float32x4_t voffseto = vdupq_n_f32(output_qinfo.offset); + const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_qinfo.scale); + + if (is_broadcast_across_x) + { + // Select the broadcast input on the X axis + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? 
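Note that for the signed-quantized comparison path the inputs carry their own (scale, offset) but the result is an unquantized U8 mask, which is why the comparison scalar ignores the output quantization info. A scalar sketch of one such comparison (illustrative name):

#include <cstdint>

// Dequantize both QASYMM8_SIGNED operands and emit 255 for true, 0 for false.
inline uint8_t qcmp_greater_signed(int8_t a, int8_t b, float sa, int32_t oa, float sb, int32_t ob)
{
    const float fa = sa * (static_cast<int32_t>(a) - oa);
    const float fb = sb * (static_cast<int32_t>(b) - ob);
    return (fa > fb) ? 255 : 0;
}
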
in2 : in1; + + const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform(); + const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); + + const int32x4_t voffset_non_broadcast = vdupq_n_s32(non_broadcast_qinfo.offset); + const float32x4_t vscale_non_broadcast = vdupq_n_f32(non_broadcast_qinfo.scale); + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(out, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr()); + + const int8_t broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr()); + const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_s8(broadcast_value), broadcast_qinfo); + + int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, + broadcast_vector, output_ptr, voffset_non_broadcast, vscale_non_broadcast, + voffseto, invvscaleo, !is_broadcast_input_2); + for (; x < window_end_x; ++x) + { + const float afs = dequantize_qasymm8_signed(*(non_broadcast_input_ptr + x), non_broadcast_qinfo); + const float bfs = dequantize_qasymm8_signed(broadcast_value, broadcast_qinfo); + *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, + !is_broadcast_input_2 ? afs : bfs, output_qinfo); + } + }, + broadcast_input, non_broadcast_input, output); + } + else + { + const UniformQuantizationInfo input1_qinfo = in1->info()->quantization_info().uniform(); + const UniformQuantizationInfo input2_qinfo = in2->info()->quantization_info().uniform(); + + // Input1 quantization info + const int32x4_t voffset1 = vdupq_n_s32(input1_qinfo.offset); + const float32x4_t vscale1 = vdupq_n_f32(input1_qinfo.scale); + + // Input2 quantization info + const int32x4_t voffset2 = vdupq_n_s32(input2_qinfo.offset); + const float32x4_t vscale2 = vdupq_n_f32(input2_qinfo.scale); + + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(in1, input1_win); + Iterator input2(in2, input2_win); + Iterator output(out, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr()); + + int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr, + voffset1, voffset2, vscale1, vscale2, voffseto, invvscaleo); + for (; x < window_end_x; ++x) + { + const float afs = dequantize_qasymm8_signed(*(input1_ptr + x), input1_qinfo); + const float bfs = dequantize_qasymm8_signed(*(input2_ptr + x), input2_qinfo); + *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo); + } + }, + input1, input2, output); + } +} + +template <ArithmeticOperation op> +void elementwise_arithm_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + elementwise_op_quantized(in1, in2, out, window, &elementwise_arithm_op_quantized_scalar<op>, + 
&elementwise_arithm_op_quantized_broadcast_loop<op>, + &elementwise_arithm_op_quantized_loop<op>); +} + +template <ArithmeticOperation op> +void elementwise_arithm_op_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + elementwise_op_quantized_signed(in1, in2, out, window, &elementwise_arithm_op_quantized_signed_scalar<op>, + &elementwise_arithm_op_quantized_signed_broadcast_loop<op>, + &elementwise_arithm_op_quantized_singed_loop<op>); +} + +template <ComparisonOperation op> +void elementwise_comp_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + elementwise_op_quantized(in1, in2, out, window, &elementwise_comp_op_quantized_scalar<op>, + &elementwise_comp_op_quantized_broadcast_loop<op>, + &elementwise_comp_op_quantized_loop<op>); +} + +template <ComparisonOperation op> +void elementwise_comp_op_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + elementwise_comp_quantized_signed(in1, in2, out, window, &elementwise_comp_op_quantized_scalar<op>, + &elementwise_comp_op_quantized_signed_broadcast_loop<op>, + &elementwise_comp_op_quantized_signed_loop<op>); +} +} // namespace cpu +} // namespace arm_compute + +#endif // ACL_SRC_CPU_KERNELS_ELEMENTWISE_BINARY_GENERIC_NEON_IMPL_H diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/integer.cpp b/src/cpu/kernels/elementwise_binary/generic/neon/integer.cpp new file mode 100644 index 0000000000..09ad13d5eb --- /dev/null +++ b/src/cpu/kernels/elementwise_binary/generic/neon/integer.cpp @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/Helpers.h" + +#include "src/cpu/kernels/elementwise_binary/generic/neon/impl.h" +namespace arm_compute +{ +namespace cpu +{ +template <ArithmeticOperation op> +void neon_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_arithm_op<op, typename wrapper::traits::neon_vector<int32_t, 4>>(in1, in2, out, window); +} + +template void neon_s32_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s32_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s32_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s32_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s32_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s32_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s32_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s32_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); + +template <ArithmeticOperation op> +void neon_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_arithm_op<op, typename wrapper::traits::neon_vector<int16_t, 8>>(in1, in2, out, window); +} +template void neon_s16_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s16_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s16_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s16_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s16_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s16_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s16_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s16_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); + +template <ComparisonOperation op> +void neon_u8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_comp_op_8<op, uint8_t, uint8x16_t>(in1, in2, out, window); +} +template void neon_u8_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); 
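These .cpp files are thin wrappers plus explicit template instantiations: the template bodies live in the shared impl header, and each translation unit instantiates only the operation/data-type combinations it owns, so unused kernels are never emitted. A compact, self-contained model of that pattern (types and names here are illustrative, not the library's):

enum class BinOp { Add, Sub };

// Template body would normally live in a shared header.
template <BinOp op>
void binary_kernel(const float *a, const float *b, float *dst, int n)
{
    for (int i = 0; i < n; ++i)
        dst[i] = (op == BinOp::Add) ? a[i] + b[i] : a[i] - b[i];
}

// Mirrors the 'template void neon_*<...>(...)' lines above.
template void binary_kernel<BinOp::Add>(const float *, const float *, float *, int);
template void binary_kernel<BinOp::Sub>(const float *, const float *, float *, int);
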
+template void neon_u8_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_u8_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_u8_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_u8_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_u8_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); + +template <ComparisonOperation op> +void neon_s16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_comp_op_16<op, int16_t, int16x8_t>(in1, in2, out, window); +} +template void neon_s16_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s16_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s16_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s16_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s16_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s16_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); + +template <ComparisonOperation op> +void neon_s32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_comp_op_32<op, int32_t, int32x4_t>(in1, in2, out, window); +} +template void neon_s32_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s32_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s32_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s32_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s32_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s32_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8.cpp b/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8.cpp new file mode 100644 index 
0000000000..d891f70644 --- /dev/null +++ b/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8.cpp @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/Helpers.h" + +#include "src/cpu/kernels/elementwise_binary/generic/neon/impl.h" +namespace arm_compute +{ +namespace cpu +{ +template <ArithmeticOperation op> +void neon_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_arithm_op_quantized<op>(in1, in2, out, window); +} + +template void neon_qasymm8_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); + +template <ComparisonOperation op> +void neon_qasymm8_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window) +{ + return elementwise_comp_op_quantized<op>(in1, in2, out, window); +} + +template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void 
neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); + +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8_signed.cpp new file mode 100644 index 0000000000..b1f8e018f5 --- /dev/null +++ b/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8_signed.cpp @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/Helpers.h" + +#include "src/cpu/kernels/elementwise_binary/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +template <ArithmeticOperation op> +void neon_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_arithm_op_quantized_signed<op>(in1, in2, out, window); +} + +template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); + +template <ComparisonOperation op> +void neon_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window) +{ + return elementwise_comp_op_quantized_signed<op>(in1, in2, out, window); +} + +template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>( + const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); + +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/elementwise_binary/generic/sve/fp16.cpp b/src/cpu/kernels/elementwise_binary/generic/sve/fp16.cpp new file mode 100644 index 0000000000..600c7f1c05 --- /dev/null +++ b/src/cpu/kernels/elementwise_binary/generic/sve/fp16.cpp @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2022 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + +#include "arm_compute/core/Helpers.h" + +#include "src/cpu/kernels/elementwise_binary/generic/sve/impl.h" +namespace arm_compute +{ +namespace cpu +{ +template <ArithmeticOperation op> +void sve_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_arithmetic_op<float16_t>(in1, in2, out, op, window); +} + +template void sve_fp16_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp16_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp16_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp16_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp16_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp16_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp16_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp16_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); + +template <ComparisonOperation op> +void sve_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_comparison_op<float16_t>(in1, in2, out, op, window); +} + +template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const 
Window &window); +template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); + +} // namespace cpu +} // namespace arm_compute + +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/elementwise_binary/generic/sve/fp32.cpp b/src/cpu/kernels/elementwise_binary/generic/sve/fp32.cpp new file mode 100644 index 0000000000..832a966883 --- /dev/null +++ b/src/cpu/kernels/elementwise_binary/generic/sve/fp32.cpp @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/core/Helpers.h" + +#include "src/cpu/kernels/elementwise_binary/generic/sve/impl.h" +namespace arm_compute +{ +namespace cpu +{ +template <ArithmeticOperation op> +void sve_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_arithmetic_op<float32_t>(in1, in2, out, op, window); +} + +template void sve_fp32_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp32_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp32_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp32_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp32_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp32_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp32_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp32_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); + +template <ComparisonOperation op> +void sve_fp32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_comparison_op<float>(in1, in2, out, op, window); +} +template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); + +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/elementwise_binary/generic/sve/impl.cpp b/src/cpu/kernels/elementwise_binary/generic/sve/impl.cpp new file mode 100644 index 0000000000..fa48407e9b --- /dev/null +++ b/src/cpu/kernels/elementwise_binary/generic/sve/impl.cpp @@ -0,0 +1,297 @@ +/* + * Copyright (c) 2021-2022 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/cpu/kernels/elementwise_binary/generic/sve/impl.h" + +#include "src/core/NEON/SVEMath.h" + +#include <arm_sve.h> + +namespace arm_compute +{ +namespace cpu +{ +using namespace arm_compute::wrapper; + +template <typename ScalarType> +void elementwise_arithmetic_op( + const ITensor *in1, const ITensor *in2, ITensor *out, ArithmeticOperation op, const Window &window) +{ + using VectorType = typename sve_vector<ScalarType>::type; + + const auto all_true_pg = svptrue<ScalarType>(); + + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); + + if (is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? 
in2 : in1; + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(out, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr()); + const auto non_broadcast_input_ptr = reinterpret_cast<const ScalarType *>(non_broadcast_input.ptr()); + const ScalarType broadcast_value = *reinterpret_cast<const ScalarType *>(broadcast_input.ptr()); + const auto broadcast_vector = svdup_n(broadcast_value); + + int x = window_start_x; + + svbool_t pg = svwhilelt<ScalarType>(x, window_end_x); + do + { + const auto non_broadcast_vector = svld1(pg, non_broadcast_input_ptr + x); + VectorType res{}; + + if (is_broadcast_input_2) + { + res = elementwise_arithmetic_op<typename sve_vector<ScalarType>::type>(pg, non_broadcast_vector, + broadcast_vector, op); + } + else + { + res = elementwise_arithmetic_op<typename sve_vector<ScalarType>::type>( + pg, broadcast_vector, non_broadcast_vector, op); + } + svst1(pg, output_ptr + x, res); + + x += svcnt<ScalarType>(); + pg = svwhilelt<ScalarType>(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); + }, + broadcast_input, non_broadcast_input, output); + } + else + { + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(in1, input1_win); + Iterator input2(in2, input2_win); + Iterator output(out, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr()); + const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr()); + + int x = window_start_x; + + svbool_t pg = svwhilelt<ScalarType>(x, window_end_x); + do + { + const auto in1 = svld1(pg, input1_ptr + x); + const auto in2 = svld1(pg, input2_ptr + x); + const auto res = elementwise_arithmetic_op<typename sve_vector<ScalarType>::type>(pg, in1, in2, op); + svst1(pg, output_ptr + x, res); + + x += svcnt<ScalarType>(); + pg = svwhilelt<ScalarType>(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); + }, + input1, input2, output); + } +} +template void elementwise_arithmetic_op<float32_t>( + const ITensor *in1, const ITensor *in2, ITensor *out, const ArithmeticOperation op, const Window &window); +template void elementwise_arithmetic_op<float16_t>( + const ITensor *in1, const ITensor *in2, ITensor *out, const ArithmeticOperation op, const Window &window); +template void elementwise_arithmetic_op<int16_t>( + const ITensor *in1, const ITensor *in2, ITensor *out, const ArithmeticOperation op, const Window &window); +template void elementwise_arithmetic_op<int32_t>( + const ITensor *in1, const ITensor *in2, ITensor *out, const ArithmeticOperation op, const Window &window); + +template <typename InputScalarType, typename OutputScalarType> +void elementwise_comparison_op( + const ITensor *in1, const ITensor *in2, ITensor *out, ComparisonOperation op, const Window &window) +{ + static_assert(sizeof(InputScalarType) >= sizeof(OutputScalarType), + "input data type's width should be equal to or greater than output data type's width"); + + using OutputVectorType = typename sve_vector<OutputScalarType>::type; + 
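+    // Comparison results are written as bytes (OutputScalarType defaults to
+    // uint8_t) while the inputs may be wider, so the governing predicate is
+    // narrowed with narrow_to_byte_predicate() before every store. The
+    // whilelt/ptest_any loop below is fully predicated, so no scalar tail
+    // loop is needed.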
const auto all_true_pg = svptrue<InputScalarType>(); + + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); + + if (is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1; + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(out, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr()); + const auto non_broadcast_input_ptr = + reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr()); + const InputScalarType broadcast_value = + *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr()); + const auto broadcast_vector = svdup_n(broadcast_value); + + int x = window_start_x; + + svbool_t pg = svwhilelt<InputScalarType>(x, window_end_x); + do + { + const auto non_broadcast_vector = svld1(pg, non_broadcast_input_ptr + x); + const svbool_t output_pg = narrow_to_byte_predicate<sizeof(InputScalarType)>(pg); + OutputVectorType res{}; + if (is_broadcast_input_2) + { + res = elementwise_comparison_op<typename sve_vector<InputScalarType>::type, + typename sve_vector<OutputScalarType>::type>( + pg, non_broadcast_vector, broadcast_vector, op); + } + else + { + res = elementwise_comparison_op<typename sve_vector<InputScalarType>::type, + typename sve_vector<OutputScalarType>::type>( + pg, broadcast_vector, non_broadcast_vector, op); + } + svst1(output_pg, output_ptr + x, res); + + x += svcnt<InputScalarType>(); + pg = svwhilelt<InputScalarType>(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); + }, + broadcast_input, non_broadcast_input, output); + } + else + { + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(in1, input1_win); + Iterator input2(in2, input2_win); + Iterator output(out, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr()); + const auto input1_ptr = reinterpret_cast<const InputScalarType *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const InputScalarType *>(input2.ptr()); + + int x = window_start_x; + + svbool_t pg = svwhilelt<InputScalarType>(x, window_end_x); + do + { + const auto in1 = svld1(pg, input1_ptr + x); + const auto in2 = svld1(pg, input2_ptr + x); + const auto res = + elementwise_comparison_op<typename sve_vector<InputScalarType>::type, + 
typename sve_vector<OutputScalarType>::type>(pg, in1, in2, op); + const svbool_t output_pg = narrow_to_byte_predicate<sizeof(InputScalarType)>(pg); + svst1(output_pg, output_ptr + x, res); + + x += svcnt<InputScalarType>(); + pg = svwhilelt<InputScalarType>(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); + }, + input1, input2, output); + } +} + +template void elementwise_comparison_op<float32_t>( + const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window); +template void elementwise_comparison_op<float16_t>( + const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window); +template void elementwise_comparison_op<uint8_t>( + const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window); +template void elementwise_comparison_op<int16_t>( + const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window); +template void elementwise_comparison_op<int32_t>( + const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window); + +template <> +svint32_t elementwise_pow<svint32_t>(svbool_t &pg, const svint32_t &a, const svint32_t &b) +{ + return svcvt_s32_z(pg, svpow_z(pg, svcvt_f32_z(pg, a), svcvt_f32_z(pg, b))); +} + +template <> +svint32_t elementwise_div<svint32_t>(svbool_t &pg, const svint32_t &a, const svint32_t &b) +{ + return svcvt_s32_z(pg, svdiv_z(pg, svcvt_f32_z(pg, a), svcvt_f32_z(pg, b))); +} + +template <> +svint16_t elementwise_div<svint16_t>(svbool_t &pg, const svint16_t &a, const svint16_t &b) +{ + ARM_COMPUTE_UNUSED(pg, a, b); + ARM_COMPUTE_ERROR("Not supported"); +} + +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/elementwise_binary/generic/sve/impl.h b/src/cpu/kernels/elementwise_binary/generic/sve/impl.h new file mode 100644 index 0000000000..4c61b9f315 --- /dev/null +++ b/src/cpu/kernels/elementwise_binary/generic/sve/impl.h @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2021-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef SRC_CORE_SVE_KERNELS_ELEMENTWISE_LIST_H +#define SRC_CORE_SVE_KERNELS_ELEMENTWISE_LIST_H + +#include "arm_compute/core/Helpers.h" + +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" +#include "src/core/NEON/wrapper/svtraits.h" + +namespace arm_compute +{ +namespace cpu +{ +using namespace arm_compute::wrapper; + +template <typename VectorType> +VectorType elementwise_pow(svbool_t &pg, const VectorType &a, const VectorType &b) +{ + return svpow_z(pg, a, b); +} + +template <typename VectorType> +VectorType elementwise_div(svbool_t &pg, const VectorType &a, const VectorType &b) +{ + return svdiv_z(pg, a, b); +} + +template <uint32_t bytewidth> +svbool_t narrow_to_byte_predicate(svbool_t pg) +{ + const auto all_false = svpfalse(); + + switch (bytewidth) + { + case 8: + pg = svuzp1_b32(pg, all_false); + /* fall through */ + case 4: + pg = svuzp1_b16(pg, all_false); + /* fall through */ + case 2: + pg = svuzp1_b8(pg, all_false); + /* fall through */ + default: + break; + } + return pg; +} + +template <typename VectorType> +VectorType elementwise_arithmetic_op(svbool_t &pg, const VectorType &a, const VectorType &b, ArithmeticOperation op) +{ + using ScalarType = typename wrapper::sve_scalar<VectorType>::type; + VectorType res{}; + + switch (op) + { + case ArithmeticOperation::MAX: + res = svmax_z(pg, a, b); + break; + case ArithmeticOperation::MIN: + res = svmin_z(pg, a, b); + break; + case ArithmeticOperation::SQUARED_DIFF: + { + const auto tmp = svsub_z(pg, a, b); + res = svmul_z(pg, tmp, tmp); + break; + } + case ArithmeticOperation::PRELU: + { + const auto zero = svdup_n(ScalarType(0)); + const auto tmp = svmul_z(pg, a, b); + const auto gt = svcmpgt(pg, a, zero); + res = svsel(gt, a, tmp); + break; + } + case ArithmeticOperation::DIV: + { + res = elementwise_div(pg, a, b); + break; + } + case ArithmeticOperation::POWER: + { + res = elementwise_pow(pg, a, b); + break; + } + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } + + return res; +} + +template <typename InputVectorType, typename OutputVectorType> +OutputVectorType +elementwise_comparison_op(svbool_t &pg, const InputVectorType &a, const InputVectorType &b, ComparisonOperation op) +{ + svbool_t selection_vector{}; + + switch (op) + { + case ComparisonOperation::Equal: + selection_vector = svcmpeq(pg, a, b); + break; + case ComparisonOperation::NotEqual: + selection_vector = svcmpne(pg, a, b); + break; + case ComparisonOperation::Greater: + selection_vector = svcmpgt(pg, a, b); + break; + case ComparisonOperation::GreaterEqual: + selection_vector = svcmpge(pg, a, b); + break; + case ComparisonOperation::Less: + selection_vector = svcmplt(pg, a, b); + break; + case ComparisonOperation::LessEqual: + selection_vector = svcmple(pg, a, b); + break; + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } + + using InputScalarType = typename wrapper::sve_scalar<InputVectorType>::type; + selection_vector = narrow_to_byte_predicate<sizeof(InputScalarType)>(selection_vector); + + using OutputScalarType = typename wrapper::sve_scalar<OutputVectorType>::type; + const auto false_vector = svdup_n(static_cast<OutputScalarType>((uint32_t)0)); + const auto true_vector = svdup_n(static_cast<OutputScalarType>(~(uint32_t)0)); + auto ret = svsel(selection_vector, true_vector, false_vector); + + return ret; +} + +template <typename ScalarType> +void elementwise_arithmetic_op( + const ITensor *in1, const ITensor *in2, ITensor *out, ArithmeticOperation op, const Window &window); + +template <typename ScalarType, typename OutputScalarType 
= uint8_t> +void elementwise_comparison_op( + const ITensor *in1, const ITensor *in2, ITensor *out, ComparisonOperation op, const Window &window); +} // namespace cpu +} // namespace arm_compute +#endif /* SRC_CORE_SVE_KERNELS_ELEMENTWISE_LIST_H */ diff --git a/src/cpu/kernels/elementwise_binary/generic/sve/integer.cpp b/src/cpu/kernels/elementwise_binary/generic/sve/integer.cpp new file mode 100644 index 0000000000..f7714ff7e9 --- /dev/null +++ b/src/cpu/kernels/elementwise_binary/generic/sve/integer.cpp @@ -0,0 +1,199 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/Helpers.h" + +#include "src/cpu/kernels/elementwise_binary/generic/sve/impl.h" +namespace arm_compute +{ +namespace cpu +{ +template <ArithmeticOperation op> +void sve_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_arithmetic_op<int32_t>(in1, in2, out, op, window); +} +template void sve_s32_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s32_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s32_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s32_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s32_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s32_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s32_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s32_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); + +template <ArithmeticOperation op> +void sve_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_arithmetic_op<int16_t>(in1, in2, out, op, window); +} +template void 
sve_s16_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s16_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s16_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s16_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s16_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s16_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s16_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s16_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); + +template <ComparisonOperation op> +void sve_u8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_comparison_op<uint8_t>(in1, in2, out, op, window); +} +template void sve_u8_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_u8_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_u8_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_u8_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_u8_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_u8_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); + +template <ComparisonOperation op> +void sve_s16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_comparison_op<int16_t>(in1, in2, out, op, window); +} +template void sve_s16_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s16_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s16_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s16_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s16_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void 
sve_s16_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); + +template <ComparisonOperation op> +void sve_s32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_comparison_op<int32_t>(in1, in2, out, op, window); +} +template void sve_s32_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s32_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s32_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s32_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s32_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s32_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); + +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/elementwise_binary/generic/sve2/impl.h b/src/cpu/kernels/elementwise_binary/generic/sve2/impl.h new file mode 100644 index 0000000000..7c6015d379 --- /dev/null +++ b/src/cpu/kernels/elementwise_binary/generic/sve2/impl.h @@ -0,0 +1,393 @@ +/* + * Copyright (c) 2021-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef SRC_CORE_SVE_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H +#define SRC_CORE_SVE_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H + +#include "src/cpu/kernels/elementwise_binary/generic/sve/impl.h" +namespace arm_compute +{ +namespace cpu +{ +using namespace arm_compute::wrapper; + +inline svfloat32x4_t load_quantized(const int8_t *ptr, svbool_t pg, const svint32_t &offset, const svfloat32_t &scale) +{ + auto x = svld1(pg, ptr); + + const auto widened = svcreate4(svmovlb(svmovlb(x)), svmovlt(svmovlb(x)), svmovlb(svmovlt(x)), svmovlt(svmovlt(x))); + + pg = svptrue_b8(); + + return svcreate4(svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 0), offset)), scale), + svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 1), offset)), scale), + svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 2), offset)), scale), + svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 3), offset)), scale)); +} + +inline svfloat32x4_t load_quantized(const uint8_t *ptr, svbool_t pg, const svint32_t &offset, const svfloat32_t &scale) +{ + auto x = svld1(pg, ptr); + + //vprint(x); + + const auto widened = svcreate4(svmovlb(svmovlb(x)), svmovlt(svmovlb(x)), svmovlb(svmovlt(x)), svmovlt(svmovlt(x))); + + pg = svptrue_b8(); + + return svcreate4(svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 0)), offset)), scale), + svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 1)), offset)), scale), + svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 2)), offset)), scale), + svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 3)), offset)), scale)); +} + +inline void +store_quantized(uint8_t *ptr, svbool_t pg, svfloat32x4_t data, const svint32_t &offset, const svfloat32_t &inv_scale) +{ + const auto quantized = + svcreate4(svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 0), inv_scale))), offset), + svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 1), inv_scale))), offset), + svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 2), inv_scale))), offset), + svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 3), inv_scale))), offset)); + + const auto narrowed_bottom = svqxtunt(svqxtunb(svget4(quantized, 0)), svget4(quantized, 1)); + const auto narrowed_top = svqxtunt(svqxtunb(svget4(quantized, 2)), svget4(quantized, 3)); + const auto narrowed = svqxtnt(svqxtnb(narrowed_bottom), narrowed_top); + svst1(pg, ptr, narrowed); +} + +inline void +store_quantized(int8_t *ptr, svbool_t pg, svfloat32x4_t data, const svint32_t &offset, const svfloat32_t &inv_scale) +{ + const auto quantized = + svcreate4(svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 0), inv_scale))), offset), + svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 1), inv_scale))), offset), + svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 2), inv_scale))), offset), + svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 3), inv_scale))), offset)); + + const auto narrowed_bottom = svqxtnt(svqxtnb(svget4(quantized, 0)), svget4(quantized, 1)); + const auto narrowed_top = svqxtnt(svqxtnb(svget4(quantized, 2)), svget4(quantized, 3)); + const auto narrowed = svqxtnt(svqxtnb(narrowed_bottom), narrowed_top); + + svst1(pg, ptr, narrowed); +} + +template <typename ScalarType> +void elementwise_arithmetic_quantized_op( + const ITensor *in1, const ITensor *in2, ITensor *out, ArithmeticOperation op, const Window &window) +{ + const auto all_true_pg = 
wrapper::svptrue<ScalarType>(); + + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); + + const auto output_voffset = svdup_n(out->info()->quantization_info().uniform().offset); + const auto output_vscale = svdup_n(1.f / out->info()->quantization_info().uniform().scale); + + if (is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1; + + const auto non_broadcast_qinfo = + is_broadcast_input_2 ? in1->info()->quantization_info() : in2->info()->quantization_info(); + const auto broadcast_qinfo = + is_broadcast_input_2 ? in2->info()->quantization_info() : in1->info()->quantization_info(); + + const auto non_broadcast_voffset = svdup_n(non_broadcast_qinfo.uniform().offset); + const auto non_broadcast_vscale = svdup_n(non_broadcast_qinfo.uniform().scale); + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(out, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr()); + const auto non_broadcast_input_ptr = reinterpret_cast<const ScalarType *>(non_broadcast_input.ptr()); + const ScalarType broadcast_value = *reinterpret_cast<const ScalarType *>(broadcast_input.ptr()); + const float broadcast_value_f = + Qasymm8QuantizationHelper<ScalarType>::dequantize(broadcast_value, broadcast_qinfo); + const auto in2 = svcreate4(svdup_n(broadcast_value_f), svdup_n(broadcast_value_f), + svdup_n(broadcast_value_f), svdup_n(broadcast_value_f)); + + int x = window_start_x; + + svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x); + do + { + const auto in1 = + load_quantized(non_broadcast_input_ptr + x, pg, non_broadcast_voffset, non_broadcast_vscale); + + svfloat32x4_t result{}; + + if (!is_broadcast_input_2) + { + result = + svcreate4(elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in2, 0), svget4(in1, 0), op), + elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in2, 1), svget4(in1, 1), op), + elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in2, 2), svget4(in1, 2), op), + elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in2, 3), svget4(in1, 3), op)); + } + else + { + result = + svcreate4(elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 0), svget4(in2, 0), op), + elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 1), svget4(in2, 1), op), + elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 2), svget4(in2, 2), op), + elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 3), svget4(in2, 3), op)); + } + + store_quantized(output_ptr + 
x, pg, result, output_voffset, output_vscale); + + x += wrapper::svcnt<ScalarType>(); + pg = wrapper::svwhilelt<ScalarType>(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); + }, + broadcast_input, non_broadcast_input, output); + } + else + { + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(in1, input1_win); + Iterator input2(in2, input2_win); + Iterator output(out, win); + + const auto in1_voffset = svdup_n(in1->info()->quantization_info().uniform().offset); + const auto in1_vscale = svdup_n(in1->info()->quantization_info().uniform().scale); + + const auto in2_voffset = svdup_n(in2->info()->quantization_info().uniform().offset); + const auto in2_vscale = svdup_n(in2->info()->quantization_info().uniform().scale); + + execute_window_loop( + win, + [&](const Coordinates &) + { + auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr()); + const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr()); + + int x = window_start_x; + + svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x); + do + { + const auto in1 = load_quantized(input1_ptr + x, pg, in1_voffset, in1_vscale); + const auto in2 = load_quantized(input2_ptr + x, pg, in2_voffset, in2_vscale); + + const auto result = + svcreate4(elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 0), svget4(in2, 0), op), + elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 1), svget4(in2, 1), op), + elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 2), svget4(in2, 2), op), + elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 3), svget4(in2, 3), op)); + + store_quantized(output_ptr + x, pg, result, output_voffset, output_vscale); + + x += wrapper::svcnt<ScalarType>(); + pg = wrapper::svwhilelt<ScalarType>(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); + }, + input1, input2, output); + } +} + +template <typename InputScalarType, typename OutputScalarType = uint8_t> +void elementwise_comparison_quantized_op( + const ITensor *in1, const ITensor *in2, ITensor *out, ComparisonOperation op, const Window &window) +{ + static_assert(sizeof(InputScalarType) >= sizeof(OutputScalarType), + "input data type's width should be equal to or greater than output data type's width"); + + using OutputVectorType = typename wrapper::traits::sve_vector<OutputScalarType>::type; + const auto all_true_pg = wrapper::svptrue<InputScalarType>(); + + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); + + if (is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? 
in2 : in1; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1; + + const auto non_broadcast_qinfo = + is_broadcast_input_2 ? in1->info()->quantization_info() : in2->info()->quantization_info(); + const auto broadcast_qinfo = + is_broadcast_input_2 ? in2->info()->quantization_info() : in1->info()->quantization_info(); + + const auto non_broadcast_voffset = svdup_n(non_broadcast_qinfo.uniform().offset); + const auto non_broadcast_vscale = svdup_n(non_broadcast_qinfo.uniform().scale); + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(out, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr()); + const auto non_broadcast_input_ptr = + reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr()); + const InputScalarType broadcast_value = + *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr()); + const float broadcast_value_f = + Qasymm8QuantizationHelper<InputScalarType>::dequantize(broadcast_value, broadcast_qinfo); + const auto in2 = svcreate4(svdup_n(broadcast_value_f), svdup_n(broadcast_value_f), + svdup_n(broadcast_value_f), svdup_n(broadcast_value_f)); + + int x = window_start_x; + + svbool_t pg = wrapper::svwhilelt<InputScalarType>(x, window_end_x); + do + { + const auto in1 = + load_quantized(non_broadcast_input_ptr + x, pg, non_broadcast_voffset, non_broadcast_vscale); + + svuint8x4_t result{}; + + if (!is_broadcast_input_2) + { + result = svcreate4(elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in2, 0), + svget4(in1, 0), op), + elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in2, 1), + svget4(in1, 1), op), + elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in2, 2), + svget4(in1, 2), op), + elementwise_comparison_op<svfloat32_t, OutputVectorType>( + pg, svget4(in2, 3), svget4(in1, 3), op)); + } + else + { + result = svcreate4(elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 0), + svget4(in2, 0), op), + elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 1), + svget4(in2, 1), op), + elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 2), + svget4(in2, 2), op), + elementwise_comparison_op<svfloat32_t, OutputVectorType>( + pg, svget4(in1, 3), svget4(in2, 3), op)); + } + + const auto zipped_bottom = svzip1(svget4(result, 0), svget4(result, 1)); + const auto zipped_top = svzip1(svget4(result, 2), svget4(result, 3)); + const auto zipped = svzip1(zipped_bottom, zipped_top); + svst1(pg, output_ptr + x, zipped); + + x += wrapper::svcnt<InputScalarType>(); + pg = wrapper::svwhilelt<InputScalarType>(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); + }, + broadcast_input, non_broadcast_input, output); + } + else + { + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(in1, input1_win); + Iterator input2(in2, input2_win); + Iterator output(out, win); + + const auto in1_voffset = svdup_n(in1->info()->quantization_info().uniform().offset); + const auto in1_vscale = svdup_n(in1->info()->quantization_info().uniform().scale); + + const auto in2_voffset = 
svdup_n(in2->info()->quantization_info().uniform().offset); + const auto in2_vscale = svdup_n(in2->info()->quantization_info().uniform().scale); + + execute_window_loop( + win, + [&](const Coordinates &) + { + auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr()); + const auto input1_ptr = reinterpret_cast<const InputScalarType *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const InputScalarType *>(input2.ptr()); + + int x = window_start_x; + + svbool_t pg = wrapper::svwhilelt<InputScalarType>(x, window_end_x); + do + { + const auto in1 = load_quantized(input1_ptr + x, pg, in1_voffset, in1_vscale); + const auto in2 = load_quantized(input2_ptr + x, pg, in2_voffset, in2_vscale); + const auto result = + svcreate4(elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 0), + svget4(in2, 0), op), + elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 1), + svget4(in2, 1), op), + elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 2), + svget4(in2, 2), op), + elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 3), + svget4(in2, 3), op)); + + const auto zipped_bottom = svzip1(svget4(result, 0), svget4(result, 1)); + const auto zipped_top = svzip1(svget4(result, 2), svget4(result, 3)); + const auto zipped = svzip1(zipped_bottom, zipped_top); + svst1(pg, output_ptr + x, zipped); + + x += wrapper::svcnt<InputScalarType>(); + pg = wrapper::svwhilelt<InputScalarType>(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); + }, + input1, input2, output); + } +} +} // namespace cpu +} // namespace arm_compute + +#endif /* SRC_CORE_SVE_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H */ diff --git a/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8.cpp b/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8.cpp new file mode 100644 index 0000000000..5cc66642d7 --- /dev/null +++ b/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8.cpp @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/core/Helpers.h" + +#include "src/cpu/kernels/elementwise_binary/generic/sve2/impl.h" +namespace arm_compute +{ +namespace cpu +{ +template <ArithmeticOperation op> +void sve2_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_arithmetic_quantized_op<uint8_t>(in1, in2, out, op, window); +} + +template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); + +template <ComparisonOperation op> +void sve2_qasymm8_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window) +{ + return elementwise_comparison_quantized_op<uint8_t>(in1, in2, out, op, window); +} + +template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); + +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8_signed.cpp b/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8_signed.cpp new file mode 100644 index 0000000000..165e0c05fa --- /dev/null +++ b/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8_signed.cpp @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2022 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/Helpers.h" + +#include "src/cpu/kernels/elementwise_binary/generic/sve2/impl.h" +namespace arm_compute +{ +namespace cpu +{ +template <ArithmeticOperation op> +void sve2_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + return elementwise_arithmetic_quantized_op<int8_t>(in1, in2, out, op, window); +} + +template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); + +template <ComparisonOperation op> +void sve2_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window) +{ + return elementwise_comparison_quantized_op<int8_t>(in1, in2, out, op, window); +} + +template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void 
sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>( + const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); + +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/elementwise_binary/list.h b/src/cpu/kernels/elementwise_binary/list.h new file mode 100644 index 0000000000..78a098e7bb --- /dev/null +++ b/src/cpu/kernels/elementwise_binary/list.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#ifndef SRC_CORE_KERNELS_ELEMENTWISE_BINARY_LIST_H
+#define SRC_CORE_KERNELS_ELEMENTWISE_BINARY_LIST_H
+
+namespace arm_compute
+{
+namespace cpu
+{
+#define DECLARE_ELEMENTWISE_BINARY_KERNEL(func_name) \
+    template <ArithmeticOperation op>                \
+    void func_name(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+
+DECLARE_ELEMENTWISE_BINARY_KERNEL(sve_fp16_elementwise_binary);
+DECLARE_ELEMENTWISE_BINARY_KERNEL(sve_fp32_elementwise_binary);
+DECLARE_ELEMENTWISE_BINARY_KERNEL(sve_s32_elementwise_binary);
+DECLARE_ELEMENTWISE_BINARY_KERNEL(sve_s16_elementwise_binary);
+DECLARE_ELEMENTWISE_BINARY_KERNEL(sve2_qasymm8_signed_elementwise_binary);
+DECLARE_ELEMENTWISE_BINARY_KERNEL(sve2_qasymm8_elementwise_binary);
+DECLARE_ELEMENTWISE_BINARY_KERNEL(neon_qasymm8_signed_elementwise_binary);
+DECLARE_ELEMENTWISE_BINARY_KERNEL(neon_qasymm8_elementwise_binary);
+DECLARE_ELEMENTWISE_BINARY_KERNEL(neon_fp16_elementwise_binary);
+DECLARE_ELEMENTWISE_BINARY_KERNEL(neon_fp32_elementwise_binary);
+DECLARE_ELEMENTWISE_BINARY_KERNEL(neon_s16_elementwise_binary);
+DECLARE_ELEMENTWISE_BINARY_KERNEL(neon_s32_elementwise_binary);
+
+#undef DECLARE_ELEMENTWISE_BINARY_KERNEL
+
+#define DECLARE_COMP_ELEMENTWISE_BINARY_KERNEL(func_name) \
+    template <ComparisonOperation op>                     \
+    void func_name(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+
+DECLARE_COMP_ELEMENTWISE_BINARY_KERNEL(sve_u8_comparison_elementwise_binary);
+DECLARE_COMP_ELEMENTWISE_BINARY_KERNEL(sve_s16_comparison_elementwise_binary);
+DECLARE_COMP_ELEMENTWISE_BINARY_KERNEL(sve_s32_comparison_elementwise_binary);
+DECLARE_COMP_ELEMENTWISE_BINARY_KERNEL(sve_fp32_comparison_elementwise_binary);
+DECLARE_COMP_ELEMENTWISE_BINARY_KERNEL(sve_fp16_comparison_elementwise_binary);
+DECLARE_COMP_ELEMENTWISE_BINARY_KERNEL(sve2_qasymm8_signed_comparison_elementwise_binary);
+DECLARE_COMP_ELEMENTWISE_BINARY_KERNEL(sve2_qasymm8_comparison_elementwise_binary);
+DECLARE_COMP_ELEMENTWISE_BINARY_KERNEL(neon_qasymm8_comparison_elementwise_binary);
+DECLARE_COMP_ELEMENTWISE_BINARY_KERNEL(neon_qasymm8_signed_comparison_elementwise_binary);
+DECLARE_COMP_ELEMENTWISE_BINARY_KERNEL(neon_fp16_comparison_elementwise_binary);
+DECLARE_COMP_ELEMENTWISE_BINARY_KERNEL(neon_u8_comparison_elementwise_binary);
+DECLARE_COMP_ELEMENTWISE_BINARY_KERNEL(neon_s16_comparison_elementwise_binary);
+DECLARE_COMP_ELEMENTWISE_BINARY_KERNEL(neon_s32_comparison_elementwise_binary);
+DECLARE_COMP_ELEMENTWISE_BINARY_KERNEL(neon_fp32_comparison_elementwise_binary);
+#undef DECLARE_COMP_ELEMENTWISE_BINARY_KERNEL
+
+} // namespace cpu
+} // namespace arm_compute
+#endif // SRC_CORE_KERNELS_ELEMENTWISE_BINARY_LIST_H
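Note: list.h only declares the kernel templates; the explicit instantiations in the per-type translation units above (fp32.cpp, integer.cpp, qasymm8.cpp, qasymm8_signed.cpp) provide the definitions that callers link against. As a rough, hypothetical sketch of how operator-level code could resolve one of these declarations to a callable kernel (the names in the example namespace below are illustrative and not part of the library; the library's real dispatch/registration logic is not shown in this patch):

// Illustrative sketch only, not the library's actual selection mechanism.
// Header paths for ITensor/Window/ArithmeticOperation are assumed; adjust to
// wherever they live in your tree.
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Window.h"

#include "src/cpu/kernels/elementwise_binary/list.h"

namespace example // hypothetical helper namespace
{
using BinaryKernelFn = void (*)(const arm_compute::ITensor *,
                                const arm_compute::ITensor *,
                                arm_compute::ITensor *,
                                const arm_compute::Window &);

// Resolve the SVE FP32 arithmetic kernel for a given operation at compile time.
// Only operations that are explicitly instantiated in fp32.cpp (ADD, SUB, DIV,
// MIN, MAX, SQUARED_DIFF, POWER, PRELU) will link.
template <arm_compute::ArithmeticOperation op>
constexpr BinaryKernelFn fp32_sve_kernel()
{
    return &arm_compute::cpu::sve_fp32_elementwise_binary<op>;
}

// Usage (src0, src1, dst and window prepared elsewhere):
//   fp32_sve_kernel<arm_compute::ArithmeticOperation::ADD>()(src0, src1, dst, window);
} // namespace example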
\ No newline at end of file diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/fp16.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/fp16.cpp new file mode 100644 index 0000000000..2588db024d --- /dev/null +++ b/src/cpu/kernels/elementwise_unary/generic/neon/fp16.cpp @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) +#include "arm_compute/core/Helpers.h" + +#include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void neon_fp16_elementwise_unary( + const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) +{ + ARM_COMPUTE_UNUSED(lut); + return elementwise_op<__fp16>(in, out, window, op); +} +} // namespace cpu +} // namespace arm_compute +#endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/fp32.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/fp32.cpp new file mode 100644 index 0000000000..936a2e588a --- /dev/null +++ b/src/cpu/kernels/elementwise_unary/generic/neon/fp32.cpp @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/Helpers.h" + +#include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void neon_fp32_elementwise_unary( + const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) +{ + ARM_COMPUTE_UNUSED(lut); + return elementwise_op<float>(in, out, window, op); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/impl.h b/src/cpu/kernels/elementwise_unary/generic/neon/impl.h new file mode 100644 index 0000000000..d54d3984cb --- /dev/null +++ b/src/cpu/kernels/elementwise_unary/generic/neon/impl.h @@ -0,0 +1,300 @@ +/* + * Copyright (c) 2018-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef SRC_CORE_NEON_KERNELS_ELEMENTWISE_UNARY_LIST_H +#define SRC_CORE_NEON_KERNELS_ELEMENTWISE_UNARY_LIST_H + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Types.h" + +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" + +namespace arm_compute +{ +namespace cpu +{ +template <typename ScalarType> +inline ScalarType elementwise_op_scalar_imp(ElementWiseUnary op, const ScalarType &a) +{ + switch (op) + { + case ElementWiseUnary::RSQRT: + return 1 / sqrt(a); + case ElementWiseUnary::EXP: + return std::exp(a); + case ElementWiseUnary::NEG: + return -a; + case ElementWiseUnary::LOG: + return std::log(a); + case ElementWiseUnary::ABS: + return std::abs(a); + case ElementWiseUnary::ROUND: + return support::cpp11::nearbyint(a); + case ElementWiseUnary::SIN: + return std::sin(a); + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } +} + +template <typename ScalarType, typename VectorType> +inline VectorType elementwise_op_imp(ElementWiseUnary op, const VectorType &a) +{ + switch (op) + { + case ElementWiseUnary::RSQRT: + return wrapper::vinvsqrt(a); + case ElementWiseUnary::EXP: + return wrapper::vexpq(a); + case ElementWiseUnary::NEG: + return wrapper::vneg(a); + case ElementWiseUnary::LOG: + return wrapper::vlog(a); + case ElementWiseUnary::ABS: + return wrapper::vabs(a); + case ElementWiseUnary::ROUND: + return wrapper::vround(a); + case ElementWiseUnary::SIN: + return wrapper::vsin(a); + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } +} + +template <typename ScalarType> +inline void elementwise_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op) +{ + const int window_step_x = 16 / sizeof(ScalarType); + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(in, win); + Iterator output(out, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr()); + const auto input_ptr = reinterpret_cast<const ScalarType *>(input.ptr()); + + int x = window_start_x; + for (; x <= window_end_x - window_step_x; x += window_step_x) + { + wrapper::vstore(output_ptr + x, elementwise_op_imp<ScalarType>(op, wrapper::vloadq(input_ptr + x))); + } + for (; x < window_end_x; ++x) + { + *(output_ptr + x) = elementwise_op_scalar_imp(op, *(input_ptr + x)); + } + }, + input, output); +} + +template <> +inline void elementwise_op<int8_t>(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op) +{ + const int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const UniformQuantizationInfo qi_in = in->info()->quantization_info().uniform(); + const UniformQuantizationInfo qi_out = out->info()->quantization_info().uniform(); + const auto min_clamped_value = vdupq_n_f32((-128 - qi_out.offset) * qi_out.scale); + const auto max_clamped_value = vdupq_n_f32((127 - qi_out.offset) * qi_out.scale); + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(in, win); + Iterator output(out, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + int8x16_t vout; + auto output_ptr = reinterpret_cast<int8_t *>(output.ptr()); + const auto input_ptr = reinterpret_cast<const int8_t *>(input.ptr()); + const auto vconst_0_f32 = 
vdupq_n_f32(0); + auto clamped_value = (op == ElementWiseUnary::LOG) ? min_clamped_value : max_clamped_value; + + int x = window_start_x; + for (; x <= window_end_x - window_step_x; x += window_step_x) + { + const auto vin = wrapper::vloadq(input_ptr + x); + + // De-quantize + const auto vin_deq = vdequantize(vin, qi_in); + + // Perform activation + float32x4x4_t vtmp_deq = {{ + elementwise_op_imp<float>(op, vin_deq.val[0]), + elementwise_op_imp<float>(op, vin_deq.val[1]), + elementwise_op_imp<float>(op, vin_deq.val[2]), + elementwise_op_imp<float>(op, vin_deq.val[3]), + }}; + + if ((op == ElementWiseUnary::LOG) || (op == ElementWiseUnary::RSQRT)) + { + vtmp_deq.val[0] = + vbslq_f32(vcleq_f32(vin_deq.val[0], vconst_0_f32), clamped_value, vtmp_deq.val[0]); + vtmp_deq.val[1] = + vbslq_f32(vcleq_f32(vin_deq.val[1], vconst_0_f32), clamped_value, vtmp_deq.val[1]); + vtmp_deq.val[2] = + vbslq_f32(vcleq_f32(vin_deq.val[2], vconst_0_f32), clamped_value, vtmp_deq.val[2]); + vtmp_deq.val[3] = + vbslq_f32(vcleq_f32(vin_deq.val[3], vconst_0_f32), clamped_value, vtmp_deq.val[3]); + } + + // Re-quantize to new output space + vout = vquantize_signed(vtmp_deq, qi_out); + wrapper::vstore(output_ptr + x, vout); + } + for (; x < window_end_x; ++x) + { + qasymm8_signed_t in = *(reinterpret_cast<const qasymm8_signed_t *>(input_ptr + x)); + qasymm8_signed_t tmp = 0; + float tmp_f = dequantize_qasymm8_signed(in, qi_in); + if (tmp_f <= 0.0) + { + if (op == ElementWiseUnary::LOG) + { + tmp_f = (-128 - qi_out.offset) * qi_out.scale; + } + else if (op == ElementWiseUnary::RSQRT) + { + tmp_f = (127 - qi_out.offset) * qi_out.scale; + } + else + { + tmp_f = elementwise_op_scalar_imp<float>(op, tmp_f); + } + } + else + { + tmp_f = elementwise_op_scalar_imp<float>(op, tmp_f); + } + tmp = quantize_qasymm8_signed( + tmp_f, qi_out, + RoundingPolicy:: + TO_ZERO); // Set rounding policy TO_ZERO to be compatible with vquantize_signed() used above that follow same policy for armv7a. + // For aarch64 LUT is used and rounding to nearest is used + *(output_ptr + x) = tmp; + } + }, + input, output); +} +template <> +inline void elementwise_op<uint8_t>(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op) +{ + const int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const UniformQuantizationInfo qi_in = in->info()->quantization_info().uniform(); + const UniformQuantizationInfo qi_out = out->info()->quantization_info().uniform(); + const auto vconst_0_f32 = vdupq_n_f32(0); + const auto min_clamped_value = vdupq_n_f32((0 - qi_out.offset) * qi_out.scale); + const auto max_clamped_value = vdupq_n_f32((255 - qi_out.offset) * qi_out.scale); + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(in, win); + Iterator output(out, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + uint8x16_t vout; + auto clamped_value = (op == ElementWiseUnary::LOG) ? 
min_clamped_value : max_clamped_value; + auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr()); + const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); + int x = window_start_x; + for (; x <= window_end_x - window_step_x; x += window_step_x) + { + const auto vin = wrapper::vloadq(input_ptr + x); + + // De-quantize + const auto vin_deq = vdequantize(vin, qi_in); + + // Perform activation + float32x4x4_t vtmp_deq = {{ + elementwise_op_imp<float>(op, vin_deq.val[0]), + elementwise_op_imp<float>(op, vin_deq.val[1]), + elementwise_op_imp<float>(op, vin_deq.val[2]), + elementwise_op_imp<float>(op, vin_deq.val[3]), + }}; + if ((op == ElementWiseUnary::LOG) || (op == ElementWiseUnary::RSQRT)) + { + vtmp_deq.val[0] = + vbslq_f32(vcleq_f32(vin_deq.val[0], vconst_0_f32), clamped_value, vtmp_deq.val[0]); + vtmp_deq.val[1] = + vbslq_f32(vcleq_f32(vin_deq.val[1], vconst_0_f32), clamped_value, vtmp_deq.val[1]); + vtmp_deq.val[2] = + vbslq_f32(vcleq_f32(vin_deq.val[2], vconst_0_f32), clamped_value, vtmp_deq.val[2]); + vtmp_deq.val[3] = + vbslq_f32(vcleq_f32(vin_deq.val[3], vconst_0_f32), clamped_value, vtmp_deq.val[3]); + } + + // Re-quantize to new output space + vout = vquantize(vtmp_deq, qi_out); + wrapper::vstore(output_ptr + x, vout); + } + for (; x < window_end_x; ++x) + { + qasymm8_t in = *(reinterpret_cast<const qasymm8_t *>(input_ptr + x)); + qasymm8_t tmp = 0; + float tmp_f = dequantize_qasymm8(in, qi_in); + if (tmp_f <= 0.0) + { + if (op == ElementWiseUnary::LOG) + { + tmp_f = (0 - qi_out.offset) * qi_out.scale; + } + else if (op == ElementWiseUnary::RSQRT) + { + tmp_f = (255 - qi_out.offset) * qi_out.scale; + } + else + { + tmp_f = elementwise_op_scalar_imp<float>(op, tmp_f); + } + } + else + { + tmp_f = elementwise_op_scalar_imp<float>(op, tmp_f); + } + tmp = quantize_qasymm8(tmp_f, qi_out, RoundingPolicy::TO_ZERO); + *(output_ptr + x) = tmp; + } + }, + input, output); +} + +} // namespace cpu +} // namespace arm_compute + +#endif // SRC_CORE_NEON_KERNELS_ELEMENTWISE_UNARY_LIST_H diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/integer.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/integer.cpp new file mode 100644 index 0000000000..d4daad4ca6 --- /dev/null +++ b/src/cpu/kernels/elementwise_unary/generic/neon/integer.cpp @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/Helpers.h" + +#include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void neon_s32_elementwise_unary( + const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) +{ + ARM_COMPUTE_UNUSED(lut); + return elementwise_op<int32_t>(in, out, window, op); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/q8.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/q8.cpp new file mode 100644 index 0000000000..38cb61d0ff --- /dev/null +++ b/src/cpu/kernels/elementwise_unary/generic/neon/q8.cpp @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/Helpers.h" + +#include "src/cpu/kernels/lut/list.h" + +namespace arm_compute +{ +namespace cpu +{ + +#ifdef __aarch64__ + +void neon_q8_elementwise_unary( + const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) +{ + ARM_COMPUTE_UNUSED(op); + + auto win = window; + const auto window_end_x = window.x().end(); + win.set(0, Window::Dimension(0, 1, 1)); + + Iterator src_it(in, win); + Iterator dst_it(out, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto src_ptr = src_it.ptr(); + auto dst_ptr = dst_it.ptr(); + + lut_u8_neon(lut, 1, window_end_x, &src_ptr, &dst_ptr); + }, + src_it, dst_it); +} + +#endif // __aarch64__ + +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8.cpp new file mode 100644 index 0000000000..3e4b88eb47 --- /dev/null +++ b/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8.cpp @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2023 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/Window.h" + +#include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +#ifndef __aarch64__ +// Fallback function to be used for armv7a, for aarch64 LUT is used +void neon_qasymm8_elementwise_unary( + const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) +{ + ARM_COMPUTE_UNUSED(lut); + return elementwise_op<uint8_t>(in, out, window, op); +} +#endif // #ifndef __aarch64__ + +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8_signed.cpp new file mode 100644 index 0000000000..a5f4b053e3 --- /dev/null +++ b/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8_signed.cpp @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/core/Window.h" + +#include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +#ifndef __aarch64__ +// Fallback function to be used for armv7a, for aarch64 LUT is used +void neon_qasymm8_signed_elementwise_unary( + const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) +{ + ARM_COMPUTE_UNUSED(lut); + return elementwise_op<int8_t>(in, out, window, op); +} +#endif // #ifndef __aarch64__ + +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/elementwise_unary/generic/sve/fp16.cpp b/src/cpu/kernels/elementwise_unary/generic/sve/fp16.cpp new file mode 100644 index 0000000000..22ff43c5d9 --- /dev/null +++ b/src/cpu/kernels/elementwise_unary/generic/sve/fp16.cpp @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) +#include "arm_compute/core/Helpers.h" + +#include "src/cpu/CpuTypes.h" +#include "src/cpu/kernels/elementwise_unary/generic/sve/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void sve_fp16_elementwise_unary( + const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) +{ + ARM_COMPUTE_UNUSED(lut); + return elementwise_sve_op<float16_t>(in, out, window, op); +} +} // namespace cpu +} // namespace arm_compute +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/elementwise_unary/generic/sve/fp32.cpp b/src/cpu/kernels/elementwise_unary/generic/sve/fp32.cpp new file mode 100644 index 0000000000..394bd47adf --- /dev/null +++ b/src/cpu/kernels/elementwise_unary/generic/sve/fp32.cpp @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/Helpers.h" + +#include "src/cpu/CpuTypes.h" +#include "src/cpu/kernels/elementwise_unary/generic/sve/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void sve_fp32_elementwise_unary( + const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) +{ + ARM_COMPUTE_UNUSED(lut); + return elementwise_sve_op<float32_t>(in, out, window, op); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/elementwise_unary/generic/sve/impl.cpp b/src/cpu/kernels/elementwise_unary/generic/sve/impl.cpp new file mode 100644 index 0000000000..5af534d9e7 --- /dev/null +++ b/src/cpu/kernels/elementwise_unary/generic/sve/impl.cpp @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/utils/misc/Traits.h" + +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" + +namespace arm_compute +{ +namespace cpu +{ +template <typename ScalarType, typename VectorType> +inline typename std::enable_if<utils::traits::is_floating_point<ScalarType>::value, VectorType>::type +elementwise_op_sve_imp(svbool_t pg, ElementWiseUnary op, const VectorType &a) +{ + switch (op) + { + case ElementWiseUnary::RSQRT: + return svinvsqrt(pg, a); + case ElementWiseUnary::EXP: + return wrapper::svexp_z(pg, a); + case ElementWiseUnary::NEG: + return svneg_z(pg, a); + case ElementWiseUnary::LOG: + return wrapper::svlog_z(pg, a); + case ElementWiseUnary::ABS: + return svabs_z(pg, a); + case ElementWiseUnary::ROUND: + return svrintn_z(pg, a); + case ElementWiseUnary::SIN: + return wrapper::svsin_z(pg, a); + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED"); + } +} + +template <typename ScalarType, typename VectorType> +inline typename std::enable_if<std::is_integral<ScalarType>::value, VectorType>::type +elementwise_op_sve_imp(svbool_t pg, ElementWiseUnary op, const VectorType &a) +{ + switch (op) + { + case ElementWiseUnary::NEG: + return svneg_z(pg, a); + case ElementWiseUnary::ABS: + return svabs_z(pg, a); + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED"); + } +} + +template <typename ScalarType> +void elementwise_sve_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op) +{ + const auto all_true_pg = wrapper::svptrue<ScalarType>(); + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(in, win); + Iterator output(out, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr()); + const auto input_ptr = reinterpret_cast<const ScalarType *>(input.ptr()); + int x = window_start_x; + + svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x); + do + { + const auto vin = svld1(pg, input_ptr + x); + svst1(pg, output_ptr + x, elementwise_op_sve_imp<ScalarType, decltype(vin)>(pg, op, vin)); + x += wrapper::svcnt<ScalarType>(); + pg = wrapper::svwhilelt<ScalarType>(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); + }, + input, output); +} + +template void elementwise_sve_op<float16_t>(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op); +template void elementwise_sve_op<float32_t>(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op); +template void elementwise_sve_op<int32_t>(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op); + +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/elementwise_unary/generic/sve/impl.h b/src/cpu/kernels/elementwise_unary/generic/sve/impl.h new file mode 100644 index 0000000000..f2068dc63f --- /dev/null +++ b/src/cpu/kernels/elementwise_unary/generic/sve/impl.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2022 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CORE_SVE_KERNELS_ELEMENTWISE_UNARY_LIST_H +#define SRC_CORE_SVE_KERNELS_ELEMENTWISE_UNARY_LIST_H + +namespace arm_compute +{ +namespace cpu +{ +template <typename ScalarType> +void elementwise_sve_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op); +} // namespace cpu +} // namespace arm_compute + +#endif // SRC_CORE_SVE_KERNELS_ELEMENTWISE_UNARY_LIST_H
\ No newline at end of file diff --git a/src/cpu/kernels/elementwise_unary/generic/sve/integer.cpp b/src/cpu/kernels/elementwise_unary/generic/sve/integer.cpp new file mode 100644 index 0000000000..e27fe5a87f --- /dev/null +++ b/src/cpu/kernels/elementwise_unary/generic/sve/integer.cpp @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/Helpers.h" + +#include "src/cpu/kernels/elementwise_unary/generic/sve/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void sve_s32_elementwise_unary( + const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) +{ + ARM_COMPUTE_UNUSED(lut); + return elementwise_sve_op<int32_t>(in, out, window, op); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/elementwise_unary/generic/sve2/q8.cpp b/src/cpu/kernels/elementwise_unary/generic/sve2/q8.cpp new file mode 100644 index 0000000000..4e4582debb --- /dev/null +++ b/src/cpu/kernels/elementwise_unary/generic/sve2/q8.cpp @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/core/Helpers.h" + +#include "src/cpu/kernels/lut/list.h" + +namespace arm_compute +{ +namespace cpu +{ +void sve2_q8_elementwise_unary( + const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) +{ + ARM_COMPUTE_UNUSED(op); + + auto win = window; + const auto window_end_x = window.x().end(); + win.set(0, Window::Dimension(0, 1, 1)); + + Iterator src_it(in, win); + Iterator dst_it(out, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto src_ptr = src_it.ptr(); + auto dst_ptr = dst_it.ptr(); + + lut_u8_sve2(lut, 1, window_end_x, &src_ptr, &dst_ptr); + }, + src_it, dst_it); +} + +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/elementwise_unary/list.h b/src/cpu/kernels/elementwise_unary/list.h new file mode 100644 index 0000000000..a9701afdd8 --- /dev/null +++ b/src/cpu/kernels/elementwise_unary/list.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef SRC_CORE_KERNELS_ELEMETWISE_UNARY_LIST_H +#define SRC_CORE_KERNELS_ELEMETWISE_UNARY_LIST_H + +#include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h" +#include "src/cpu/kernels/elementwise_unary/generic/sve/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +#define DECLARE_ELEMETWISE_UNARY_KERNEL(func_name) \ + void func_name(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) + +DECLARE_ELEMETWISE_UNARY_KERNEL(sve_fp32_elementwise_unary); +DECLARE_ELEMETWISE_UNARY_KERNEL(sve_fp16_elementwise_unary); +DECLARE_ELEMETWISE_UNARY_KERNEL(sve_s32_elementwise_unary); +DECLARE_ELEMETWISE_UNARY_KERNEL(sve2_q8_elementwise_unary); +DECLARE_ELEMETWISE_UNARY_KERNEL(neon_fp32_elementwise_unary); +DECLARE_ELEMETWISE_UNARY_KERNEL(neon_fp16_elementwise_unary); +DECLARE_ELEMETWISE_UNARY_KERNEL(neon_s32_elementwise_unary); +DECLARE_ELEMETWISE_UNARY_KERNEL(neon_q8_elementwise_unary); +#ifndef __aarch64__ +DECLARE_ELEMETWISE_UNARY_KERNEL(neon_qasymm8_signed_elementwise_unary); +DECLARE_ELEMETWISE_UNARY_KERNEL(neon_qasymm8_elementwise_unary); +#endif // __aarch64__ +#undef DECLARE_ELEMETWISE_UNARY_KERNEL + +} // namespace cpu +} // namespace arm_compute +#endif // SRC_CORE_KERNELS_ELEMETWISE_UNARY_LIST_H diff --git a/src/cpu/kernels/floor/list.h b/src/cpu/kernels/floor/list.h new file mode 100644 index 0000000000..5ac78df324 --- /dev/null +++ b/src/cpu/kernels/floor/list.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CORE_NEON_KERNELS_FLOOR_LIST_H +#define SRC_CORE_NEON_KERNELS_FLOOR_LIST_H + +namespace arm_compute +{ +namespace cpu +{ +#define DECLARE_FLOOR_KERNEL(func_name) void func_name(const void *src, void *dst, int len) + +DECLARE_FLOOR_KERNEL(fp16_neon_floor); +DECLARE_FLOOR_KERNEL(fp32_neon_floor); + +#undef DECLARE_FLOOR_KERNEL +} // namespace cpu +} // namespace arm_compute + +#endif /* SRC_CORE_NEON_KERNELS_FLOOR_LIST_H */ diff --git a/src/cpu/kernels/floor/neon/fp16.cpp b/src/cpu/kernels/floor/neon/fp16.cpp new file mode 100644 index 0000000000..f47690277d --- /dev/null +++ b/src/cpu/kernels/floor/neon/fp16.cpp @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + +#include "src/common/utils/Validate.h" +#include "src/core/NEON/NEMath.h" + +#include <arm_neon.h> +#include <cmath> +#include <cstddef> + +namespace arm_compute +{ +namespace cpu +{ +constexpr int step = 8; + +void fp16_neon_floor(const void *src, void *dst, int len) +{ + ARM_COMPUTE_ASSERT_NOT_NULLPTR(src); + ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst); + ARM_COMPUTE_ASSERT(len >= 0); + + auto psrc = static_cast<const __fp16 *>(src); + auto pdst = static_cast<__fp16 *>(dst); + + for (; len >= step; len -= step) + { + vst1q_f16(pdst, vfloorq_f16(vld1q_f16(psrc))); + psrc += step; + pdst += step; + } + + for (; len > 0; --len) + { + *pdst = std::floor(*psrc); + ++psrc; + ++pdst; + } +} +} // namespace cpu +} // namespace arm_compute +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/floor/neon/fp32.cpp b/src/cpu/kernels/floor/neon/fp32.cpp new file mode 100644 index 0000000000..a86e24d3c3 --- /dev/null +++ b/src/cpu/kernels/floor/neon/fp32.cpp @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/common/utils/Validate.h" +#include "src/core/NEON/NEMath.h" + +#include <arm_neon.h> +#include <cmath> +#include <cstddef> + +namespace arm_compute +{ +namespace cpu +{ +constexpr int step = 4; + +void fp32_neon_floor(const void *src, void *dst, int len) +{ + ARM_COMPUTE_ASSERT_NOT_NULLPTR(src); + ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst); + ARM_COMPUTE_ASSERT(len >= 0); + + auto psrc = static_cast<const float *>(src); + auto pdst = static_cast<float *>(dst); + + for (; len >= step; len -= step) + { + vst1q_f32(pdst, vfloorq_f32(vld1q_f32(psrc))); + psrc += step; + pdst += step; + } + + for (; len > 0; --len) + { + *pdst = std::floor(*psrc); + ++pdst; + ++psrc; + } +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/fuse_batch_normalization/generic/fp16.cpp b/src/cpu/kernels/fuse_batch_normalization/generic/fp16.cpp new file mode 100644 index 0000000000..8f47ecba8f --- /dev/null +++ b/src/cpu/kernels/fuse_batch_normalization/generic/fp16.cpp @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2021-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + +#include "src/cpu/kernels/fuse_batch_normalization/generic/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void fused_batch_normalization_conv_f16(const ITensor *conv_weights, + const ITensor *conv_bias, + ITensor *fused_weights, + ITensor *fused_bias, + const ITensor *bn_mean, + const ITensor *bn_var, + const ITensor *bn_beta, + const ITensor *bn_gamma, + float epsilon, + const Window &window) +{ + return fused_batch_normalization_conv<float16_t>(conv_weights, conv_bias, fused_weights, fused_bias, bn_mean, + bn_var, bn_beta, bn_gamma, epsilon, window); +} + +void fused_batch_normalization_dwc_nchw_f16(const ITensor *dwc_weights, + const ITensor *dwc_bias, + ITensor *fused_weights, + ITensor *fused_bias, + const ITensor *bn_mean, + const ITensor *bn_var, + const ITensor *bn_beta, + const ITensor *bn_gamma, + float epsilon, + const Window &window) +{ + return fused_batch_normalization_dwc_nchw<float16_t>(dwc_weights, dwc_bias, fused_weights, fused_bias, bn_mean, + bn_var, bn_beta, bn_gamma, epsilon, window); +} +} // namespace cpu +} // namespace arm_compute +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/fuse_batch_normalization/generic/fp32.cpp b/src/cpu/kernels/fuse_batch_normalization/generic/fp32.cpp new file mode 100644 index 0000000000..3ca5b6977a --- /dev/null +++ b/src/cpu/kernels/fuse_batch_normalization/generic/fp32.cpp @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2021-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "src/cpu/kernels/fuse_batch_normalization/generic/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void fused_batch_normalization_conv_f32(const ITensor *conv_weights, + const ITensor *conv_bias, + ITensor *fused_weights, + ITensor *fused_bias, + const ITensor *bn_mean, + const ITensor *bn_var, + const ITensor *bn_beta, + const ITensor *bn_gamma, + float epsilon, + const Window &window) +{ + return fused_batch_normalization_conv<float32_t>(conv_weights, conv_bias, fused_weights, fused_bias, bn_mean, + bn_var, bn_beta, bn_gamma, epsilon, window); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/fuse_batch_normalization/generic/impl.h b/src/cpu/kernels/fuse_batch_normalization/generic/impl.h new file mode 100644 index 0000000000..0c90abccb1 --- /dev/null +++ b/src/cpu/kernels/fuse_batch_normalization/generic/impl.h @@ -0,0 +1,381 @@ +/* + * Copyright (c) 2021-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_CPU_KERNELS_FUSE_BATCH_NORMALIZATION_GENERIC_IMPL_H +#define ACL_SRC_CPU_KERNELS_FUSE_BATCH_NORMALIZATION_GENERIC_IMPL_H + +#include "arm_compute/core/Helpers.h" + +#include "src/core/NEON/wrapper/wrapper.h" + +namespace arm_compute +{ +namespace cpu +{ +template <typename T, bool fused_activation, typename F> +void batch_normalization_nchw(const Window &window, + ITensor *in, + ITensor *out, + const ITensor *in_mean, + const ITensor *in_var, + const ITensor *in_beta, + const ITensor *in_gamma, + float epsilon, + ActivationLayerInfo act_info) +{ + /** SIMD vector tag type. */ + using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>; + + const int window_step_x = 16 / sizeof(T); + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + Window win_to_use = window; + win_to_use.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(in, win_to_use); + Iterator output(out, win_to_use); + + F activation_functor(act_info); + + // Hold information about the current feature map we are iterating. + // Only compute denominator and constants once per feature map. 
+ int slice = -1; + + const auto input_mean = reinterpret_cast<const T *>(in_mean->ptr_to_element(Coordinates(0, 0))); + const auto input_var = reinterpret_cast<const T *>(in_var->ptr_to_element(Coordinates(0, 0))); + const auto input_gamma = + (in_gamma != nullptr) ? reinterpret_cast<const T *>(in_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr; + const auto input_beta = + (in_beta != nullptr) ? reinterpret_cast<const T *>(in_beta->ptr_to_element(Coordinates(0, 0))) : nullptr; + + T mean = static_cast<T>(0); + T var = static_cast<T>(0); + T gamma = static_cast<T>(1); + T beta = static_cast<T>(0); + T denominator = static_cast<T>(0); + + auto mean_vec = wrapper::vdup_n(mean, ExactTagType{}); + auto var_vec = wrapper::vdup_n(var, ExactTagType{}); + auto gamma_vec = wrapper::vdup_n(gamma, ExactTagType{}); + auto beta_vec = wrapper::vdup_n(beta, ExactTagType{}); + auto denominator_vec = wrapper::vdup_n(denominator, ExactTagType{}); + const auto epsilon_vec = wrapper::vdup_n(static_cast<T>(epsilon), ExactTagType{}); + execute_window_loop( + win_to_use, + [&](const Coordinates &id) + { + const auto input_ptr = reinterpret_cast<const T *>(input.ptr()); + const auto output_ptr = reinterpret_cast<T *>(output.ptr()); + + if (slice != id.z()) + { + mean = input_mean[id.z()]; + var = input_var[id.z()]; + mean_vec = wrapper::vdup_n(mean, ExactTagType{}); + var_vec = wrapper::vdup_n(var, ExactTagType{}); + if (input_gamma != nullptr) + { + gamma = input_gamma[id.z()]; + gamma_vec = wrapper::vdup_n(gamma, ExactTagType{}); + } + if (input_beta != nullptr) + { + beta = input_beta[id.z()]; + beta_vec = wrapper::vdup_n(beta, ExactTagType{}); + } + + // Calculate denominator + denominator_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec)); + denominator = wrapper::vgetlane(denominator_vec, 0); + slice = id.z(); + } + + // Perform core calculations using vector operations + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + // Calculate x bar + const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec); + const auto x_bar = wrapper::vmul(numerator, denominator_vec); + auto res = wrapper::vmla(beta_vec, x_bar, gamma_vec); + + // Perform fused activation + if (fused_activation) + { + activation_functor(res); + } + + // Store results + wrapper::vstore(output_ptr + x, res); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const T numerator = input_ptr[x] - mean; + const T x_bar = numerator * denominator; + T res = beta + x_bar * gamma; + + // Perform fused activation + if (fused_activation) + { + activation_functor(res); + } + + // Store results + *(output_ptr + x) = res; + } + }, + input, output); +} + +template <typename T> +void fused_batch_normalization_conv(const ITensor *conv_weights, + const ITensor *conv_bias, + ITensor *fused_weights, + ITensor *fused_bias, + const ITensor *bn_mean, + const ITensor *bn_var, + const ITensor *bn_beta, + const ITensor *bn_gamma, + float epsilon, + const Window &window) +{ + using ScalarType = T; + const int size = 16 / conv_weights->info()->element_size(); + using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>; + + const bool run_in_place_weights = (fused_weights == nullptr) || (fused_weights == conv_weights); + const bool run_in_place_bias = (fused_bias == nullptr) || (conv_bias != nullptr && fused_bias == conv_bias); + + // Set build options + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + 
+ const int window_step_x = size; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + Iterator conv_w_in(conv_weights, win); + Iterator conv_w_out(run_in_place_weights ? conv_weights : fused_weights, win); + + const auto conv_bias_in = + (conv_bias != nullptr ? reinterpret_cast<ScalarType *>(conv_bias->ptr_to_element(Coordinates(0, 0))) : nullptr); + auto conv_bias_out = + (run_in_place_bias ? conv_bias_in + : reinterpret_cast<ScalarType *>(fused_bias->ptr_to_element(Coordinates(0, 0)))); + + const auto input_mean = reinterpret_cast<const ScalarType *>(bn_mean->ptr_to_element(Coordinates(0, 0))); + const auto input_var = reinterpret_cast<const ScalarType *>(bn_var->ptr_to_element(Coordinates(0, 0))); + const auto input_gamma = (bn_gamma != nullptr) + ? reinterpret_cast<const ScalarType *>(bn_gamma->ptr_to_element(Coordinates(0, 0))) + : nullptr; + const auto input_beta = (bn_beta != nullptr) + ? reinterpret_cast<const ScalarType *>(bn_beta->ptr_to_element(Coordinates(0, 0))) + : nullptr; + + auto mean_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{}); + auto var_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{}); + auto gamma_vec = wrapper::vdup_n(ScalarType(1), ExactTagType{}); + auto beta_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{}); + auto rvar_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{}); + const auto epsilon_vec = wrapper::vdup_n(ScalarType(epsilon), ExactTagType{}); + + auto mean = ScalarType(0.0); + auto var = ScalarType(0.0); + auto gamma = ScalarType(1.0); + auto beta = ScalarType(0.0); + auto conv_bias_in_scalar = ScalarType(0.0); + execute_window_loop( + win, + [&](const Coordinates &id) + { + var = input_var[id[3]]; + if (input_gamma != nullptr) + { + gamma = input_gamma[id[3]]; + } + + if ((id[0] == 0) && (id[1] == 0) && (id[2] == 0)) + { + if (input_beta != nullptr) + { + beta = input_beta[id[3]]; + beta_vec = wrapper::vdup_n(beta, ExactTagType{}); + } + + // Construct vectors + mean = input_mean[id[3]]; + mean_vec = wrapper::vdup_n(mean, ExactTagType{}); + + if (conv_bias_in != nullptr) + { + conv_bias_in_scalar = conv_bias_in[id[3]]; + } + auto conv_bias_tmp_scalar = (conv_bias_in_scalar - mean) / std::sqrt(var + ScalarType(epsilon)); + conv_bias_out[id[3]] = (conv_bias_tmp_scalar * gamma) + beta; + } + + int x = window_start_x; + auto conv_w_in_ptr = reinterpret_cast<const ScalarType *>(conv_w_in.ptr()); + auto conv_w_out_ptr = reinterpret_cast<ScalarType *>(conv_w_out.ptr()); + var_vec = wrapper::vdup_n(var, ExactTagType{}); + gamma_vec = wrapper::vdup_n(gamma, ExactTagType{}); + rvar_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec)); + + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + auto wn = wrapper::vloadq(conv_w_in_ptr + x); + wn = wrapper::vmul(wn, rvar_vec); + wn = wrapper::vmul(wn, gamma_vec); + + // Store results + wrapper::vstore(conv_w_out_ptr + x, wn); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(conv_w_out_ptr + x) = *(conv_w_in_ptr + x) / std::sqrt(var + ScalarType(epsilon)) * gamma; + } + }, + conv_w_in, conv_w_out); +} +template <typename T> +void fused_batch_normalization_dwc_nchw(const ITensor *dwc_weights, + const ITensor *dwc_bias, + ITensor *fused_weights, + ITensor *fused_bias, + const ITensor *bn_mean, + const ITensor *bn_var, + const ITensor *bn_beta, + const ITensor *bn_gamma, + float epsilon, + const Window &window) +{ + using ScalarType = T; + const int size = 16 / 
dwc_weights->info()->element_size(); + using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>; + + const bool run_in_place_weights = (fused_weights == nullptr) || (fused_weights == dwc_weights); + const bool run_in_place_bias = (fused_bias == nullptr) || (dwc_bias != nullptr && fused_bias == dwc_bias); + + // Set build options + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const int window_step_x = size; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + Iterator dwc_w_in(dwc_weights, win); + Iterator dwc_w_out(run_in_place_weights ? dwc_weights : fused_weights, win); + + const auto dwc_bias_in = + (dwc_bias != nullptr ? reinterpret_cast<ScalarType *>(dwc_bias->ptr_to_element(Coordinates(0, 0))) : nullptr); + auto dwc_bias_out = + (run_in_place_bias ? dwc_bias_in + : reinterpret_cast<ScalarType *>(fused_bias->ptr_to_element(Coordinates(0, 0)))); + + const auto input_mean = reinterpret_cast<const ScalarType *>(bn_mean->ptr_to_element(Coordinates(0, 0))); + const auto input_var = reinterpret_cast<const ScalarType *>(bn_var->ptr_to_element(Coordinates(0, 0))); + const auto input_gamma = (bn_gamma != nullptr) + ? reinterpret_cast<const ScalarType *>(bn_gamma->ptr_to_element(Coordinates(0, 0))) + : nullptr; + const auto input_beta = (bn_beta != nullptr) + ? reinterpret_cast<const ScalarType *>(bn_beta->ptr_to_element(Coordinates(0, 0))) + : nullptr; + + auto mean_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{}); + auto var_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{}); + auto gamma_vec = wrapper::vdup_n(ScalarType(1), ExactTagType{}); + auto beta_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{}); + auto rvar_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{}); + const auto epsilon_vec = wrapper::vdup_n(ScalarType(epsilon), ExactTagType{}); + + auto mean = ScalarType(0.0); + auto var = ScalarType(0.0); + auto gamma = ScalarType(1.0); + auto beta = ScalarType(0.0); + auto dwc_bias_in_scalar = ScalarType(0.0); + execute_window_loop( + win, + [&](const Coordinates &id) + { + var = input_var[id[2]]; + if (input_gamma != nullptr) + { + gamma = input_gamma[id[2]]; + } + + if (id[1] == 0) + { + mean = input_mean[id[2]]; + + // Construct vectors + mean_vec = wrapper::vdup_n(mean, ExactTagType{}); + if (input_beta != nullptr) + { + beta = input_beta[id[2]]; + beta_vec = wrapper::vdup_n(beta, ExactTagType{}); + } + + if (dwc_bias_in != nullptr) + { + dwc_bias_in_scalar = dwc_bias_in[id[2]]; + } + + auto dwc_bias_tmp_scalar = (dwc_bias_in_scalar - mean) / std::sqrt(var + ScalarType(epsilon)); + dwc_bias_out[id[2]] = (dwc_bias_tmp_scalar * gamma) + beta; + } + + int x = window_start_x; + auto dwc_w_in_ptr = reinterpret_cast<const ScalarType *>(dwc_w_in.ptr()); + auto dwc_w_out_ptr = reinterpret_cast<ScalarType *>(dwc_w_out.ptr()); + var_vec = wrapper::vdup_n(var, ExactTagType{}); + gamma_vec = wrapper::vdup_n(gamma, ExactTagType{}); + rvar_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec)); + + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + auto wn = wrapper::vloadq(dwc_w_in_ptr + x); + wn = wrapper::vmul(wn, rvar_vec); + wn = wrapper::vmul(wn, gamma_vec); + + // Store results + wrapper::vstore(dwc_w_out_ptr + x, wn); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dwc_w_out_ptr + x) = *(dwc_w_in_ptr + x) / std::sqrt(var + ScalarType(epsilon)) * gamma; + } + }, + 
dwc_w_in, dwc_w_out); +} + +} // namespace cpu +} // namespace arm_compute +#endif // ACL_SRC_CPU_KERNELS_FUSE_BATCH_NORMALIZATION_GENERIC_IMPL_H diff --git a/src/cpu/kernels/fuse_batch_normalization/list.h b/src/cpu/kernels/fuse_batch_normalization/list.h new file mode 100644 index 0000000000..a03dd74f78 --- /dev/null +++ b/src/cpu/kernels/fuse_batch_normalization/list.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2021-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CORE_NEON_KERNELS_FUSE_BATCH_NORMALIZATION_LIST_H +#define SRC_CORE_NEON_KERNELS_FUSE_BATCH_NORMALIZATION_LIST_H + +namespace arm_compute +{ +namespace cpu +{ +#define DECLARE_FUSE_BATCH_NORMALIZE_CONV_KERNEL(func_name) \ + void func_name(const ITensor *conv_weights, const ITensor *conv_bias, ITensor *fused_weights, ITensor *fused_bias, \ + const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, \ + float epsilon, const Window &window) + +#define DECLARE_FUSE_BATCH_NORMALIZE_DWC_NCHW_CONV_KERNEL(func_name) \ + void func_name(const ITensor *dwc_weights, const ITensor *dwc_bias, ITensor *fused_weights, ITensor *fused_bias, \ + const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, \ + float epsilon, const Window &window) + +#define DECLARE_FUSE_BATCH_NORMALIZE_DWC_NHWC_CONV_KERNEL(func_name) \ + void func_name(const ITensor *dwc_weights, const ITensor *dwc_bias, ITensor *fused_weights, ITensor *fused_bias, \ + const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, \ + float epsilon, const Window &window) + +DECLARE_FUSE_BATCH_NORMALIZE_CONV_KERNEL(fused_batch_normalization_conv_f16); +DECLARE_FUSE_BATCH_NORMALIZE_CONV_KERNEL(fused_batch_normalization_conv_f32); +DECLARE_FUSE_BATCH_NORMALIZE_DWC_NHWC_CONV_KERNEL(fused_batch_normalization_dwc_nhwc_f16); +DECLARE_FUSE_BATCH_NORMALIZE_DWC_NHWC_CONV_KERNEL(fused_batch_normalization_dwc_nhwc_f32); +DECLARE_FUSE_BATCH_NORMALIZE_DWC_NCHW_CONV_KERNEL(fused_batch_normalization_dwc_nchw_f16); +DECLARE_FUSE_BATCH_NORMALIZE_DWC_NCHW_CONV_KERNEL(fused_batch_normalization_dwc_nchw_f32); + +#undef DECLARE_FUSE_BATCH_NORMALIZE_CONV_KERNEL +#undef DECLARE_FUSE_BATCH_NORMALIZE_DWC_NCHW_CONV_KERNEL +#undef DECLARE_FUSE_BATCH_NORMALIZE_DWC_NHWC_CONV_KERNEL +} // namespace cpu +} // namespace arm_compute + +#endif // diff --git a/src/cpu/kernels/fuse_batch_normalization/nchw/all.cpp 
b/src/cpu/kernels/fuse_batch_normalization/nchw/all.cpp new file mode 100644 index 0000000000..25580e1bec --- /dev/null +++ b/src/cpu/kernels/fuse_batch_normalization/nchw/all.cpp @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2018-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/cpu/kernels/fuse_batch_normalization/generic/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void fused_batch_normalization_dwc_nchw_f32(const ITensor *dwc_weights, + const ITensor *dwc_bias, + ITensor *fused_weights, + ITensor *fused_bias, + const ITensor *bn_mean, + const ITensor *bn_var, + const ITensor *bn_beta, + const ITensor *bn_gamma, + float epsilon, + const Window &window) +{ + return fused_batch_normalization_dwc_nchw<float32_t>(dwc_weights, dwc_bias, fused_weights, fused_bias, bn_mean, + bn_var, bn_beta, bn_gamma, epsilon, window); +} + +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/fuse_batch_normalization/nchw/neon/fp16.cpp b/src/cpu/kernels/fuse_batch_normalization/nchw/neon/fp16.cpp new file mode 100644 index 0000000000..ae4c7e5736 --- /dev/null +++ b/src/cpu/kernels/fuse_batch_normalization/nchw/neon/fp16.cpp @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" + +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/CpuTypes.h" +#include "src/cpu/kernels/fuse_batch_normalization/generic/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void fp16_batch_normalization_nchw_non_fused(const Window &window, + ITensor *input, + ITensor *output, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, + ActivationLayerInfo act_info) +{ + batch_normalization_nchw<float16_t, false, detail::dummy<float16_t, 8>>(window, input, output, mean, var, beta, + gamma, epsilon, act_info); +} + +void fp16_batch_normalization_nchw_non_fused_relu(const Window &window, + ITensor *input, + ITensor *output, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, + ActivationLayerInfo act_info) +{ + batch_normalization_nchw<float16_t, true, detail::relu<float16_t, 8>>(window, input, output, mean, var, beta, gamma, + epsilon, act_info); +} + +void fp16_batch_normalization_nchw_non_fused_brelu(const Window &window, + ITensor *input, + ITensor *output, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, + ActivationLayerInfo act_info) +{ + batch_normalization_nchw<float16_t, true, detail::brelu<float16_t, 8>>(window, input, output, mean, var, beta, + gamma, epsilon, act_info); +} + +void fp16_batch_normalization_nchw_non_fused_lubrelu(const Window &window, + ITensor *input, + ITensor *output, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, + ActivationLayerInfo act_info) +{ + batch_normalization_nchw<float16_t, true, detail::lubrelu<float16_t, 8>>(window, input, output, mean, var, beta, + gamma, epsilon, act_info); +} +} // namespace cpu +} // namespace arm_compute +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/fuse_batch_normalization/nchw/neon/fp32.cpp b/src/cpu/kernels/fuse_batch_normalization/nchw/neon/fp32.cpp new file mode 100644 index 0000000000..ae2db1ac66 --- /dev/null +++ b/src/cpu/kernels/fuse_batch_normalization/nchw/neon/fp32.cpp @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" + +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/CpuTypes.h" +#include "src/cpu/kernels/fuse_batch_normalization/generic/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void fp32_batch_normalization_nchw_non_fused(const Window &window, + ITensor *input, + ITensor *output, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, + ActivationLayerInfo act_info) +{ + batch_normalization_nchw<float, false, detail::dummy<float, 4>>(window, input, output, mean, var, beta, gamma, + epsilon, act_info); +} + +void fp32_batch_normalization_nchw_non_fused_relu(const Window &window, + ITensor *input, + ITensor *output, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, + ActivationLayerInfo act_info) +{ + batch_normalization_nchw<float, true, detail::relu<float, 4>>(window, input, output, mean, var, beta, gamma, + epsilon, act_info); +} + +void fp32_batch_normalization_nchw_non_fused_brelu(const Window &window, + ITensor *input, + ITensor *output, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, + ActivationLayerInfo act_info) +{ + batch_normalization_nchw<float, true, detail::brelu<float, 4>>(window, input, output, mean, var, beta, gamma, + epsilon, act_info); +} + +void fp32_batch_normalization_nchw_non_fused_lubrelu(const Window &window, + ITensor *input, + ITensor *output, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, + ActivationLayerInfo act_info) +{ + batch_normalization_nchw<float, true, detail::lubrelu<float, 4>>(window, input, output, mean, var, beta, gamma, + epsilon, act_info); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp16.cpp b/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp16.cpp new file mode 100644 index 0000000000..1d88d3b494 --- /dev/null +++ b/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp16.cpp @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2021-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + +#include "src/cpu/kernels/fuse_batch_normalization/generic/impl.h" +#include "src/cpu/kernels/fuse_batch_normalization/nhwc/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void fused_batch_normalization_dwc_nhwc_f16(const ITensor *dwc_weights, + const ITensor *dwc_bias, + ITensor *fused_weights, + ITensor *fused_bias, + const ITensor *bn_mean, + const ITensor *bn_var, + const ITensor *bn_beta, + const ITensor *bn_gamma, + float epsilon, + const Window &window) +{ + return fused_batch_normalization_dwc_nhwc<float16_t>(dwc_weights, dwc_bias, fused_weights, fused_bias, bn_mean, + bn_var, bn_beta, bn_gamma, epsilon, window); +} + +} // namespace cpu +} // namespace arm_compute +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp32.cpp b/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp32.cpp new file mode 100644 index 0000000000..1f336bb196 --- /dev/null +++ b/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp32.cpp @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2021-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "src/cpu/kernels/fuse_batch_normalization/generic/impl.h" +#include "src/cpu/kernels/fuse_batch_normalization/nhwc/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void fused_batch_normalization_dwc_nhwc_f32(const ITensor *dwc_weights, + const ITensor *dwc_bias, + ITensor *fused_weights, + ITensor *fused_bias, + const ITensor *bn_mean, + const ITensor *bn_var, + const ITensor *bn_beta, + const ITensor *bn_gamma, + float epsilon, + const Window &window) +{ + return fused_batch_normalization_dwc_nhwc<float32_t>(dwc_weights, dwc_bias, fused_weights, fused_bias, bn_mean, + bn_var, bn_beta, bn_gamma, epsilon, window); +} + +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/impl.h b/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/impl.h new file mode 100644 index 0000000000..5b74a7aef6 --- /dev/null +++ b/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/impl.h @@ -0,0 +1,173 @@ +/* + * Copyright (c) 2018-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CORE_NEON_KERNELS_FUSE_BATCH_NORMALIZATION_IMPL_H +#define SRC_CORE_NEON_KERNELS_FUSE_BATCH_NORMALIZATION_IMPL_H + +#include "arm_compute/core/Helpers.h" + +#include "src/core/NEON/wrapper/wrapper.h" + +namespace arm_compute +{ +namespace cpu +{ +template <typename T> +void fused_batch_normalization_dwc_nhwc(const ITensor *dwc_weights, + const ITensor *dwc_bias, + ITensor *fused_weights, + ITensor *fused_bias, + const ITensor *bn_mean, + const ITensor *bn_var, + const ITensor *bn_beta, + const ITensor *bn_gamma, + float epsilon, + const Window &window) +{ + using ScalarType = T; + const int size = 16 / dwc_weights->info()->element_size(); + using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>; + + const bool run_in_place_weights = (fused_weights == nullptr) || (fused_weights == dwc_weights); + const bool run_in_place_bias = (fused_bias == nullptr) || (dwc_bias != nullptr && fused_bias == dwc_bias); + + // Set build options + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const int window_step_x = size; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + Iterator dwc_w_in(dwc_weights, win); + Iterator dwc_w_out(run_in_place_weights ? 
dwc_weights : fused_weights, win); + + const auto dwc_bias_in = + (dwc_bias != nullptr ? reinterpret_cast<ScalarType *>(dwc_bias->ptr_to_element(Coordinates(0, 0))) : nullptr); + auto dwc_bias_out = + (run_in_place_bias ? dwc_bias_in + : reinterpret_cast<ScalarType *>(fused_bias->ptr_to_element(Coordinates(0, 0)))); + + const auto input_mean = reinterpret_cast<const ScalarType *>(bn_mean->ptr_to_element(Coordinates(0, 0))); + const auto input_var = reinterpret_cast<const ScalarType *>(bn_var->ptr_to_element(Coordinates(0, 0))); + const auto input_gamma = (bn_gamma != nullptr) + ? reinterpret_cast<const ScalarType *>(bn_gamma->ptr_to_element(Coordinates(0, 0))) + : nullptr; + const auto input_beta = (bn_beta != nullptr) + ? reinterpret_cast<const ScalarType *>(bn_beta->ptr_to_element(Coordinates(0, 0))) + : nullptr; + + auto mean_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{}); + auto var_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{}); + auto gamma_vec = wrapper::vdup_n(ScalarType(1), ExactTagType{}); + auto beta_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{}); + auto rvar_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{}); + auto dwc_bias_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{}); + const auto epsilon_vec = wrapper::vdup_n(ScalarType(epsilon), ExactTagType{}); + + auto gamma = ScalarType(1.0); + auto beta = ScalarType(0.0); + auto dwc_bias_in_scalar = ScalarType(0); + + execute_window_loop( + win, + [&](const Coordinates &id) + { + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + var_vec = wrapper::vloadq(input_var + x); + if (input_gamma != nullptr) + { + gamma_vec = wrapper::vloadq(input_gamma + x); + } + + if ((id[2] == 0) && (id[1] == 0)) + { + mean_vec = wrapper::vloadq(input_mean + x); + + // Construct vectors + if (input_beta != nullptr) + { + beta_vec = wrapper::vloadq(input_beta + x); + } + + if (dwc_bias_in != nullptr) + { + dwc_bias_vec = wrapper::vloadq(dwc_bias_in + x); + } + + auto dwc_bias_tmp_vec = wrapper::vmul(wrapper::vsub(dwc_bias_vec, mean_vec), + wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec))); + dwc_bias_tmp_vec = wrapper::vadd(wrapper::vmul(dwc_bias_tmp_vec, gamma_vec), beta_vec); + wrapper::vstore(dwc_bias_out + x, dwc_bias_tmp_vec); + } + + auto dwc_w_in_ptr = reinterpret_cast<const ScalarType *>(dwc_w_in.ptr()); + auto dwc_w_out_ptr = reinterpret_cast<ScalarType *>(dwc_w_out.ptr()); + + auto wn = wrapper::vloadq(dwc_w_in_ptr + x); + rvar_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec)); + wn = wrapper::vmul(wn, rvar_vec); + wn = wrapper::vmul(wn, gamma_vec); + + // Store results + wrapper::vstore(dwc_w_out_ptr + x, wn); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + auto var = input_var[x]; + if (input_gamma != nullptr) + { + gamma = input_gamma[x]; + } + + if (id[2] == 0 && id[1] == 0) + { + auto mean = input_mean[x]; + if (input_beta != nullptr) + { + beta = input_beta[x]; + } + if (dwc_bias_in != nullptr) + { + dwc_bias_in_scalar = dwc_bias_in[x]; + } + + auto dwc_bias_tmp_scalar = (dwc_bias_in_scalar - mean) / std::sqrt(var + ScalarType(epsilon)); + dwc_bias_out[x] = (dwc_bias_tmp_scalar * gamma) + beta; + } + + const auto dwc_w_in_ptr = reinterpret_cast<const ScalarType *>(dwc_w_in.ptr()); + auto dwc_w_out_ptr = reinterpret_cast<ScalarType *>(dwc_w_out.ptr()); + + *(dwc_w_out_ptr + x) = *(dwc_w_in_ptr + x) / std::sqrt(var + ScalarType(epsilon)) * gamma; + } + }, + dwc_w_in, dwc_w_out); +} +} // namespace cpu +} // namespace arm_compute 
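+// For reference, the NHWC depthwise fusion above is roughly the standard per-channel
+// batch-normalization folding. With mean m, variance v, scale g (gamma, defaulting to 1)
+// and offset b (beta, defaulting to 0), each channel c is transformed as:
+//   fused_weights[c] = dwc_weights[c] * g[c] / sqrt(v[c] + epsilon)
+//   fused_bias[c]    = (dwc_bias[c] - m[c]) * g[c] / sqrt(v[c] + epsilon) + b[c]
+// The vectorized path computes the reciprocal square root with vinvsqrt/vmul, while the
+// left-over path uses the scalar std::sqrt expression.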
+#endif //SRC_CORE_NEON_KERNELS_FUSE_BATCH_NORMALIZATION_IMPL_H diff --git a/src/cpu/kernels/gemm_matrix_add/generic/neon/fp16.cpp b/src/cpu/kernels/gemm_matrix_add/generic/neon/fp16.cpp new file mode 100644 index 0000000000..4d7507a5da --- /dev/null +++ b/src/cpu/kernels/gemm_matrix_add/generic/neon/fp16.cpp @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + +#include "src/cpu/kernels/gemm_matrix_add/generic/neon/impl.h" + +#include <arm_neon.h> + +namespace arm_compute +{ +namespace cpu +{ +namespace +{ +void matrix_addition_f16(const ITensor *src, ITensor *dst, const Window &window, float beta) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + const float16x8_t beta_f16 = vdupq_n_f16(beta); + + constexpr int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + Window win = window.collapse_if_possible(window, Window::DimZ); + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator in(src, win); + Iterator out(dst, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto in_ptr = reinterpret_cast<const float16_t *>(in.ptr()); + const auto out_ptr = reinterpret_cast<float16_t *>(out.ptr()); + + int x = window_start_x; + for (; x < (window_end_x - window_step_x); x += window_step_x) + { + float16x8x2_t alpha_ab = vld2q_f16(out_ptr + x); + const float16x8x2_t c = vld2q_f16(in_ptr + x); + // Multiply matrix C by its weight and accumulate + alpha_ab.val[0] = vaddq_f16(alpha_ab.val[0], vmulq_f16(c.val[0], beta_f16)); + alpha_ab.val[1] = vaddq_f16(alpha_ab.val[1], vmulq_f16(c.val[1], beta_f16)); + + vst2q_f16(out_ptr + x, alpha_ab); + } + + // Left-over loop + for (; x < window_end_x; ++x) + { + *(out_ptr + x) += *(in_ptr + x) * static_cast<float16_t>(beta); + } + }, + in, out); +} +} // namespace +void neon_fp16_gemm_matrix_add(const ITensor *src, ITensor *dst, const Window &window, float beta) +{ + return matrix_addition_f16(src, dst, window, beta); +} +} // namespace cpu +} // namespace arm_compute +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/gemm_matrix_add/generic/neon/fp32.cpp b/src/cpu/kernels/gemm_matrix_add/generic/neon/fp32.cpp new file mode 100644 index 
0000000000..fa3f4de11f --- /dev/null +++ b/src/cpu/kernels/gemm_matrix_add/generic/neon/fp32.cpp @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/cpu/kernels/gemm_matrix_add/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void neon_fp32_gemm_matrix_add(const ITensor *src, ITensor *dst, const Window &window, float beta) +{ + return matrix_addition_f32(src, dst, window, beta); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/gemm_matrix_add/generic/neon/impl.cpp b/src/cpu/kernels/gemm_matrix_add/generic/neon/impl.cpp new file mode 100644 index 0000000000..47de0f3928 --- /dev/null +++ b/src/cpu/kernels/gemm_matrix_add/generic/neon/impl.cpp @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2016-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "src/cpu/kernels/gemm_matrix_add/generic/neon/impl.h" + +#include <arm_neon.h> + +namespace arm_compute +{ +namespace cpu +{ +void matrix_addition_f32(const ITensor *src, ITensor *dst, const Window &window, float beta) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + const float32x4_t beta_f32 = vdupq_n_f32(beta); + + constexpr int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + Window win = window.collapse_if_possible(window, Window::DimZ); + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator in(src, win); + Iterator out(dst, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto in_ptr = reinterpret_cast<const float *>(in.ptr()); + const auto out_ptr = reinterpret_cast<float *>(out.ptr()); + + int x = window_start_x; + for (; x < (window_end_x - window_step_x); x += window_step_x) + { + float32x4x4_t alpha_ab = vld4q_f32(out_ptr + x); + const float32x4x4_t c = vld4q_f32(in_ptr + x); + + // Multiply matrix C by its weight and accumulate + alpha_ab.val[0] = vmlaq_f32(alpha_ab.val[0], c.val[0], beta_f32); + alpha_ab.val[1] = vmlaq_f32(alpha_ab.val[1], c.val[1], beta_f32); + alpha_ab.val[2] = vmlaq_f32(alpha_ab.val[2], c.val[2], beta_f32); + alpha_ab.val[3] = vmlaq_f32(alpha_ab.val[3], c.val[3], beta_f32); + + vst4q_f32(out_ptr + x, alpha_ab); + } + + // Left-over loop + for (; x < window_end_x; ++x) + { + *(out_ptr + x) += *(in_ptr + x) * beta; + } + }, + in, out); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/gemm_matrix_add/generic/neon/impl.h b/src/cpu/kernels/gemm_matrix_add/generic/neon/impl.h new file mode 100644 index 0000000000..26ac99b483 --- /dev/null +++ b/src/cpu/kernels/gemm_matrix_add/generic/neon/impl.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef SRC_CORE_KERNELS_GEMMMATRIXADD_IMPL_H +#define SRC_CORE_KERNELS_GEMMMATRIXADD_IMPL_H + +#include "arm_compute/core/Helpers.h" + +namespace arm_compute +{ +namespace cpu +{ +void matrix_addition_f32(const ITensor *src, ITensor *dst, const Window &window, float beta); + +} // namespace cpu +} // namespace arm_compute +#endif //define SRC_CORE_KERNELS_GEMMMATRIXADD_IMPL_H diff --git a/src/cpu/kernels/gemm_matrix_add/list.h b/src/cpu/kernels/gemm_matrix_add/list.h new file mode 100644 index 0000000000..415b4c8321 --- /dev/null +++ b/src/cpu/kernels/gemm_matrix_add/list.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CORE_NEON_KERNELS_GEMMMATRIXADD_LIST_H +#define SRC_CORE_NEON_KERNELS_GEMMMATRIXADD_LIST_H +namespace arm_compute +{ +namespace cpu +{ +#define DECLARE_GEMMMATRIXADD_KERNEL(func_name) \ + void func_name(const ITensor *src, ITensor *dst, const Window &window, float beta) +DECLARE_GEMMMATRIXADD_KERNEL(neon_fp32_gemm_matrix_add); +DECLARE_GEMMMATRIXADD_KERNEL(neon_fp16_gemm_matrix_add); +#undef DECLARE_GEMMMATRIXADD_KERNEL +} // namespace cpu +} // namespace arm_compute +#endif //SRC_CORE_NEON_KERNELS_GEMMMATRIXADD_LIST_H diff --git a/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp16.cpp b/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp16.cpp new file mode 100644 index 0000000000..60fda511e3 --- /dev/null +++ b/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp16.cpp @@ -0,0 +1,418 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +#include "src/core/utils/helpers/float_ops.h" +#include "src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.h" + +#include <arm_neon.h> + +namespace arm_compute +{ +namespace cpu +{ +void vector_matrix_multiply_f16( + const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha) +{ + const auto width_matrix_b = static_cast<int>(dst->info()->dimension(0)); + const auto in_b_stride = static_cast<int>(rhs->info()->strides_in_bytes()[1] / rhs->info()->element_size()); + const auto num_elems_vec_a = static_cast<int>(lhs->info()->dimension(0)); + + // The implementation computes 32 elements per iteration + const int window_start_x = 32 * info.thread_id; + const int window_step_x = 32 * info.num_threads; + const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x; + ARM_COMPUTE_ERROR_ON_MSG((window_end_x - window_start_x) % window_step_x, + " (window_end_x - window_start_x) must be multiple of window_step_x"); + + Window win_out(window); + win_out.set(Window::DimX, Window::Dimension(0, 1, 1)); + win_out.set(Window::DimY, Window::Dimension(0, 1, 1)); + + Window win_a(window); + win_a.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_a.set(Window::DimY, Window::Dimension(0, 0, 0)); + + Window win_b; + // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 + // This scenario can happen when the the matrix multiplication is used to perform a convolution operation + if (rhs->info()->num_dimensions() >= 3) + { + win_b = window; + } + win_b.set(Window::DimX, Window::Dimension(0, 1, 1)); + win_b.set(Window::DimY, Window::Dimension(0, 1, 1)); + + Iterator ina(lhs, win_a); + Iterator inb(rhs, win_b); + Iterator out(dst, win_out); + + const bool multiply_alpha = !(helpers::float_ops::is_one(alpha)); + + const float16x8_t alpha_f16 = vdupq_n_f16(alpha); + + execute_window_loop( + win_out, + [&](const Coordinates &) + { + int x = window_start_x; + // Here we don't check for x lower equal than (window_end_x - window_step_x) because of + // window_end_x is computed above which may cause out-of-bound writes to the dst. 
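+            // Each thread starts at column 32 * thread_id and advances by 32 * num_threads,
+            // and window_end_x is rounded up to a multiple of window_step_x above; the
+            // 'x > width_matrix_b' checks inside the loops are what stop a thread before it
+            // would write past the last column of dst.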
+ for (; x < (window_end_x - window_step_x); x += window_step_x) + { + if (x > width_matrix_b) + { + return; + } + + auto matrix_b = reinterpret_cast<const float16_t *>(inb.ptr()) + x; + + float16x8_t acc0 = vdupq_n_f16(0.f); + float16x8_t acc1 = vdupq_n_f16(0.f); + float16x8_t acc2 = vdupq_n_f16(0.f); + float16x8_t acc3 = vdupq_n_f16(0.f); + + auto vec_a = reinterpret_cast<const float16_t *>(ina.ptr()); + const float16_t *vec_a_end_addr = vec_a + num_elems_vec_a; + for (; vec_a <= (vec_a_end_addr - 4);) + { + const float16x4_t a0l = vld1_f16(vec_a); + + float16x8_t b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride); + float16x8_t b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride); + float16x8_t b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride); + float16x8_t b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride); + float16x8_t b10 = vld1q_f16(matrix_b + 0 + 1 * in_b_stride); + float16x8_t b11 = vld1q_f16(matrix_b + 8 + 1 * in_b_stride); + float16x8_t b12 = vld1q_f16(matrix_b + 16 + 1 * in_b_stride); + float16x8_t b13 = vld1q_f16(matrix_b + 24 + 1 * in_b_stride); + + acc0 = vaddq_f16(acc0, vmulq_lane_f16(b00, a0l, 0)); + acc1 = vaddq_f16(acc1, vmulq_lane_f16(b01, a0l, 0)); + acc2 = vaddq_f16(acc2, vmulq_lane_f16(b02, a0l, 0)); + acc3 = vaddq_f16(acc3, vmulq_lane_f16(b03, a0l, 0)); + acc0 = vaddq_f16(acc0, vmulq_lane_f16(b10, a0l, 1)); + acc1 = vaddq_f16(acc1, vmulq_lane_f16(b11, a0l, 1)); + acc2 = vaddq_f16(acc2, vmulq_lane_f16(b12, a0l, 1)); + acc3 = vaddq_f16(acc3, vmulq_lane_f16(b13, a0l, 1)); + + matrix_b += 2 * in_b_stride; + + b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride); + b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride); + b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride); + b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride); + b10 = vld1q_f16(matrix_b + 0 + 1 * in_b_stride); + b11 = vld1q_f16(matrix_b + 8 + 1 * in_b_stride); + b12 = vld1q_f16(matrix_b + 16 + 1 * in_b_stride); + b13 = vld1q_f16(matrix_b + 24 + 1 * in_b_stride); + + acc0 = vaddq_f16(acc0, vmulq_lane_f16(b00, a0l, 2)); + acc1 = vaddq_f16(acc1, vmulq_lane_f16(b01, a0l, 2)); + acc2 = vaddq_f16(acc2, vmulq_lane_f16(b02, a0l, 2)); + acc3 = vaddq_f16(acc3, vmulq_lane_f16(b03, a0l, 2)); + acc0 = vaddq_f16(acc0, vmulq_lane_f16(b10, a0l, 3)); + acc1 = vaddq_f16(acc1, vmulq_lane_f16(b11, a0l, 3)); + acc2 = vaddq_f16(acc2, vmulq_lane_f16(b12, a0l, 3)); + acc3 = vaddq_f16(acc3, vmulq_lane_f16(b13, a0l, 3)); + + vec_a += 4; + matrix_b += 2 * in_b_stride; + } + + for (; vec_a < vec_a_end_addr; ++vec_a) + { + const float16_t a0 = *vec_a; + const float16x8_t b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride); + const float16x8_t b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride); + const float16x8_t b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride); + const float16x8_t b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride); + + acc0 = vaddq_f16(acc0, vmulq_n_f16(b00, a0)); + acc1 = vaddq_f16(acc1, vmulq_n_f16(b01, a0)); + acc2 = vaddq_f16(acc2, vmulq_n_f16(b02, a0)); + acc3 = vaddq_f16(acc3, vmulq_n_f16(b03, a0)); + + matrix_b += in_b_stride; + } + + // Multiply by the weight of matrix product (alpha) + if (multiply_alpha) + { + acc0 = vmulq_f16(acc0, alpha_f16); + acc1 = vmulq_f16(acc1, alpha_f16); + acc2 = vmulq_f16(acc2, alpha_f16); + acc3 = vmulq_f16(acc3, alpha_f16); + } + + auto vec_out = reinterpret_cast<float16_t *>(out.ptr()) + x; + + vst1q_f16(vec_out + 0, acc0); + vst1q_f16(vec_out + 8, acc1); + vst1q_f16(vec_out + 16, acc2); + vst1q_f16(vec_out + 24, acc3); + } + + for (; x < window_end_x; ++x) + { + if (x > width_matrix_b) + { + return; + } + + 
auto matrix_b = reinterpret_cast<const float16_t *>(inb.ptr()) + x; + + float16x4_t vacc = vdup_n_f16(0.f); + + auto vec_a = reinterpret_cast<const float16_t *>(ina.ptr()); + const float16_t *vec_a_end_addr = vec_a + num_elems_vec_a; + for (; vec_a <= (vec_a_end_addr - 4); vec_a += 4) + { + const float16x4_t a0l = vld1_f16(vec_a); + + const float16x4_t b_col = { + *(matrix_b + 0 * in_b_stride), + *(matrix_b + 1 * in_b_stride), + *(matrix_b + 2 * in_b_stride), + *(matrix_b + 3 * in_b_stride), + }; + + vacc = vadd_f16(vacc, vmul_f16(a0l, b_col)); + + matrix_b += 4 * in_b_stride; + } + + float16_t acc = + vget_lane_f16(vacc, 0) + vget_lane_f16(vacc, 1) + vget_lane_f16(vacc, 2) + vget_lane_f16(vacc, 3); + + for (; vec_a < vec_a_end_addr; ++vec_a) + { + const float16_t a0 = *vec_a; + const float16_t b00 = *matrix_b; + + acc += b00 * a0; + + matrix_b += in_b_stride; + } + + // Multiply by the weight of matrix product (alpha) + if (multiply_alpha) + { + acc *= static_cast<float16_t>(alpha); + } + + auto vec_out = reinterpret_cast<float16_t *>(out.ptr()) + x; + + *(vec_out) = acc; + } + }, + ina, inb, out); +} + +void matrix_matrix_multiply_f16( + const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha) +{ + ARM_COMPUTE_UNUSED(info); + const int out_width = static_cast<int>(dst->info()->dimension(0)); + const int out_height = static_cast<int>(dst->info()->dimension(1)); + const size_t in_b_stride = rhs->info()->strides_in_bytes()[1] / data_size_from_type(rhs->info()->data_type()); + const size_t out_stride = dst->info()->strides_in_bytes()[1] / data_size_from_type(dst->info()->data_type()); + const int num_elems_matrix_b_x = rhs->info()->dimension(0); + + // Set step_x and step_y for matrix A. Scale by a factor of 4 the Y range as the input interleaved matrix A has 4 times less the rows of the dst matrix + Window win_a(window); + win_a.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_a.set(Window::DimY, Window::Dimension(window.y().start() / 4, std::max(window.y().end() / 4, 1), 1)); + + Window win_b; + // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 + // This scenario can happen when the the matrix multiplication is used to perform a convolution operation + if (rhs->info()->num_dimensions() >= 3) + { + win_b = window; + } + // Set step_x and step_y for matrix B. 
Scale by a factor of 8 the X range as the input transposed matrix A has 8 times less the cols of the dst matrix + win_b.set(Window::DimX, Window::Dimension(window.x().start() / 8, window.x().end() / 8, in_b_stride)); + win_b.set(Window::DimY, Window::Dimension(0, 0, 0)); + + Iterator ina(lhs, win_a); + Iterator inb(rhs, win_b); + Iterator out(dst, window); + + const bool multiply_alpha = !(helpers::float_ops::is_one(alpha)); + + const float16x8_t alpha_f16 = vdupq_n_f16(alpha); + + execute_window_loop( + window, + [&](const Coordinates &id) + { + const auto *mtx_a0 = reinterpret_cast<const float16_t *>(ina.ptr()); + const auto *mtx_b0 = reinterpret_cast<const float16_t *>(inb.ptr()); + auto *mtx_out = reinterpret_cast<float16_t *>(out.ptr()); + float16x8x4_t c = {{vdupq_n_f16(0.f), vdupq_n_f16(0.f), vdupq_n_f16(0.f), vdupq_n_f16(0.f)}}; + + /* + This kernel puts the values in a 4x4 block of Matrix A on the same row (Interleaved values) + |a00 a01 a02 a03 | a04 a05 a06 a07| + |a10 a11 a12 a13 | a14 a15 a16 a17| + |a20 a21 a22 a23 | a24 a25 a26 a27| = | a00 a10 a20 a30 || a01 a11 a21 a31 || a02 a12 a22 a32 || a03 a13 a23 a33 | a40 a50 a60 a70 | ... + |a30 a31 a32 a33 | a34 a35 a36 a37| | a04 a14 a24 a34 || a05 a15 a25 a35 || a06 a15 a26 a36 || a07 a17 a27 a37 | a44 a54 a64 a74 | ... + |a40 a41 a42 a43 | a44 a45 a46 a47| + |a50 a51 a52 a53 | a54 a55 a56 a57| + |a60 a61 a62 a63 | a64 a65 a66 a67| + |a70 a71 a72 a73 | a74 a75 a76 a77| + + After this operation, the dst matrix will have the following shape: [ height * 4, width / 4 ] + + B Matrix has been transposed as shown below + + |b00 b01 b02 b03 b04 b05 b06 b07| + |b10 b11 b12 b13 b14 b15 b16 b17| + |b20 b21 b22 b23 b24 b25 b26 b27| + |b30 b31 b32 b33 b34 b35 b36 b37| + -------------------> + + |b00 b01 b02 b03 b04 b05 b06 b07||b10 b11 b12 b13 b14 b15 b16 b17||b20 b21 b22 b23 b24 b25 b26 b27||b30 b31 b32 b33 b34 b35 b36 b37| + + c.val[0][0] = a00*b00 + a01*b10 + a02*b20 + a03*b30 + c.val[0][1] = a00*b01 + a01*b11 + a02*b21 + a03*b31 + + The size of the dst tensor's XY-plane must be the following shape [ width * 8, height / 8 ]. All other dimensions must have the same size. 
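+
+            The loop below therefore accumulates one 8-column strip of four consecutive dst rows
+            per window position: c.val[0..3] hold rows y, y+1, y+2 and y+3. The stores at the end
+            are guarded with id.y() + n < out_height, and the left-over column path copies the
+            accumulators lane by lane, so partial blocks at the right and bottom edges of dst are
+            never written out of bounds.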
+ */ + const float16_t *mtx_b0_end_addr = mtx_b0 + num_elems_matrix_b_x; + + for (; mtx_b0 <= (mtx_b0_end_addr - 32);) + + { + const float16x8_t p00 = vld1q_f16(mtx_a0); + const float16x8_t p02 = vld1q_f16(mtx_a0 + 8); + + const float16x8_t q00 = vld1q_f16(mtx_b0); + const float16x8_t q02 = vld1q_f16(mtx_b0 + 8); + const float16x8_t q04 = vld1q_f16(mtx_b0 + 16); + const float16x8_t q06 = vld1q_f16(mtx_b0 + 24); + + c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q00, vgetq_lane_f16(p00, 0))); + c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q00, vgetq_lane_f16(p00, 1))); + c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q00, vgetq_lane_f16(p00, 2))); + c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q00, vgetq_lane_f16(p00, 3))); + + c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q02, vgetq_lane_f16(p00, 4))); + c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q02, vgetq_lane_f16(p00, 5))); + c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q02, vgetq_lane_f16(p00, 6))); + c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q02, vgetq_lane_f16(p00, 7))); + + c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q04, vgetq_lane_f16(p02, 0))); + c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q04, vgetq_lane_f16(p02, 1))); + c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q04, vgetq_lane_f16(p02, 2))); + c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q04, vgetq_lane_f16(p02, 3))); + + c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q06, vgetq_lane_f16(p02, 4))); + c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q06, vgetq_lane_f16(p02, 5))); + c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q06, vgetq_lane_f16(p02, 6))); + c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q06, vgetq_lane_f16(p02, 7))); + + mtx_a0 += 16; + mtx_b0 += 32; + } + + for (; mtx_b0 < mtx_b0_end_addr;) + + { + const float16x4_t p00 = vld1_f16(mtx_a0); + const float16x8_t q00 = vld1q_f16(mtx_b0); + + c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q00, vget_lane_f16(p00, 0))); + c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q00, vget_lane_f16(p00, 1))); + c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q00, vget_lane_f16(p00, 2))); + c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q00, vget_lane_f16(p00, 3))); + + mtx_a0 += 4; + mtx_b0 += 8; + } + + if (multiply_alpha) + { + c.val[0] = vmulq_f16(c.val[0], alpha_f16); + c.val[1] = vmulq_f16(c.val[1], alpha_f16); + c.val[2] = vmulq_f16(c.val[2], alpha_f16); + c.val[3] = vmulq_f16(c.val[3], alpha_f16); + } + + if (id.x() < (out_width - 8)) + { + vst1q_f16(mtx_out, c.val[0]); + if (id.y() + 1 < out_height) + { + vst1q_f16(mtx_out + 1 * out_stride, c.val[1]); + if (id.y() + 2 < out_height) + { + vst1q_f16(mtx_out + 2 * out_stride, c.val[2]); + if (id.y() + 3 < out_height) + { + vst1q_f16(mtx_out + 3 * out_stride, c.val[3]); + } + } + } + } + else + { + // Left-over columns + const int columns_left = out_width - id.x(); + for (int x = 0; x < columns_left; ++x) + { + *(mtx_out + x) = c.val[0][x]; + if (id.y() + 1 < out_height) + { + *(mtx_out + x + 1 * out_stride) = c.val[1][x]; + if (id.y() + 2 < out_height) + { + *(mtx_out + x + 2 * out_stride) = c.val[2][x]; + if (id.y() + 3 < out_height) + { + *(mtx_out + x + 3 * out_stride) = c.val[3][x]; + } + } + } + } + } + }, + ina, inb, out); +} + +void neon_fp16_gemm_matrix_mul(const ITensor *lhs, + const ITensor *rhs, + ITensor *dst, + const Window &window, + const ThreadInfo &info, + float alpha, + const bool is_dst_vector) +{ + return (is_dst_vector) ? 
vector_matrix_multiply_f16(lhs, rhs, dst, window, info, alpha) + : matrix_matrix_multiply_f16(lhs, rhs, dst, window, info, alpha); +} +} // namespace cpu +} // namespace arm_compute +#endif //__ARM_FEATURE_FP16_VECTOR_ARITHMETIC diff --git a/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp32.cpp b/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp32.cpp new file mode 100644 index 0000000000..e12a312280 --- /dev/null +++ b/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp32.cpp @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void neon_fp32_gemm_matrix_mul(const ITensor *lhs, + const ITensor *rhs, + ITensor *dst, + const Window &window, + const ThreadInfo &info, + float alpha, + const bool is_dst_vector) +{ + return (is_dst_vector) ? vector_matrix_multiply_f32(lhs, rhs, dst, window, info, alpha) + : matrix_matrix_multiply_f32(lhs, rhs, dst, window, info, alpha); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.cpp b/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.cpp new file mode 100644 index 0000000000..404d070a37 --- /dev/null +++ b/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.cpp @@ -0,0 +1,656 @@ +/* + * Copyright (c) 2017-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.h" + +#include "src/core/utils/helpers/float_ops.h" + +#include <arm_neon.h> + +namespace arm_compute +{ +namespace cpu +{ +void vector_matrix_multiply_f32( + const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha) +{ + const auto width_matrix_b = static_cast<int>(dst->info()->dimension(0)); + const auto in_b_stride = + static_cast<int>(rhs->info()->strides_in_bytes()[1] / data_size_from_type(rhs->info()->data_type())); + const auto num_elems_vec_a = static_cast<int>(lhs->info()->dimension(0)); + + // The implementation computes 16 elements per iteration + const int window_start_x = 16 * info.thread_id; + const int window_step_x = 16 * info.num_threads; + // Make sure (window_end_x - window_start_x) is a multiple of window_step_x + const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x; + + Window win_out(window); + win_out.set(Window::DimX, Window::Dimension(0, 1, 1)); + win_out.set(Window::DimY, Window::Dimension(0, 1, 1)); + + Window win_a(window); + win_a.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_a.set(Window::DimY, Window::Dimension(0, 0, 0)); + + Window win_b; + // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 + // This scenario can happen when the the matrix multiplication is used to perform a convolution operation + if (rhs->info()->num_dimensions() >= 3) + { + win_b = window; + } + win_b.set(Window::DimX, Window::Dimension(0, 1, 1)); + win_b.set(Window::DimY, Window::Dimension(0, 1, 1)); + + Iterator ina(lhs, win_a); + Iterator inb(rhs, win_b); + Iterator out(dst, win_out); + + const bool multiply_alpha = !(helpers::float_ops::is_one(alpha)); + + const float32x4_t alpha_f32 = vdupq_n_f32(alpha); + + execute_window_loop( + win_out, + [&](const Coordinates &) + { + int x = window_start_x; + // Here we don't check for x lower equal than (window_end_x - window_step_x) because of + // window_end_x is computed above which may cause out-of-bound writes to the dst. 
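+            // The __arm__-only PLD inline assembly in the loops below issues cache prefetch
+            // hints for the upcoming elements of vector A and the next rows of matrix B; on
+            // AArch64 builds (__arm__ is not defined) these blocks compile out entirely.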
+ for (; x < (window_end_x - window_step_x); x += window_step_x) + { + if (x > width_matrix_b) + { + return; + } + + float32x4_t acc0 = vdupq_n_f32(0.f); + float32x4_t acc1 = vdupq_n_f32(0.f); + float32x4_t acc2 = vdupq_n_f32(0.f); + float32x4_t acc3 = vdupq_n_f32(0.f); + + auto vec_a = reinterpret_cast<const float *>(ina.ptr()); + auto matrix_b = reinterpret_cast<const float *>(inb.ptr()) + x; + +#if __arm__ + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + in_b_stride))); +#endif /* __arm__ */ + + auto vec_a_end_addr = vec_a + num_elems_vec_a; + for (; vec_a <= (vec_a_end_addr - 4);) + { + float32x2_t a0l = vld1_f32(vec_a); + + float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride); + float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride); + float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride); + float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride); + + float32x4_t b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride); + float32x4_t b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride); + float32x4_t b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride); + float32x4_t b13 = vld1q_f32(matrix_b + 12 + 1 * in_b_stride); + +#if __arm__ + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a))); + asm volatile( + "PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 1 * in_b_stride))); + asm volatile( + "PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 2 * in_b_stride))); + asm volatile( + "PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 3 * in_b_stride))); + asm volatile( + "PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 4 * in_b_stride))); +#endif /* __arm__ */ + + acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0); + acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0); + acc2 = vmlaq_lane_f32(acc2, b02, a0l, 0); + acc3 = vmlaq_lane_f32(acc3, b03, a0l, 0); + + acc0 = vmlaq_lane_f32(acc0, b10, a0l, 1); + acc1 = vmlaq_lane_f32(acc1, b11, a0l, 1); + acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1); + acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1); + + vec_a += 2; + matrix_b += 2 * in_b_stride; + + a0l = vld1_f32(vec_a); + + b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride); + b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride); + b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride); + b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride); + + b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride); + b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride); + b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride); + b13 = vld1q_f32(matrix_b + 12 + 1 * in_b_stride); + + acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0); + acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0); + acc2 = vmlaq_lane_f32(acc2, b02, a0l, 0); + acc3 = vmlaq_lane_f32(acc3, b03, a0l, 0); + + acc0 = vmlaq_lane_f32(acc0, b10, a0l, 1); + acc1 = vmlaq_lane_f32(acc1, b11, a0l, 1); + acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1); + acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1); + + vec_a += 2; + matrix_b += 2 * in_b_stride; + } + + for (; vec_a < vec_a_end_addr; ++vec_a) + { + const float a0 = *vec_a; + + const float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride); + const float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride); + const float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride); + const float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride); + + acc0 = vmlaq_n_f32(acc0, b00, a0); + acc1 
= vmlaq_n_f32(acc1, b01, a0); + acc2 = vmlaq_n_f32(acc2, b02, a0); + acc3 = vmlaq_n_f32(acc3, b03, a0); + + matrix_b += in_b_stride; + } + + // Multiply by the weight of matrix product (alpha) + if (multiply_alpha) + { + acc0 = vmulq_f32(acc0, alpha_f32); + acc1 = vmulq_f32(acc1, alpha_f32); + acc2 = vmulq_f32(acc2, alpha_f32); + acc3 = vmulq_f32(acc3, alpha_f32); + } + + const auto vec_out = reinterpret_cast<float *>(out.ptr()) + x; + + vst1q_f32(vec_out + 0, acc0); + vst1q_f32(vec_out + 4, acc1); + vst1q_f32(vec_out + 8, acc2); + vst1q_f32(vec_out + 12, acc3); + } + + // Left-over loop + for (; x < window_end_x; ++x) + { + if (x > width_matrix_b) + { + return; + } + + float32x4_t vacc = vdupq_n_f32(0.f); + + auto vec_a = reinterpret_cast<const float *>(ina.ptr()); + auto matrix_b = reinterpret_cast<const float *>(inb.ptr()) + x; + +#if __arm__ + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + in_b_stride))); +#endif /* __arm__ */ + + auto vec_a_end_addr = vec_a + num_elems_vec_a; + for (; vec_a <= (vec_a_end_addr - 4); vec_a += 4) + { + const float32x4_t a0l = vld1q_f32(vec_a); + + const float32x4_t b_col = { + *(matrix_b + 0 * in_b_stride), + *(matrix_b + 1 * in_b_stride), + *(matrix_b + 2 * in_b_stride), + *(matrix_b + 3 * in_b_stride), + }; + +#if __arm__ + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a))); + asm volatile( + "PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 1 * in_b_stride))); + asm volatile( + "PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 2 * in_b_stride))); + asm volatile( + "PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 3 * in_b_stride))); + asm volatile( + "PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 4 * in_b_stride))); +#endif /* __arm__ */ + + vacc = vmlaq_f32(vacc, b_col, a0l); + + matrix_b += 4 * in_b_stride; + } + + float acc = vgetq_lane_f32(vacc, 0) + vgetq_lane_f32(vacc, 1) + vgetq_lane_f32(vacc, 2) + + vgetq_lane_f32(vacc, 3); + + for (; vec_a < vec_a_end_addr; ++vec_a) + { + const float a0 = *vec_a; + + const float b00 = *matrix_b; + + acc += b00 * a0; + + matrix_b += in_b_stride; + } + + // Multiply by the weight of matrix product (alpha) + if (multiply_alpha) + { + acc *= alpha; + } + + const auto vec_out = reinterpret_cast<float *>(out.ptr()) + x; + + *vec_out = acc; + } + }, + ina, inb, out); +} + +void matrix_matrix_multiply_f32( + const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha) +{ + ARM_COMPUTE_UNUSED(info); + const int out_width = static_cast<int>(dst->info()->dimension(0)); + const int out_height = static_cast<int>(dst->info()->dimension(1)); + const size_t in_b_stride = rhs->info()->strides_in_bytes()[1] / data_size_from_type(rhs->info()->data_type()); + const size_t out_stride1 = dst->info()->strides_in_bytes()[1] / data_size_from_type(dst->info()->data_type()); + const size_t out_stride2 = out_stride1 * 2; + const size_t out_stride3 = out_stride1 * 3; + const int num_elems_matrix_b_x = rhs->info()->dimension(0); + + // Set step_x and step_y for matrix A. 
Scale the Y range by a factor of 4, as the interleaved input matrix A has 4 times fewer rows than the dst matrix + Window win_a(window); + win_a.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_a.set(Window::DimY, Window::Dimension(window.y().start() / 4, std::max(window.y().end() / 4, 1), 1)); + + Window win_b; + // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 + // This scenario can happen when the matrix multiplication is used to perform a convolution operation + if (rhs->info()->num_dimensions() >= 3) + { + win_b = window; + } + // Set step_x and step_y for matrix B. Scale the X range by a factor of 4, as the transposed input matrix has 4 times fewer columns than the dst matrix + // The step along the x direction is 2 times the in_b_stride because each iteration computes 2 blocks of size 4x4 + win_b.set(Window::DimX, Window::Dimension(window.x().start() / 4, window.x().end() / 4, 2 * in_b_stride)); + win_b.set(Window::DimY, Window::Dimension(0, 0, 0)); + + Iterator ina(lhs, win_a); + Iterator inb(rhs, win_b); + Iterator out(dst, window); + + const bool multiply_alpha = !(helpers::float_ops::is_one(alpha)); + + const float32x4_t alpha_f32 = vdupq_n_f32(alpha); + + // The implementation assumes that matrix A and matrix B have been reshaped with CpuGemmInterleave4x4 and CpuGemmTranspose1xW respectively + // The reshaping of the matrices helps to have a cache-friendly implementation and avoids the data re-arrangements needed for computing 16x4 elements per iteration + // All the values needed for computing a single 4x4 block will be read from consecutive memory positions + execute_window_loop( + window, + [&](const Coordinates &id) + { + auto mtx_a0 = reinterpret_cast<const float *>(ina.ptr()); + auto mtx_b0 = reinterpret_cast<const float *>(inb.ptr()); + auto mtx_b1 = mtx_b0 + in_b_stride; + + float32x4_t acc00 = vdupq_n_f32(0.f); + float32x4_t acc10 = vdupq_n_f32(0.f); + float32x4_t acc20 = vdupq_n_f32(0.f); + float32x4_t acc30 = vdupq_n_f32(0.f); + + float32x4_t acc01 = vdupq_n_f32(0.f); + float32x4_t acc11 = vdupq_n_f32(0.f); + float32x4_t acc21 = vdupq_n_f32(0.f); + float32x4_t acc31 = vdupq_n_f32(0.f); + +#if __arm__ + asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0))); + asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0))); + asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1))); +#endif /* __arm__ */ + + auto mtx_b0_end_addr = mtx_b0 + num_elems_matrix_b_x; + for (; mtx_b0 <= (mtx_b0_end_addr - 32);) + { + float32x4_t a0 = vld1q_dup_f32(mtx_a0 + 0); + float32x4_t a1 = vld1q_dup_f32(mtx_a0 + 1); + float32x4_t a2 = vld1q_dup_f32(mtx_a0 + 2); + float32x4_t a3 = vld1q_dup_f32(mtx_a0 + 3); + + float32x4_t b00 = vld1q_f32(mtx_b0); + float32x4_t b10 = vld1q_f32(mtx_b1); + float32x4_t b01 = vld1q_f32(mtx_b0 + 4); + float32x4_t b11 = vld1q_f32(mtx_b1 + 4); + +#if __arm__ + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1))); +#endif /* __arm__ */ + + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b00, a0); + acc10 = vmlaq_f32(acc10, b00, a1); + acc20 = vmlaq_f32(acc20, b00, a2); + acc30 = vmlaq_f32(acc30, b00, a3); + + float32x4_t a4 = vld1q_dup_f32(mtx_a0 + 4); + float32x4_t a5 = vld1q_dup_f32(mtx_a0 + 5); + float32x4_t a6 = 
vld1q_dup_f32(mtx_a0 + 6); + float32x4_t a7 = vld1q_dup_f32(mtx_a0 + 7); + + // 4x4 block 1 + acc01 = vmlaq_f32(acc01, b10, a0); + acc11 = vmlaq_f32(acc11, b10, a1); + acc21 = vmlaq_f32(acc21, b10, a2); + acc31 = vmlaq_f32(acc31, b10, a3); + + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b01, a4); + acc10 = vmlaq_f32(acc10, b01, a5); + acc20 = vmlaq_f32(acc20, b01, a6); + acc30 = vmlaq_f32(acc30, b01, a7); + + // 4x4 block 1 + acc01 = vmlaq_f32(acc01, b11, a4); + acc11 = vmlaq_f32(acc11, b11, a5); + acc21 = vmlaq_f32(acc21, b11, a6); + acc31 = vmlaq_f32(acc31, b11, a7); + + mtx_a0 += 8; + mtx_b0 += 8; + mtx_b1 += 8; + + a0 = vld1q_dup_f32(mtx_a0 + 0); + a1 = vld1q_dup_f32(mtx_a0 + 1); + a2 = vld1q_dup_f32(mtx_a0 + 2); + a3 = vld1q_dup_f32(mtx_a0 + 3); + + b00 = vld1q_f32(mtx_b0); + b10 = vld1q_f32(mtx_b1); + b01 = vld1q_f32(mtx_b0 + 4); + b11 = vld1q_f32(mtx_b1 + 4); + + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b00, a0); + acc10 = vmlaq_f32(acc10, b00, a1); + acc20 = vmlaq_f32(acc20, b00, a2); + acc30 = vmlaq_f32(acc30, b00, a3); + + a4 = vld1q_dup_f32(mtx_a0 + 4); + a5 = vld1q_dup_f32(mtx_a0 + 5); + a6 = vld1q_dup_f32(mtx_a0 + 6); + a7 = vld1q_dup_f32(mtx_a0 + 7); + + // 4x4 block 1 + acc01 = vmlaq_f32(acc01, b10, a0); + acc11 = vmlaq_f32(acc11, b10, a1); + acc21 = vmlaq_f32(acc21, b10, a2); + acc31 = vmlaq_f32(acc31, b10, a3); + + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b01, a4); + acc10 = vmlaq_f32(acc10, b01, a5); + acc20 = vmlaq_f32(acc20, b01, a6); + acc30 = vmlaq_f32(acc30, b01, a7); + + // 4x4 block 1 + acc01 = vmlaq_f32(acc01, b11, a4); + acc11 = vmlaq_f32(acc11, b11, a5); + acc21 = vmlaq_f32(acc21, b11, a6); + acc31 = vmlaq_f32(acc31, b11, a7); + + mtx_a0 += 8; + mtx_b0 += 8; + mtx_b1 += 8; + + a0 = vld1q_dup_f32(mtx_a0 + 0); + a1 = vld1q_dup_f32(mtx_a0 + 1); + a2 = vld1q_dup_f32(mtx_a0 + 2); + a3 = vld1q_dup_f32(mtx_a0 + 3); + b00 = vld1q_f32(mtx_b0); + b10 = vld1q_f32(mtx_b1); + b01 = vld1q_f32(mtx_b0 + 4); + b11 = vld1q_f32(mtx_b1 + 4); + +#if __arm__ + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1))); +#endif /* __arm__ */ + + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b00, a0); + acc10 = vmlaq_f32(acc10, b00, a1); + acc20 = vmlaq_f32(acc20, b00, a2); + acc30 = vmlaq_f32(acc30, b00, a3); + + a4 = vld1q_dup_f32(mtx_a0 + 4); + a5 = vld1q_dup_f32(mtx_a0 + 5); + a6 = vld1q_dup_f32(mtx_a0 + 6); + a7 = vld1q_dup_f32(mtx_a0 + 7); + + // 4x4 block 1 + acc01 = vmlaq_f32(acc01, b10, a0); + acc11 = vmlaq_f32(acc11, b10, a1); + acc21 = vmlaq_f32(acc21, b10, a2); + acc31 = vmlaq_f32(acc31, b10, a3); + + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b01, a4); + acc10 = vmlaq_f32(acc10, b01, a5); + acc20 = vmlaq_f32(acc20, b01, a6); + acc30 = vmlaq_f32(acc30, b01, a7); + + // 4x4 block 1 + acc01 = vmlaq_f32(acc01, b11, a4); + acc11 = vmlaq_f32(acc11, b11, a5); + acc21 = vmlaq_f32(acc21, b11, a6); + acc31 = vmlaq_f32(acc31, b11, a7); + + mtx_a0 += 8; + mtx_b0 += 8; + mtx_b1 += 8; + + a0 = vld1q_dup_f32(mtx_a0 + 0); + a1 = vld1q_dup_f32(mtx_a0 + 1); + a2 = vld1q_dup_f32(mtx_a0 + 2); + a3 = vld1q_dup_f32(mtx_a0 + 3); + b00 = vld1q_f32(mtx_b0); + b10 = vld1q_f32(mtx_b1); + b01 = vld1q_f32(mtx_b0 + 4); + b11 = vld1q_f32(mtx_b1 + 4); + + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b00, a0); + acc10 = vmlaq_f32(acc10, b00, a1); + acc20 = vmlaq_f32(acc20, b00, a2); + acc30 = vmlaq_f32(acc30, b00, a3); + + 
a4 = vld1q_dup_f32(mtx_a0 + 4); + a5 = vld1q_dup_f32(mtx_a0 + 5); + a6 = vld1q_dup_f32(mtx_a0 + 6); + a7 = vld1q_dup_f32(mtx_a0 + 7); + + // 4x4 block 1 + acc01 = vmlaq_f32(acc01, b10, a0); + acc11 = vmlaq_f32(acc11, b10, a1); + acc21 = vmlaq_f32(acc21, b10, a2); + acc31 = vmlaq_f32(acc31, b10, a3); + + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b01, a4); + acc10 = vmlaq_f32(acc10, b01, a5); + acc20 = vmlaq_f32(acc20, b01, a6); + acc30 = vmlaq_f32(acc30, b01, a7); + + // 4x4 block 1 + acc01 = vmlaq_f32(acc01, b11, a4); + acc11 = vmlaq_f32(acc11, b11, a5); + acc21 = vmlaq_f32(acc21, b11, a6); + acc31 = vmlaq_f32(acc31, b11, a7); + + mtx_a0 += 8; + mtx_b0 += 8; + mtx_b1 += 8; + } + + for (; mtx_b0 < mtx_b0_end_addr;) + { + float32x4_t a0 = vld1q_dup_f32(mtx_a0 + 0); + float32x4_t a1 = vld1q_dup_f32(mtx_a0 + 1); + float32x4_t a2 = vld1q_dup_f32(mtx_a0 + 2); + float32x4_t a3 = vld1q_dup_f32(mtx_a0 + 3); + float32x4_t b00 = vld1q_f32(mtx_b0); + float32x4_t b10 = vld1q_f32(mtx_b1); + +#if __arm__ + asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0))); + asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0))); + asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1))); +#endif /* __arm__ */ + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b00, a0); + acc10 = vmlaq_f32(acc10, b00, a1); + acc20 = vmlaq_f32(acc20, b00, a2); + acc30 = vmlaq_f32(acc30, b00, a3); + + // 4x4 block 1 + acc01 = vmlaq_f32(acc01, b10, a0); + acc11 = vmlaq_f32(acc11, b10, a1); + acc21 = vmlaq_f32(acc21, b10, a2); + acc31 = vmlaq_f32(acc31, b10, a3); + + mtx_a0 += 4; + mtx_b0 += 4; + mtx_b1 += 4; + } + + // Multiply by the weight of matrix product (alpha) + if (multiply_alpha) + { + acc00 = vmulq_f32(acc00, alpha_f32); + acc10 = vmulq_f32(acc10, alpha_f32); + acc20 = vmulq_f32(acc20, alpha_f32); + acc30 = vmulq_f32(acc30, alpha_f32); + acc01 = vmulq_f32(acc01, alpha_f32); + acc11 = vmulq_f32(acc11, alpha_f32); + acc21 = vmulq_f32(acc21, alpha_f32); + acc31 = vmulq_f32(acc31, alpha_f32); + } + + const auto mtx_out0 = reinterpret_cast<float *>(out.ptr()); + const auto mtx_out1 = mtx_out0 + 4; + + if (id.x() < (out_width - 8)) + { + vst1q_f32(mtx_out0, acc00); + vst1q_f32(mtx_out1, acc01); + if (id.y() + 1 < out_height) + { + vst1q_f32(mtx_out0 + out_stride1, acc10); + vst1q_f32(mtx_out1 + out_stride1, acc11); + if (id.y() + 2 < out_height) + { + vst1q_f32(mtx_out0 + out_stride2, acc20); + vst1q_f32(mtx_out1 + out_stride2, acc21); + if (id.y() + 3 < out_height) + { + vst1q_f32(mtx_out0 + out_stride3, acc30); + vst1q_f32(mtx_out1 + out_stride3, acc31); + } + } + } + } + else if (id.x() < (out_width - 4)) + { + vst1q_f32(mtx_out0, acc00); + if (id.y() + 1 < out_height) + { + vst1q_f32(mtx_out0 + out_stride1, acc10); + if (id.y() + 2 < out_height) + { + vst1q_f32(mtx_out0 + out_stride2, acc20); + if (id.y() + 3 < out_height) + { + vst1q_f32(mtx_out0 + out_stride3, acc30); + } + } + } + // Left-over columns + const int columns_left = out_width - id.x() - 4; + for (auto x = 0; x < columns_left; ++x) + { + *(mtx_out1 + x) = acc01[x]; + if (id.y() + 1 < out_height) + { + *(mtx_out1 + x + out_stride1) = acc11[x]; + if (id.y() + 2 < out_height) + { + *(mtx_out1 + x + out_stride2) = acc21[x]; + if (id.y() + 3 < out_height) + { + *(mtx_out1 + x + out_stride3) = acc31[x]; + } + } + } + } + } + else + { + // Left-over columns + const int columns_left = out_width - id.x(); + for (int x = 0; x < columns_left; ++x) + { + *(mtx_out0 + x) = acc00[x]; + if (id.y() + 1 < 
out_height) + { + *(mtx_out0 + x + out_stride1) = acc10[x]; + if (id.y() + 2 < out_height) + { + *(mtx_out0 + x + out_stride2) = acc20[x]; + if (id.y() + 3 < out_height) + { + *(mtx_out0 + x + out_stride3) = acc30[x]; + } + } + } + } + } + }, + ina, inb, out); +} +} // namespace cpu + +} // namespace arm_compute diff --git a/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.h b/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.h new file mode 100644 index 0000000000..74ea4c2b17 --- /dev/null +++ b/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CORE_KERNELS_GEMMMATRIXMUL_IMPL_H +#define SRC_CORE_KERNELS_GEMMMATRIXMUL_IMPL_H +#include "arm_compute/core/Helpers.h" + +#include "src/core/CPP/Validate.h" + +namespace arm_compute +{ +namespace cpu +{ +void vector_matrix_multiply_f32( + const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha); + +void matrix_matrix_multiply_f32( + const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha); + +} // namespace cpu +} // namespace arm_compute +#endif //define SRC_CORE_KERNELS_GEMMMATRIXMUL_IMPL_H diff --git a/src/cpu/kernels/gemm_matrix_mul/list.h b/src/cpu/kernels/gemm_matrix_mul/list.h new file mode 100644 index 0000000000..15b23b1d81 --- /dev/null +++ b/src/cpu/kernels/gemm_matrix_mul/list.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CORE_NEON_KERNELS_GEMMMATRIXMUL_LIST_H +#define SRC_CORE_NEON_KERNELS_GEMMMATRIXMUL_LIST_H +namespace arm_compute +{ +namespace cpu +{ +#define DECLARE_GEMMMATRIXMUL_KERNEL(func_name) \ + void func_name(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, \ + float alpha, const bool is_dst_vector) +DECLARE_GEMMMATRIXMUL_KERNEL(neon_fp32_gemm_matrix_mul); +DECLARE_GEMMMATRIXMUL_KERNEL(neon_fp16_gemm_matrix_mul); +#undef DECLARE_GEMMMATRIXMUL_KERNEL +} // namespace cpu +} // namespace arm_compute +#endif //SRC_CORE_NEON_KERNELS_GEMMMATRIXMUL_LIST_H diff --git a/src/cpu/kernels/genproposals/generic/neon/fp16.cpp b/src/cpu/kernels/genproposals/generic/neon/fp16.cpp new file mode 100644 index 0000000000..4ed7e54f1c --- /dev/null +++ b/src/cpu/kernels/genproposals/generic/neon/fp16.cpp @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) +#include "src/cpu/kernels/genproposals/generic/neon/impl.h" +namespace arm_compute +{ +namespace cpu +{ +void neon_fp16_computeallanchors(const ITensor *anchors, + ITensor *all_anchors, + ComputeAnchorsInfo anchors_info, + const Window &window) +{ + return compute_all_anchors<float16_t>(anchors, all_anchors, anchors_info, window); +} +} // namespace cpu +} // namespace arm_compute +#endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) diff --git a/src/cpu/kernels/genproposals/generic/neon/fp32.cpp b/src/cpu/kernels/genproposals/generic/neon/fp32.cpp new file mode 100644 index 0000000000..f15cd63bb2 --- /dev/null +++ b/src/cpu/kernels/genproposals/generic/neon/fp32.cpp @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2022 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/genproposals/generic/neon/impl.h" +namespace arm_compute +{ +namespace cpu +{ +void neon_fp32_computeallanchors(const ITensor *anchors, + ITensor *all_anchors, + ComputeAnchorsInfo anchors_info, + const Window &window) +{ + return compute_all_anchors<float>(anchors, all_anchors, anchors_info, window); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/genproposals/generic/neon/impl.cpp b/src/cpu/kernels/genproposals/generic/neon/impl.cpp new file mode 100644 index 0000000000..8cb76f3afb --- /dev/null +++ b/src/cpu/kernels/genproposals/generic/neon/impl.cpp @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2019-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/cpu/kernels/genproposals/generic/neon/impl.h" +namespace arm_compute +{ +class ITensor; +class Window; +namespace cpu +{ +void compute_all_anchors_qasymm16(const ITensor *anchors, + ITensor *all_anchors, + ComputeAnchorsInfo anchors_info, + const Window &window) +{ + Iterator all_anchors_it(all_anchors, window); + Iterator anchors_it(all_anchors, window); + + const size_t num_anchors = anchors->info()->dimension(1); + const float stride = 1.f / anchors_info.spatial_scale(); + const size_t feat_width = anchors_info.feat_width(); + + const UniformQuantizationInfo qinfo = anchors->info()->quantization_info().uniform(); + + execute_window_loop( + window, + [&](const Coordinates &id) + { + const size_t anchor_offset = id.y() % num_anchors; + + const auto out_anchor_ptr = reinterpret_cast<int16_t *>(all_anchors_it.ptr()); + const auto anchor_ptr = reinterpret_cast<int16_t *>(anchors->ptr_to_element(Coordinates(0, anchor_offset))); + + const size_t shift_idy = id.y() / num_anchors; + const float shiftx = (shift_idy % feat_width) * stride; + const float shifty = (shift_idy / feat_width) * stride; + + const float new_anchor_x1 = dequantize_qsymm16(*anchor_ptr, qinfo.scale) + shiftx; + const float new_anchor_y1 = dequantize_qsymm16(*(1 + anchor_ptr), qinfo.scale) + shifty; + const float new_anchor_x2 = dequantize_qsymm16(*(2 + anchor_ptr), qinfo.scale) + shiftx; + const float new_anchor_y2 = dequantize_qsymm16(*(3 + anchor_ptr), qinfo.scale) + shifty; + + *out_anchor_ptr = quantize_qsymm16(new_anchor_x1, qinfo.scale); + *(out_anchor_ptr + 1) = quantize_qsymm16(new_anchor_y1, qinfo.scale); + *(out_anchor_ptr + 2) = quantize_qsymm16(new_anchor_x2, qinfo.scale); + *(out_anchor_ptr + 3) = quantize_qsymm16(new_anchor_y2, qinfo.scale); + }, + all_anchors_it); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/genproposals/generic/neon/impl.h b/src/cpu/kernels/genproposals/generic/neon/impl.h new file mode 100644 index 0000000000..3317bcfbe6 --- /dev/null +++ b/src/cpu/kernels/genproposals/generic/neon/impl.h @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef SRC_CORE_SVE_KERNELS_NEGENERATEPROPOSALSLAYERKERNEL_IMPL_H +#define SRC_CORE_SVE_KERNELS_NEGENERATEPROPOSALSLAYERKERNEL_IMPL_H +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Window.h" + +#include "src/core/NEON/wrapper/wrapper.h" +namespace arm_compute +{ +namespace cpu +{ +template <typename T> +void compute_all_anchors(const ITensor *anchors, + ITensor *all_anchors, + ComputeAnchorsInfo anchors_info, + const Window &window) +{ + Iterator all_anchors_it(all_anchors, window); + Iterator anchors_it(all_anchors, window); + + const size_t num_anchors = anchors->info()->dimension(1); + const T stride = 1.f / anchors_info.spatial_scale(); + const size_t feat_width = anchors_info.feat_width(); + + execute_window_loop( + window, + [&](const Coordinates &id) + { + const size_t anchor_offset = id.y() % num_anchors; + + const auto out_anchor_ptr = reinterpret_cast<T *>(all_anchors_it.ptr()); + const auto anchor_ptr = reinterpret_cast<T *>(anchors->ptr_to_element(Coordinates(0, anchor_offset))); + + const size_t shift_idy = id.y() / num_anchors; + const T shiftx = (shift_idy % feat_width) * stride; + const T shifty = (shift_idy / feat_width) * stride; + + *out_anchor_ptr = *anchor_ptr + shiftx; + *(out_anchor_ptr + 1) = *(1 + anchor_ptr) + shifty; + *(out_anchor_ptr + 2) = *(2 + anchor_ptr) + shiftx; + *(out_anchor_ptr + 3) = *(3 + anchor_ptr) + shifty; + }, + all_anchors_it); +} + +void compute_all_anchors_qasymm16(const ITensor *anchors, + ITensor *all_anchors, + ComputeAnchorsInfo anchors_info, + const Window &window); +} // namespace cpu +} // namespace arm_compute +#endif //define SRC_CORE_SVE_KERNELS_NEGENERATEPROPOSALSLAYERKERNEL_IMPL_H diff --git a/src/cpu/kernels/genproposals/generic/neon/qsymm16.cpp b/src/cpu/kernels/genproposals/generic/neon/qsymm16.cpp new file mode 100644 index 0000000000..7182d0b27d --- /dev/null +++ b/src/cpu/kernels/genproposals/generic/neon/qsymm16.cpp @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/cpu/kernels/genproposals/generic/neon/impl.h" +namespace arm_compute +{ +namespace cpu +{ +void neon_qu16_computeallanchors(const ITensor *anchors, + ITensor *all_anchors, + ComputeAnchorsInfo anchors_info, + const Window &window) +{ + return compute_all_anchors_qasymm16(anchors, all_anchors, anchors_info, window); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/genproposals/list.h b/src/cpu/kernels/genproposals/list.h new file mode 100644 index 0000000000..570c686e89 --- /dev/null +++ b/src/cpu/kernels/genproposals/list.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CORE_NEON_KERNELS_NEGENERATEPROPOSALSLAYERKERNEL_LIST_H +#define SRC_CORE_NEON_KERNELS_NEGENERATEPROPOSALSLAYERKERNEL_LIST_H +namespace arm_compute +{ +namespace cpu +{ +#define DECLARE_NEGENERATEPROPOSALSLAYERKERNEL_KERNEL(func_name) \ + void func_name(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window) + +DECLARE_NEGENERATEPROPOSALSLAYERKERNEL_KERNEL(neon_qu16_computeallanchors); +DECLARE_NEGENERATEPROPOSALSLAYERKERNEL_KERNEL(neon_fp16_computeallanchors); +DECLARE_NEGENERATEPROPOSALSLAYERKERNEL_KERNEL(neon_fp32_computeallanchors); + +#undef DECLARE_NEGENERATEPROPOSALSLAYERKERNEL_KERNEL +} // namespace cpu +} // namespace arm_compute +#endif /* SRC_CORE_NEON_KERNELS_NEGENERATEPROPOSALSLAYERKERNEL_LIST_H */ diff --git a/src/cpu/kernels/instancenorm/generic/neon/fp16.cpp b/src/cpu/kernels/instancenorm/generic/neon/fp16.cpp new file mode 100644 index 0000000000..44418c0bb9 --- /dev/null +++ b/src/cpu/kernels/instancenorm/generic/neon/fp16.cpp @@ -0,0 +1,199 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) +#include "arm_compute/core/Helpers.h" + +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/kernels/instancenorm/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace +{ +template <typename InputType, typename AccType> +void vector_float_sum_fp16(AccType &result, AccType &result_square, const InputType &inputs) +{ + result = wrapper::vadd(result, inputs); + result_square = wrapper::vadd(result_square, wrapper::vmul(inputs, inputs)); +} + +template <typename InputType, typename AccType> +InputType vector_float_norm_fp16(const InputType &inputs, + const AccType &vec_mean, + const AccType &vec_multip, + const AccType &vec_beta) +{ + return wrapper::vadd(wrapper::vmul(wrapper::vsub(inputs, vec_mean), vec_multip), vec_beta); +} + +template <> +inline void vector_float_sum_fp16(float32x4_t &result, float32x4_t &result_square, const float16x8_t &inputs) +{ + vector_float_sum_fp16(result, result_square, wrapper::vcvt<float>(wrapper::vgetlow(inputs))); + vector_float_sum_fp16(result, result_square, wrapper::vcvt<float>(wrapper::vgethigh(inputs))); +} +template <> +inline float16x8_t vector_float_norm_fp16(const float16x8_t &inputs, + const float32x4_t &vec_mean, + const float32x4_t &vec_multip, + const float32x4_t &vec_beta) +{ + const auto input_low = wrapper::vcvt<float>(wrapper::vgetlow(inputs)); + const auto input_high = wrapper::vcvt<float>(wrapper::vgethigh(inputs)); + const auto result_low = wrapper::vcvt<float16_t>(vector_float_norm_fp16(input_low, vec_mean, vec_multip, vec_beta)); + const auto result_high = + wrapper::vcvt<float16_t>(vector_float_norm_fp16(input_high, vec_mean, vec_multip, vec_beta)); + float16x8_t result = wrapper::vcombine(result_low, result_high); + + return result; +} + +template <typename AccType> +void instance_normalization_nchw_fp16( + const ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window) +{ + /** SIMD vector tag type. 
*/ + using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<float16_t, wrapper::traits::BitWidth::W128>; + + // Clear X/Y dimensions on execution window as we handle the planes manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + win.set(Window::DimY, Window::Dimension(0, 1, 1)); + + constexpr int window_step_x = 16 / sizeof(float16_t); + const unsigned int elements_plane = input->info()->dimension(0) * output->info()->dimension(1); + + Iterator input_it(input, win); + execute_window_loop( + win, + [&](const Coordinates &id) + { + Window win_plane = window; + win_plane.set(Window::DimX, Window::Dimension(0, 1, 1)); + win_plane.set(Window::DimZ, Window::Dimension(id[2], id[2] + 1, 1)); + win_plane.set(3, Window::Dimension(id[3], id[3] + 1, 1)); + + Iterator input_plane_it(input, win_plane); + Iterator output_plane_it(output, win_plane); + + auto sum_h_w = static_cast<AccType>(0.f); + auto sum_squares_h_w = static_cast<AccType>(0.f); + + execute_window_loop( + win_plane, + [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast<const float16_t *>(input_plane_it.ptr()); + + auto vec_sum_h_w = wrapper::vdup_n(static_cast<AccType>(0.f), ExactTagType{}); + auto vec_sum_squares_h_w = wrapper::vdup_n(static_cast<AccType>(0.f), ExactTagType{}); + + // Compute S elements per iteration + int x = window.x().start(); + for (; x <= (window.x().end() - window_step_x); x += window_step_x) + { + auto vec_input_val = wrapper::vloadq(input_ptr + x); + vector_float_sum_fp16(vec_sum_h_w, vec_sum_squares_h_w, vec_input_val); + } + + auto vec2_sum_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_h_w), wrapper::vgetlow(vec_sum_h_w)); + auto vec2_sum_squares_h_w = + wrapper::vpadd(wrapper::vgethigh(vec_sum_squares_h_w), wrapper::vgetlow(vec_sum_squares_h_w)); + + vec2_sum_h_w = wrapper::vpadd(vec2_sum_h_w, vec2_sum_h_w); + vec2_sum_squares_h_w = wrapper::vpadd(vec2_sum_squares_h_w, vec2_sum_squares_h_w); + + sum_h_w += wrapper::vgetlane(vec2_sum_h_w, 0); + sum_squares_h_w += wrapper::vgetlane(vec2_sum_squares_h_w, 0); + + // Compute left-over elements + for (; x < window.x().end(); ++x) + { + const auto value = static_cast<AccType>(*(input_ptr + x)); + sum_h_w += value; + sum_squares_h_w += value * value; + } + }, + input_plane_it, output_plane_it); + + const auto mean_h_w = sum_h_w / elements_plane; + const auto var_h_w = sum_squares_h_w / elements_plane - mean_h_w * mean_h_w; + + const auto multip_h_w = gamma / std::sqrt(var_h_w + epsilon); + const auto vec_mean_h_w = wrapper::vdup_n(static_cast<AccType>(mean_h_w), ExactTagType{}); + const auto vec_multip_h_w = wrapper::vdup_n(static_cast<AccType>(multip_h_w), ExactTagType{}); + const auto vec_beta = wrapper::vdup_n(static_cast<AccType>(beta), ExactTagType{}); + + execute_window_loop( + win_plane, + [&](const Coordinates &) + { + auto input_ptr = reinterpret_cast<const float16_t *>(input_plane_it.ptr()); + auto output_ptr = reinterpret_cast<float16_t *>(output_plane_it.ptr()); + + // Compute S elements per iteration + int x = window.x().start(); + for (; x <= (window.x().end() - window_step_x); x += window_step_x) + { + const auto vec_val = wrapper::vloadq(input_ptr + x); + const auto normalized_vec = + vector_float_norm_fp16(vec_val, vec_mean_h_w, vec_multip_h_w, vec_beta); + wrapper::vstore(output_ptr + x, normalized_vec); + } + + // Compute left-over elements + for (; x < window.x().end(); ++x) + { + const auto val = static_cast<AccType>(*(input_ptr + x)); + *(output_ptr + x) = 
static_cast<float16_t>((val - mean_h_w) * multip_h_w + beta); + } + }, + input_plane_it, output_plane_it); + }, + input_it); +} +} // namespace + +void neon_fp16_instancenorm(ITensor *input, + ITensor *output, + float gamma, + float beta, + float epsilon, + bool use_mixed_precision, + const Window &window) +{ + if (use_mixed_precision) + { + return instance_normalization_nchw_fp16<float>(input, output, gamma, beta, epsilon, window); + } + else + { + return instance_normalization_nchw_fp16<float16_t>(input, output, gamma, beta, epsilon, window); + } +} +} // namespace cpu +} // namespace arm_compute +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/instancenorm/generic/neon/fp32.cpp b/src/cpu/kernels/instancenorm/generic/neon/fp32.cpp new file mode 100644 index 0000000000..e1ca05518d --- /dev/null +++ b/src/cpu/kernels/instancenorm/generic/neon/fp32.cpp @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/instancenorm/generic/neon/impl.h" +namespace arm_compute +{ +namespace cpu +{ +void neon_fp32_instancenorm(ITensor *input, + ITensor *output, + float gamma, + float beta, + float epsilon, + bool use_mixed_precision, + const Window &window) +{ + ARM_COMPUTE_UNUSED(use_mixed_precision); + return instance_normalization_nchw<float>(input, output, gamma, beta, epsilon, window); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/instancenorm/generic/neon/impl.cpp b/src/cpu/kernels/instancenorm/generic/neon/impl.cpp new file mode 100644 index 0000000000..515079e1b5 --- /dev/null +++ b/src/cpu/kernels/instancenorm/generic/neon/impl.cpp @@ -0,0 +1,156 @@ +/* + * Copyright (c) 2019-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/instancenorm/generic/neon/impl.h" + +#include "src/core/NEON/wrapper/wrapper.h" + +namespace arm_compute +{ +class ITensor; +class Window; +namespace cpu +{ +template <typename InputType, typename AccType> +void vector_float_sum(AccType &result, AccType &result_square, const InputType &inputs) +{ + result = wrapper::vadd(result, inputs); + result_square = wrapper::vadd(result_square, wrapper::vmul(inputs, inputs)); +} + +template <typename InputType, typename AccType> +InputType +vector_float_norm(const InputType &inputs, const AccType &vec_mean, const AccType &vec_multip, const AccType &vec_beta) +{ + return wrapper::vadd(wrapper::vmul(wrapper::vsub(inputs, vec_mean), vec_multip), vec_beta); +} + +template <typename T, typename AccType> +void instance_normalization_nchw( + ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window) +{ + /** SIMD vector tag type. */ + using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>; + + // Clear X/Y dimensions on execution window as we handle the planes manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + win.set(Window::DimY, Window::Dimension(0, 1, 1)); + + constexpr int window_step_x = 16 / sizeof(T); + const unsigned int elements_plane = input->info()->dimension(0) * output->info()->dimension(1); + + Iterator input_it(input, win); + execute_window_loop( + win, + [&](const Coordinates &id) + { + Window win_plane = window; + win_plane.set(Window::DimX, Window::Dimension(0, 1, 1)); + win_plane.set(Window::DimZ, Window::Dimension(id[2], id[2] + 1, 1)); + win_plane.set(3, Window::Dimension(id[3], id[3] + 1, 1)); + + Iterator input_plane_it(input, win_plane); + Iterator output_plane_it(output, win_plane); + + auto sum_h_w = static_cast<AccType>(0.f); + auto sum_squares_h_w = static_cast<AccType>(0.f); + + execute_window_loop( + win_plane, + [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast<const T *>(input_plane_it.ptr()); + + auto vec_sum_h_w = wrapper::vdup_n(static_cast<AccType>(0.f), ExactTagType{}); + auto vec_sum_squares_h_w = wrapper::vdup_n(static_cast<AccType>(0.f), ExactTagType{}); + + // Compute S elements per iteration + int x = window.x().start(); + for (; x <= (window.x().end() - window_step_x); x += window_step_x) + { + auto vec_input_val = wrapper::vloadq(input_ptr + x); + vector_float_sum(vec_sum_h_w, vec_sum_squares_h_w, vec_input_val); + } + + auto vec2_sum_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_h_w), wrapper::vgetlow(vec_sum_h_w)); + auto vec2_sum_squares_h_w = + wrapper::vpadd(wrapper::vgethigh(vec_sum_squares_h_w), wrapper::vgetlow(vec_sum_squares_h_w)); + + vec2_sum_h_w = wrapper::vpadd(vec2_sum_h_w, vec2_sum_h_w); + vec2_sum_squares_h_w = wrapper::vpadd(vec2_sum_squares_h_w, vec2_sum_squares_h_w); + + sum_h_w += wrapper::vgetlane(vec2_sum_h_w, 0); + sum_squares_h_w += wrapper::vgetlane(vec2_sum_squares_h_w, 0); + + // Compute left-over elements + for (; x < 
window.x().end(); ++x) + { + const auto value = static_cast<AccType>(*(input_ptr + x)); + sum_h_w += value; + sum_squares_h_w += value * value; + } + }, + input_plane_it, output_plane_it); + + const auto mean_h_w = sum_h_w / elements_plane; + const auto var_h_w = sum_squares_h_w / elements_plane - mean_h_w * mean_h_w; + + const auto multip_h_w = gamma / std::sqrt(var_h_w + epsilon); + const auto vec_mean_h_w = wrapper::vdup_n(static_cast<AccType>(mean_h_w), ExactTagType{}); + const auto vec_multip_h_w = wrapper::vdup_n(static_cast<AccType>(multip_h_w), ExactTagType{}); + const auto vec_beta = wrapper::vdup_n(static_cast<AccType>(beta), ExactTagType{}); + + execute_window_loop( + win_plane, + [&](const Coordinates &) + { + auto input_ptr = reinterpret_cast<T *>(input_plane_it.ptr()); + auto output_ptr = reinterpret_cast<T *>(output_plane_it.ptr()); + + // Compute S elements per iteration + int x = window.x().start(); + //auto vec_val = wrapper::vdup_n(static_cast<T>(0.0f), ExactTagType{}); + for (; x <= (window.x().end() - window_step_x); x += window_step_x) + { + const auto vec_val = wrapper::vloadq(input_ptr + x); + const auto normalized_vec = vector_float_norm(vec_val, vec_mean_h_w, vec_multip_h_w, vec_beta); + wrapper::vstore(output_ptr + x, normalized_vec); + } + + // Compute left-over elements + for (; x < window.x().end(); ++x) + { + const auto val = static_cast<AccType>(*(input_ptr + x)); + *(output_ptr + x) = static_cast<T>((val - mean_h_w) * multip_h_w + beta); + } + }, + input_plane_it, output_plane_it); + }, + input_it); +} + +template void instance_normalization_nchw<float>( + ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window); +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/instancenorm/generic/neon/impl.h b/src/cpu/kernels/instancenorm/generic/neon/impl.h new file mode 100644 index 0000000000..e1cc7487f7 --- /dev/null +++ b/src/cpu/kernels/instancenorm/generic/neon/impl.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef SRC_CORE_SVE_KERNELS_INSTANCENORM_IMPL_H +#define SRC_CORE_SVE_KERNELS_INSTANCENORM_IMPL_H +#include "arm_compute/core/Helpers.h" + +#include "arm_neon.h" + +namespace arm_compute +{ +namespace cpu +{ +template <typename T, typename AccType = T> +void instance_normalization_nchw( + ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window); + +template <typename InputType, typename AccType = InputType> +void vector_float_sum(AccType &result, AccType &result_square, const InputType &inputs); + +template <typename InputType, typename AccType = InputType> +InputType +vector_float_norm(const InputType &inputs, const AccType &vec_mean, const AccType &vec_multip, const AccType &vec_beta); +} // namespace cpu +} // namespace arm_compute +#endif //define SRC_CORE_SVE_KERNELS_INSTANCENORM_IMPL_H diff --git a/src/cpu/kernels/instancenorm/list.h b/src/cpu/kernels/instancenorm/list.h new file mode 100644 index 0000000000..51b496c41d --- /dev/null +++ b/src/cpu/kernels/instancenorm/list.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CORE_NEON_KERNELS_INSTANCENORM_LIST_H +#define SRC_CORE_NEON_KERNELS_INSTANCENORM_LIST_H +namespace arm_compute +{ +namespace cpu +{ +#define DECLARE_INSTANCENORM_KERNEL(func_name) \ + void func_name(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision, \ + const Window &window) +DECLARE_INSTANCENORM_KERNEL(neon_fp32_instancenorm); +DECLARE_INSTANCENORM_KERNEL(neon_fp16_instancenorm); +#undef DECLARE_INSTANCENORM_KERNEL +} // namespace cpu +} // namespace arm_compute +#endif //SRC_CORE_NEON_KERNELS_INSTANCENORM_LIST_H diff --git a/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp b/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp new file mode 100644 index 0000000000..296fe88791 --- /dev/null +++ b/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp @@ -0,0 +1,397 @@ +/* + * Copyright (c) 2021-2023 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h" + +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" + +#include "src/core/CPP/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/kernels/assembly/depthwise.hpp" +#include "src/core/utils/AssemblyUtils.h" + +#include "depthwise_common.hpp" +#include <arm_neon.h> + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +using namespace arm_compute::misc::shape_calculator; + +namespace +{ +constexpr unsigned int idx_width = 1; +constexpr unsigned int idx_height = 2; +constexpr unsigned int idx_channels = 0; +constexpr unsigned int idx_batches = 3; + +template <typename TSrc, typename TWeights, typename TDst> +void create_arm_dwc(const ITensorInfo *src, + const ITensorInfo *weights, + ITensorInfo *dst, + const ConvolutionInfo &info, + const CPUInfo &cpu_info, + std::unique_ptr<arm_conv::depthwise::IDepthwiseCommon> &kernel, + std::string &_name) +{ + unsigned int stride_cols{}; + unsigned int stride_rows{}; + std::tie(stride_cols, stride_rows) = info.pad_stride_info.stride(); + + unsigned int dilation_cols = info.dilation.x(); + unsigned int dilation_rows = info.dilation.y(); + + const arm_conv::PaddingValues padding = assembly_utils::map_to_arm_conv_padding(info.pad_stride_info); + + const unsigned int n_batches = src->dimension(idx_batches); + const unsigned int src_rows = src->dimension(idx_height); + const unsigned int src_cols = src->dimension(idx_width); + const unsigned int n_channels = src->dimension(idx_channels); + const unsigned int dst_rows = dst->dimension(idx_height); + const unsigned int dst_cols = dst->dimension(idx_width); + + const unsigned int kernel_cols = weights->dimension(idx_width); + const unsigned int kernel_rows = weights->dimension(idx_height); + + const arm_gemm::Activation activation = assembly_utils::map_to_arm_gemm_activation(info.act_info); + + arm_conv::depthwise::DepthwiseArgs args(&cpu_info, kernel_rows, kernel_cols, stride_rows, stride_cols, + dilation_rows, dilation_cols, n_batches, src_rows, src_cols, n_channels, + dst_rows, dst_cols, info.depth_multiplier, padding, activation, nullptr); + + // Configure assembly pooling kernel + 
auto dwc_kernel_asm = arm_conv::depthwise::depthwise<TSrc, TWeights, TDst>(args); + if (dwc_kernel_asm == nullptr) + { + // Configuration not supported: Leave function unconfigured: + return; + } + + _name = dwc_kernel_asm->name(); + kernel = std::move(dwc_kernel_asm); +} + +template <typename TSrc, typename TWeights, typename TDst> +void create_arm_dwc_quant(const ITensorInfo *src, + const ITensorInfo *weights, + ITensorInfo *dst, + const ConvolutionInfo &info, + const CPUInfo &cpu_info, + std::unique_ptr<arm_conv::depthwise::IDepthwiseCommon> &kernel, + std::vector<int32_t> &multipliers, + std::vector<int32_t> &right_shifts, + std::vector<int32_t> &left_shifts, + std::string &_name) +{ + unsigned int stride_cols{}; + unsigned int stride_rows{}; + std::tie(stride_cols, stride_rows) = info.pad_stride_info.stride(); + + unsigned int dilation_cols = info.dilation.x(); + unsigned int dilation_rows = info.dilation.y(); + + const arm_conv::PaddingValues padding = assembly_utils::map_to_arm_conv_padding(info.pad_stride_info); + + const unsigned int n_batches = src->dimension(idx_batches); + const unsigned int src_rows = src->dimension(idx_height); + const unsigned int src_cols = src->dimension(idx_width); + const unsigned int n_channels = src->dimension(idx_channels); + const unsigned int dst_rows = dst->dimension(idx_height); + const unsigned int dst_cols = dst->dimension(idx_width); + + const unsigned int kernel_cols = weights->dimension(idx_width); + const unsigned int kernel_rows = weights->dimension(idx_height); + + const arm_gemm::Activation activation = assembly_utils::map_to_arm_gemm_activation(info.act_info); + + arm_conv::depthwise::DepthwiseArgs args(&cpu_info, kernel_rows, kernel_cols, stride_rows, stride_cols, + dilation_rows, dilation_cols, n_batches, src_rows, src_cols, n_channels, + dst_rows, dst_cols, info.depth_multiplier, padding, activation, nullptr); + + const auto src_qinfo = src->quantization_info().uniform(); + const auto weights_qinfo = weights->quantization_info(); + const auto dst_qinfo = dst->quantization_info().uniform(); + + const unsigned int num_filters = weights_qinfo.scale().size(); + + multipliers.resize(num_filters); + std::vector<int32_t> dst_shifts(num_filters); + quantization::compute_quantized_multipliers_and_shifts(src, weights, dst, multipliers.data(), dst_shifts.data()); + + // Quantize activation bounds + int32_t min_activation = std::numeric_limits<TSrc>::lowest(); + int32_t max_activation = std::numeric_limits<TSrc>::max(); + if (info.act_info.enabled()) + { + std::tie(min_activation, max_activation) = + get_quantized_activation_min_max(info.act_info, src->data_type(), dst_qinfo); + } + + // Set quantization parameters for assembly kernels + arm_gemm::Requantize32 requant_args{}; + if (is_data_type_quantized_per_channel(weights->data_type())) + { + left_shifts.resize(num_filters); + right_shifts.resize(num_filters); + bool need_left_shift = false; // Select more optimized path if left shift is not needed + for (unsigned int i = 0; i < num_filters; ++i) + { + left_shifts[i] = std::max(-dst_shifts[i], static_cast<int32_t>(0)); + right_shifts[i] = std::min(-dst_shifts[i], static_cast<int32_t>(0)); + if (dst_shifts[i] < 0 && !need_left_shift) + { + need_left_shift = true; + } + } + + requant_args = arm_gemm::Requantize32(nullptr, 0, src_qinfo.offset, weights_qinfo.uniform().offset, + dst_qinfo.offset, (need_left_shift) ? 
left_shifts.data() : nullptr, + right_shifts.data(), multipliers.data(), + static_cast<TSrc>(min_activation), static_cast<TSrc>(max_activation)); + } + else + { + requant_args = arm_gemm::Requantize32(nullptr, 0, src_qinfo.offset, weights_qinfo.uniform().offset, + dst_qinfo.offset, -dst_shifts[0], multipliers[0], + static_cast<TSrc>(min_activation), static_cast<TSrc>(max_activation)); + } + + // Configure assembly pooling kernel with requantization + auto dwc_kernel_asm = + arm_conv::depthwise::depthwise<TSrc, TWeights, TDst, arm_gemm::Requantize32>(args, requant_args); + if (dwc_kernel_asm == nullptr) + { + // Configuration not supported: Leave function unconfigured: + return; + } + _name = dwc_kernel_asm->name(); + kernel = std::move(dwc_kernel_asm); +} +} // namespace + +CpuDepthwiseConv2dAssemblyWrapperKernel::CpuDepthwiseConv2dAssemblyWrapperKernel() + : _kernel_asm(nullptr), _multipliers(), _left_shifts(), _right_shifts(), _name() +{ +} + +CpuDepthwiseConv2dAssemblyWrapperKernel::~CpuDepthwiseConv2dAssemblyWrapperKernel() = default; + +void CpuDepthwiseConv2dAssemblyWrapperKernel::configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *, + ITensorInfo *dst, + const ConvolutionInfo &info, + const CPUInfo &cpu_info) +{ + ARM_COMPUTE_UNUSED(cpu_info); + ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); + + // Destination initialization if not yet initialized + const TensorShape dst_shape = compute_depthwise_convolution_shape(*src, *weights, info); + auto_init_if_empty(*dst, src->clone()->set_tensor_shape(dst_shape)); + _name = "CpuDepthwiseConv2dAssemblyWrapperKernel"; + std::string asm_kernel_name(""); +#if defined(__aarch64__) + switch (src->data_type()) + { + case DataType::QASYMM8: + if (is_data_type_quantized_per_channel(weights->data_type())) + { + create_arm_dwc_quant<uint8_t, int8_t, uint8_t>(src, weights, dst, info, cpu_info, _kernel_asm, + _multipliers, _right_shifts, _left_shifts, + asm_kernel_name); + } + else + { + create_arm_dwc_quant<uint8_t, uint8_t, uint8_t>(src, weights, dst, info, cpu_info, _kernel_asm, + _multipliers, _right_shifts, _left_shifts, + asm_kernel_name); + } + break; + case DataType::QASYMM8_SIGNED: + create_arm_dwc_quant<int8_t, int8_t, int8_t>(src, weights, dst, info, cpu_info, _kernel_asm, _multipliers, + _right_shifts, _left_shifts, asm_kernel_name); + break; +#if defined(ENABLE_FP16_KERNELS) + case DataType::F16: + create_arm_dwc<float16_t, float16_t, float16_t>(src, weights, dst, info, cpu_info, _kernel_asm, + asm_kernel_name); + break; +#endif // defined(ENABLE_FP16_KERNELS) + case DataType::F32: + create_arm_dwc<float, float, float>(src, weights, dst, info, cpu_info, _kernel_asm, asm_kernel_name); + break; + default: + break; + } +#endif // defined(__aarch64__) + + Window win = calculate_max_window(*dst, Steps()); + ICpuKernel::configure(win); + if (_kernel_asm != nullptr) + { + _name += "/" + asm_kernel_name; + } +} + +Status CpuDepthwiseConv2dAssemblyWrapperKernel::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *bias, + const ITensorInfo *dst, + const ConvolutionInfo &info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + +#if !defined(__aarch64__) + ARM_COMPUTE_RETURN_ERROR_MSG("32-bit is not supported by assembly kernels"); +#endif // !defined(__aarch64__) + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); + 
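+    // Constraints specific to the assembly path: only NHWC layout is accepted, and per-channel
+    // quantized weights must provide exactly one scale per output channel.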
ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_layout() != DataLayout::NHWC, + "Only NHWC is supported by assembly kernels"); + + if (is_data_type_quantized_per_channel(weights->data_type())) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->quantization_info().scale().size()); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); + } + + if (bias != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != weights->dimension(0)); + + if (is_data_type_quantized(src->data_type())) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); + } + } + + if (dst->total_size() > 0) + { + const TensorShape dst_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + } + + // Assembly kernels cannot work with padding greater than the kernel. + const auto &padding = info.pad_stride_info; + const auto &dilation = info.dilation; + const auto &wei_shape = weights->tensor_shape(); + + const auto dilated_wei_w = wei_shape[1] + (wei_shape[1] - 1) * (dilation.x() - 1); + const auto dilated_wei_h = wei_shape[2] + (wei_shape[2] - 1) * (dilation.y() - 1); + + ARM_COMPUTE_RETURN_ERROR_ON(padding.pad_left() >= dilated_wei_w || padding.pad_right() >= dilated_wei_w || + padding.pad_top() >= dilated_wei_h || padding.pad_bottom() >= dilated_wei_h); + + return Status{}; +} + +void CpuDepthwiseConv2dAssemblyWrapperKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(_kernel_asm.get()); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_UNUSED(window); + ARM_COMPUTE_UNUSED(info); + + ARM_COMPUTE_ERROR_ON(tensors.empty()); + + const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC_0); + ITensor *dst = tensors.get_tensor(TensorType::ACL_DST); + ITensor *workspace = tensors.get_tensor(TensorType::ACL_INT_0); + ITensor *storage = tensors.get_tensor(TensorType::ACL_INT_1); + + const auto src_ptr = src->buffer() + src->info()->offset_first_element_in_bytes(); + auto dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes(); + auto working_space = workspace->buffer() + workspace->info()->offset_first_element_in_bytes(); + auto parameters_ptr = storage->buffer() + storage->info()->offset_first_element_in_bytes(); + + const auto src_shape = src->info()->tensor_shape(); + const auto dst_shape = dst->info()->tensor_shape(); + const auto src_padding = src->info()->padding(); + const auto dst_padding = dst->info()->padding(); + + const size_t ld_src_col = src_shape[0] + src_padding.left + src_padding.right; + const size_t ld_src_row = ld_src_col * (src_shape[1] + src_padding.top + src_padding.bottom); + const size_t ld_src_batch = ld_src_row * src_shape[2]; + const size_t ld_dst_col = dst_shape[0] + dst_padding.left + dst_padding.right; + const size_t ld_dst_row = ld_dst_col * (dst_shape[1] + dst_padding.top + dst_padding.bottom); + const size_t ld_dst_batch = ld_dst_row * dst_shape[2]; + + _kernel_asm->execute(src_ptr, ld_src_col, ld_src_row, ld_src_batch, parameters_ptr, dst_ptr, ld_dst_col, ld_dst_row, + ld_dst_batch, working_space, 
info.thread_id, info.num_threads); +} + +void CpuDepthwiseConv2dAssemblyWrapperKernel::pack_parameters( + void *parameters_ptr, void *bias_ptr, void *weights_ptr, size_t ld_weights_col, size_t ld_weight_row) +{ + _kernel_asm->pack_parameters(parameters_ptr, bias_ptr, weights_ptr, ld_weights_col, ld_weight_row); +} + +size_t CpuDepthwiseConv2dAssemblyWrapperKernel::get_storage_size() const +{ + return _kernel_asm->get_storage_size(); +} + +size_t CpuDepthwiseConv2dAssemblyWrapperKernel::get_working_size(unsigned int num_threads) const +{ + return _kernel_asm->get_working_size(num_threads); +} + +bool CpuDepthwiseConv2dAssemblyWrapperKernel::is_configured() const +{ + return _kernel_asm != nullptr; +} + +const char *CpuDepthwiseConv2dAssemblyWrapperKernel::name() const +{ + return _name.c_str(); +} + +size_t CpuDepthwiseConv2dAssemblyWrapperKernel::get_mws(const CPUInfo &platform, size_t thread_count) const +{ + ARM_COMPUTE_UNUSED(thread_count); + ARM_COMPUTE_UNUSED(platform); + + return ICPPKernel::default_mws; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h b/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h new file mode 100644 index 0000000000..fadaefb999 --- /dev/null +++ b/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2019-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_ASSEMBLY_WRAPPER_KERNEL_H +#define ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_ASSEMBLY_WRAPPER_KERNEL_H + +#include "arm_compute/core/Types.h" + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" +#include "src/cpu/kernels/CpuKernelSelectionTypes.h" + +namespace arm_conv +{ +namespace depthwise +{ +// Forward declarations +class IDepthwiseCommon; +} // namespace depthwise +} // namespace arm_conv + +namespace arm_compute +{ +struct ConvolutionInfo; + +namespace cpu +{ +namespace kernels +{ +/** This class is a wrapper for the depthwise convolution assembly kernels. 
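+ *
+ * It owns an arm_conv::depthwise::IDepthwiseCommon instance selected at configure() time
+ * and forwards execution to it from run_op().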
*/ +class CpuDepthwiseConv2dAssemblyWrapperKernel final : public ICpuKernel<CpuDepthwiseConv2dAssemblyWrapperKernel> +{ +public: + /** Default constructor */ + CpuDepthwiseConv2dAssemblyWrapperKernel(); + ~CpuDepthwiseConv2dAssemblyWrapperKernel(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDepthwiseConv2dAssemblyWrapperKernel); + + /** Initialise the kernel's src and dst. + * + * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. + * Data type supported: same as @p src or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p src is QASYMM8/QASYMM8_SIGNED. + * @param[in] bias Bias tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed. + * Data type supported: same as @p src, S32 when @p src is QASYMM8/QASYMM8_SIGNED. + * @param[out] dst Destination tensor info. Data type supported: same as @p input. + * @param[in] info Depthwise convolution layer meta-data. + * @param[in] cpu_info CPU information needed to select the most appropriate kernel. + */ + void configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *bias, + ITensorInfo *dst, + const ConvolutionInfo &info, + const CPUInfo &cpu_info); + + /** Indicates whether or not this function can be used to process the given parameters. + * + * Similar to @ref CpuDepthwiseConv2dAssemblyWrapperKernel::configure() + * + * @return a status. + */ + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *bias, + const ITensorInfo *dst, + const ConvolutionInfo &info); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + + /** Pack bias and weights in a storage space for the assembly kernel + * + * @param[in] parameters_ptr Pointer to storage space. + * @param[in] bias_ptr Pointer to bias buffer. + * @param[in] weights_ptr Pointer to weights buffer. + * @param[in] ld_weights_col Columns displacement for the weights tensor. + * @param[in] ld_weights_row Rows displacement for the weights tensor. + */ + void pack_parameters( + void *parameters_ptr, void *bias_ptr, void *weights_ptr, size_t ld_weights_col, size_t ld_weights_row); + + /** Get the amount of storage space required for the rearranged weights and bias. + * + * @return size of workspace + */ + size_t get_storage_size() const; + + /** Get size of the workspace needed by the assembly kernel. + * + * @param[in] num_threads Maximum number of threads that are going to be spawned. + * + * @return size of workspace + */ + size_t get_working_size(unsigned int num_threads) const; + + /** Was the asm kernel successfully configured? + * + * @return True if the asm kernel is configured and ready to run + */ + bool is_configured() const; + + /** Return minimum workload size of the relevant kernel + * + * @param[in] platform The CPU platform used to create the context. + * @param[in] thread_count Number of threads in the execution. + * + * @return[out] small_network_mws Minimum workload size for requsted configuration. 
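+     *
+     * Note: the current implementation returns ICPPKernel::default_mws (no per-kernel tuning).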
+ */ + size_t get_mws(const CPUInfo &platform, size_t thread_count) const override; + +private: + std::unique_ptr<arm_conv::depthwise::IDepthwiseCommon> _kernel_asm; + std::vector<int32_t> _multipliers{}; + std::vector<int32_t> _left_shifts{}; + std::vector<int32_t> _right_shifts{}; + std::string _name{}; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_ASSEMBLY_WRAPPER_KERNEL_H */ diff --git a/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp b/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp new file mode 100644 index 0000000000..2c1cb15786 --- /dev/null +++ b/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp @@ -0,0 +1,320 @@ +/* + * Copyright (c) 2021-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h" + +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" + +#include "src/core/CPP/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/INEKernel.h" + +#include <arm_neon.h> + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +using namespace arm_compute::misc::shape_calculator; + +void CpuPool2dAssemblyWrapperKernel::configure(const ITensorInfo *src, + ITensorInfo *dst, + const PoolingLayerInfo &info, + const CPUInfo &cpu_info) +{ + ARM_COMPUTE_UNUSED(cpu_info); + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + + // dst initialization if not yet initialized + auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_pool_shape(*src, info))); + +#if defined(__aarch64__) + const bool requantize = src->quantization_info() != dst->quantization_info(); + + switch (src->data_type()) + { + case DataType::QASYMM8: + if (requantize) + { + create_arm_pooling_requant<uint8_t, uint8_t>(src, dst, info, cpu_info); + } + else + { + create_arm_pooling<uint8_t, uint8_t>(src, dst, info, cpu_info); + } + break; + case DataType::QASYMM8_SIGNED: + if (requantize) + { + create_arm_pooling_requant<int8_t, int8_t>(src, dst, info, cpu_info); + } + else + { + create_arm_pooling<int8_t, int8_t>(src, dst, info, cpu_info); + } + break; +#if defined(ENABLE_FP16_KERNELS) + case DataType::F16: + create_arm_pooling<float16_t, float16_t>(src, dst, info, cpu_info); + break; +#endif // defined(ENABLE_FP16_KERNELS) + case DataType::F32: + create_arm_pooling<float, float>(src, dst, info, cpu_info); + break; + default: + break; + } +#endif // defined(__aarch64__) + + Window win = calculate_max_window(*dst, Steps()); + INEKernel::configure(win); +} + +Status +CpuPool2dAssemblyWrapperKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); +#ifndef __aarch64__ + ARM_COMPUTE_RETURN_ERROR_MSG("32-bit is not supported by assembly kernels"); +#endif /* __aarch64__ */ + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((src->data_layout() != DataLayout::NHWC) || (info.data_layout != DataLayout::NHWC), + "Only NHWC is supported by assembly kernels"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((info.pool_type != PoolingType::AVG) && (info.pool_type != PoolingType::MAX), + "Only AVG and MAX pooling are supported by assembly kernels"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + is_pool_region_entirely_outside_input(info), + "Pooling region that is entirely outside input tensor is unsupported by assembly kernels"); + + if (dst->total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + + const TensorInfo out_info(compute_pool_shape(*src, info), 1, dst->data_type()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &out_info); + const auto src_qinfo = src->quantization_info().uniform(); + const auto dst_qinfo = dst->quantization_info().uniform(); + + if (src_qinfo != dst_qinfo) + { + const float multiplier = src_qinfo.scale / dst_qinfo.scale; + int32_t dst_multiplier{}; + int32_t dst_shift{}; + 
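+            // Requantization is only possible when the src/dst scale ratio can be expressed
+            // as a fixed-point multiplier and shift.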
ARM_COMPUTE_RETURN_ERROR_ON( + quantization::calculate_quantized_multiplier(multiplier, &dst_multiplier, &dst_shift)); + } + else + { + if (src->data_type() == DataType::QASYMM8) + { + const bool has_padding = info.pad_stride_info.has_padding(); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + !info.exclude_padding && has_padding, + "Assembly kernels do not support padding for QASYMM8 with same src/dst quantization info"); + } + } + } + else + { + if (src->data_type() == DataType::QASYMM8) + { + // If dst is not configured, the quantization info are the same + const bool has_padding = info.pad_stride_info.has_padding(); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + !info.exclude_padding && has_padding, + "Assembly kernels do not support padding for QASYMM8 with same src/dst quantization info"); + } + } + return Status{}; +} + +void CpuPool2dAssemblyWrapperKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(_kernel_asm.get()); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_UNUSED(window); + ARM_COMPUTE_UNUSED(info); + + ARM_COMPUTE_ERROR_ON(tensors.empty()); + + const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC); + ITensor *dst = tensors.get_tensor(TensorType::ACL_DST); + ITensor *workspace = tensors.get_tensor(TensorType::ACL_INT_0); + + const auto in_ptr = src->buffer() + src->info()->offset_first_element_in_bytes(); + auto out_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes(); + auto working_space = + (workspace == nullptr) ? nullptr : workspace->buffer() + workspace->info()->offset_first_element_in_bytes(); + + const auto src_shape = src->info()->tensor_shape(); + const auto dst_shape = dst->info()->tensor_shape(); + const auto src_padding = src->info()->padding(); + const auto dst_padding = dst->info()->padding(); + + const size_t ld_src_col = src_shape[0] + src_padding.left + src_padding.right; + const size_t ld_src_row = ld_src_col * (src_shape[1] + src_padding.top + src_padding.bottom); + const size_t ld_src_batch = ld_src_row * src_shape[2]; + const size_t ld_dst_col = dst_shape[0] + dst_padding.left + dst_padding.right; + const size_t ld_dst_row = ld_dst_col * (dst_shape[1] + dst_padding.top + dst_padding.bottom); + const size_t ld_dst_batch = ld_dst_row * dst_shape[2]; + + _kernel_asm->execute(in_ptr, ld_src_col, ld_src_row, ld_src_batch, out_ptr, ld_dst_col, ld_dst_row, ld_dst_batch, + working_space, info.thread_id, info.num_threads); +} + +size_t CpuPool2dAssemblyWrapperKernel::get_working_size(unsigned int num_threads) const +{ + return _kernel_asm->get_working_size(num_threads); +} + +bool CpuPool2dAssemblyWrapperKernel::is_configured() const +{ + return _kernel_asm != nullptr; +} + +template <typename Typesrc, typename Typedst> +void CpuPool2dAssemblyWrapperKernel::create_arm_pooling(const ITensorInfo *src, + ITensorInfo *dst, + const PoolingLayerInfo &info, + const CPUInfo &cpu_info) +{ + const arm_conv::pooling::PoolingType pool_type = (info.pool_type == PoolingType::AVG) + ? 
arm_conv::pooling::PoolingType::AVERAGE + : arm_conv::pooling::PoolingType::MAX; + + arm_conv::pooling::PoolingWindow window{}; + window.cols = static_cast<unsigned int>(info.pool_size.x()); + window.rows = static_cast<unsigned int>(info.pool_size.y()); + + arm_conv::pooling::PoolingStride stride{}; + std::tie(stride.cols, stride.rows) = info.pad_stride_info.stride(); + + const arm_conv::pooling::PaddingValues padding{info.pad_stride_info.pad_left(), info.pad_stride_info.pad_top(), + info.pad_stride_info.pad_right(), info.pad_stride_info.pad_bottom()}; + + constexpr unsigned int idx_width = 1; + constexpr unsigned int idx_height = 2; + constexpr unsigned int idx_channels = 0; + constexpr unsigned int idx_batches = 3; + + const unsigned int n_batches = src->dimension(idx_batches); + const unsigned int src_rows = src->dimension(idx_height); + const unsigned int src_cols = src->dimension(idx_width); + const unsigned int n_channels = src->dimension(idx_channels); + const unsigned int dst_rows = dst->dimension(idx_height); + const unsigned int dst_cols = dst->dimension(idx_width); + + arm_conv::pooling::PoolingArgs args(&cpu_info, pool_type, window, stride, info.exclude_padding, n_batches, src_rows, + src_cols, n_channels, dst_rows, dst_cols, padding, nullptr); + + // Configure assembly pooling kernel + auto pooling_kernel_asm = arm_conv::pooling::pooling<Typesrc, Typedst>(args); + if (pooling_kernel_asm == nullptr) + { + // Configuration not supported: Leave function unconfigured: + return; + } + + _kernel_asm = std::move(pooling_kernel_asm); +} + +template <typename Typesrc, typename Typedst> +void CpuPool2dAssemblyWrapperKernel::create_arm_pooling_requant(const ITensorInfo *src, + ITensorInfo *dst, + const PoolingLayerInfo &info, + const CPUInfo &cpu_info) +{ + const arm_conv::pooling::PoolingType pool_type = (info.pool_type == PoolingType::AVG) + ? 
arm_conv::pooling::PoolingType::AVERAGE + : arm_conv::pooling::PoolingType::MAX; + + arm_conv::pooling::PoolingWindow window{}; + window.cols = static_cast<unsigned int>(info.pool_size.x()); + window.rows = static_cast<unsigned int>(info.pool_size.y()); + + arm_conv::pooling::PoolingStride stride{}; + std::tie(stride.cols, stride.rows) = info.pad_stride_info.stride(); + + const arm_conv::pooling::PaddingValues padding{info.pad_stride_info.pad_left(), info.pad_stride_info.pad_top(), + info.pad_stride_info.pad_right(), info.pad_stride_info.pad_bottom()}; + + constexpr unsigned int idx_width = 1; + constexpr unsigned int idx_height = 2; + constexpr unsigned int idx_channels = 0; + constexpr unsigned int idx_batches = 3; + + const unsigned int n_batches = src->dimension(idx_batches); + const unsigned int src_rows = src->dimension(idx_height); + const unsigned int src_cols = src->dimension(idx_width); + const unsigned int n_channels = src->dimension(idx_channels); + const unsigned int dst_rows = dst->dimension(idx_height); + const unsigned int dst_cols = dst->dimension(idx_width); + + arm_conv::pooling::PoolingArgs args(&cpu_info, pool_type, window, stride, info.exclude_padding, n_batches, src_rows, + src_cols, n_channels, dst_rows, dst_cols, padding, nullptr); + + const auto src_qinfo = src->quantization_info().uniform(); + const auto dst_qinfo = dst->quantization_info().uniform(); + + const float multiplier = src_qinfo.scale / dst_qinfo.scale; + int32_t dst_multiplier{}; + int32_t dst_shift{}; + quantization::calculate_quantized_multiplier(multiplier, &dst_multiplier, &dst_shift); + + const arm_conv::pooling::Requantize32 requant_args(src_qinfo.offset, dst_qinfo.offset, + dst_shift, // left shift + 0, // right shift + dst_multiplier); + + // Configure assembly pooling kernel with requantization + auto pooling_kernel_asm = + arm_conv::pooling::pooling<Typesrc, Typedst, arm_conv::pooling::Requantize32>(args, requant_args); + if (pooling_kernel_asm == nullptr) + { + // Configuration not supported: Leave function unconfigured: + return; + } + + _kernel_asm = std::move(pooling_kernel_asm); +} + +size_t CpuPool2dAssemblyWrapperKernel::get_mws(const CPUInfo &platform, size_t thread_count) const +{ + ARM_COMPUTE_UNUSED(thread_count); + ARM_COMPUTE_UNUSED(platform); + + return ICPPKernel::default_mws; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h b/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h new file mode 100644 index 0000000000..b4ff1e6f2d --- /dev/null +++ b/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2021-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_POOL2D_ASSEMBLY_WRAPPER_KERNEL_H +#define ARM_COMPUTE_CPU_POOL2D_ASSEMBLY_WRAPPER_KERNEL_H + +#include "arm_compute/core/Types.h" + +#include "src/core/common/Macros.h" +#include "src/core/NEON/kernels/assembly/pooling.hpp" +#include "src/cpu/ICpuKernel.h" +#include "src/cpu/kernels/CpuKernelSelectionTypes.h" + +#include "pool_common.hpp" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** This class is a wrapper for the assembly kernels. + * + * Some kernels were written in assembly and highly optimised for specific + * CPUs like A53 or A55. The arm compute library creates an instance of + * CpuPool2dAssemblyWrapperKernel and other auxiliary data structures to + * execute a single assembly kernel in the context of an NEFunction. + * + */ +class CpuPool2dAssemblyWrapperKernel final : public ICpuKernel<CpuPool2dAssemblyWrapperKernel> +{ +public: + /** Constructor + */ + CpuPool2dAssemblyWrapperKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuPool2dAssemblyWrapperKernel); + + const char *name() const override + { + return "CpuPool2dAssemblyWrapperKernel"; + } + + /** Initialise the kernel's src and dst. + * + * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[out] dst Destination tensor info to store the result of pooling. Data types supported: same as @p src. + * @param[in] info Pooling meta-data. + * @param[in] cpu_info CPU information needed to select the most appropriate kernel. + */ + void configure(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info); + + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuPool2dAssemblyWrapperKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + + /** Get size of the workspace needed by the assembly kernel. + * + * @param[in] num_threads Maximum number of threads that are going to be spawned. + * + * @return size of workspace + */ + size_t get_working_size(unsigned int num_threads) const; + + /** Was the asm kernel successfully configured? + * + * @return True if the asm kernel is configured and ready to run + */ + bool is_configured() const; + +private: + /** Helper function to create the assembly kernel. + * + * @param[in] src Source tensor info. + * @param[in] dst Destination tensor info. + * @param[in] info Pooling layer meta-data. + */ + template <typename Typesrc, typename Typedst> + void + create_arm_pooling(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info); + + /** Helper function to create the assembly kernel with requantization support + * + * @param[in] src Source tensor info. + * @param[in] dst Destination tensor info. + * @param[in] info Pooling layer meta-data. 
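+     * @param[in] cpu_info CPU information needed to select the most appropriate kernel.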
+ */ + template <typename Typesrc, typename Typedst> + void create_arm_pooling_requant(const ITensorInfo *src, + ITensorInfo *dst, + const PoolingLayerInfo &info, + const CPUInfo &cpu_info); + + std::unique_ptr<arm_conv::pooling::IPoolingCommon> _kernel_asm{nullptr}; + + /** Return minimum workload size of the relevant kernel + * + * @param[in] platform The CPU platform used to create the context. + * @param[in] thread_count Number of threads in the execution. + * + * @return[out] small_network_mws Minimum workload size for requsted configuration. + */ + size_t get_mws(const CPUInfo &platform, size_t thread_count) const override; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_POOL2D_ASSEMBLY_WRAPPER_KERNEL_H */ diff --git a/src/cpu/kernels/l2normlayer/generic/neon/fp16.cpp b/src/cpu/kernels/l2normlayer/generic/neon/fp16.cpp new file mode 100644 index 0000000000..6c6527de06 --- /dev/null +++ b/src/cpu/kernels/l2normlayer/generic/neon/fp16.cpp @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + +#include "arm_compute/core/Helpers.h" + +#include "src/cpu/CpuTypes.h" +#include "src/cpu/kernels/l2normlayer/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void neon_fp16_l2_normalize_x( + const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t unused_axis) +{ + ARM_COMPUTE_UNUSED(unused_axis); + return l2_normalize_x<float16_t, 8>(in, sum, out, epsilon, window); +} + +void neon_fp16_l2_normalize_yz( + const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis) +{ + return l2_normalize_yz<float16_t, 8>(in, sum, out, epsilon, window, axis); +} +} // namespace cpu +} // namespace arm_compute +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/l2normlayer/generic/neon/fp32.cpp b/src/cpu/kernels/l2normlayer/generic/neon/fp32.cpp new file mode 100644 index 0000000000..520877068c --- /dev/null +++ b/src/cpu/kernels/l2normlayer/generic/neon/fp32.cpp @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2022 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/Helpers.h" + +#include "src/cpu/kernels/l2normlayer/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void neon_fp32_l2_normalize_x( + const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t unused_axis) +{ + ARM_COMPUTE_UNUSED(unused_axis); + return l2_normalize_x<float, 4>(in, sum, out, epsilon, window); +} + +void neon_fp32_l2_normalize_yz( + const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis) +{ + return l2_normalize_yz<float, 4>(in, sum, out, epsilon, window, axis); +} + +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/l2normlayer/generic/neon/impl.h b/src/cpu/kernels/l2normlayer/generic/neon/impl.h new file mode 100644 index 0000000000..6bd19299b7 --- /dev/null +++ b/src/cpu/kernels/l2normlayer/generic/neon/impl.h @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2017-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef SRC_CORE_NEON_KERNELS_L2NORMLAYER_LIST_H +#define SRC_CORE_NEON_KERNELS_L2NORMLAYER_LIST_H + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" + +#include "src/core/common/Registrars.h" +#include "src/core/NEON/wrapper/wrapper.h" + +#include <cstddef> + +namespace arm_compute +{ +namespace cpu +{ +template <typename T, int S> +void l2_normalize_x(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window) +{ + using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type; + + const int window_step_x = 16 / data_size_from_type(in->info()->data_type()); + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input_it(in, win_collapsed); + Iterator sum_it(sum, win_collapsed); + Iterator output_it(out, win_collapsed); + + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + const auto in_ptr = reinterpret_cast<const T *>(input_it.ptr()); + const auto out_ptr = reinterpret_cast<T *>(output_it.ptr()); + + const T sum_value = *reinterpret_cast<const T *>(sum_it.ptr()); + const T norm_value = static_cast<T>(1.f) / std::sqrt(std::max(sum_value, static_cast<T>(epsilon))); + const auto vec_norm_value = wrapper::vdup_n(norm_value, ExactTagType{}); + + // Compute elements over vector steps + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + wrapper::vstore(out_ptr + x, wrapper::vmul(wrapper::vloadq(in_ptr + x), vec_norm_value)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + out_ptr[x] = in_ptr[x] * norm_value; + } + }, + input_it, sum_it, output_it); +} + +template <typename T, int S> +void l2_normalize_yz( + const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis) +{ + using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type; + + const int window_step_x = 16 / data_size_from_type(in->info()->data_type()); + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Window window_sum(win); + window_sum.set(axis, Window::Dimension(0, 0, 0)); + + Iterator input_it(in, win); + Iterator sum_it(sum, window_sum); + Iterator output_it(out, win); + + const auto vec_eps = wrapper::vdup_n(static_cast<T>(epsilon), ExactTagType{}); + + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto in_ptr = reinterpret_cast<const T *>(input_it.ptr()); + const auto sum_ptr = reinterpret_cast<const T *>(sum_it.ptr()); + const auto out_ptr = reinterpret_cast<T *>(output_it.ptr()); + + // Compute elements over vector steps + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vec_norm_value = wrapper::vinvsqrt(wrapper::vmax(wrapper::vloadq(sum_ptr + x), vec_eps)); + wrapper::vstore(out_ptr + x, wrapper::vmul(wrapper::vloadq(in_ptr + x), vec_norm_value)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const T norm_value = static_cast<T>(1.f) / std::sqrt(std::max(sum_ptr[x], static_cast<T>(epsilon))); + out_ptr[x] = in_ptr[x] * norm_value; + } + }, + input_it, sum_it, output_it); +} +} // namespace cpu +} // 
namespace arm_compute +#endif //SRC_CORE_NEON_KERNELS_L2NORMLAYER_LIST_H diff --git a/src/cpu/kernels/l2normlayer/list.h b/src/cpu/kernels/l2normlayer/list.h new file mode 100644 index 0000000000..e2a879d06e --- /dev/null +++ b/src/cpu/kernels/l2normlayer/list.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CORE_NEON_KERNELS_L2NORMLAYER_LIST_H +#define SRC_CORE_NEON_KERNELS_L2NORMLAYER_LIST_H +namespace arm_compute +{ +namespace cpu +{ +#define DECLARE_L2NORMLAYER_KERNEL(func_name) \ + void func_name(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, \ + size_t axis) + +DECLARE_L2NORMLAYER_KERNEL(neon_fp16_l2_normalize_x); +DECLARE_L2NORMLAYER_KERNEL(neon_fp16_l2_normalize_yz); +DECLARE_L2NORMLAYER_KERNEL(neon_fp32_l2_normalize_x); +DECLARE_L2NORMLAYER_KERNEL(neon_fp32_l2_normalize_yz); + +#undef DECLARE_L2NORMLAYER_KERNEL +} // namespace cpu +} // namespace arm_compute +#endif //SRC_CORE_NEON_KERNELS_L2NORMLAYER_LIST_H diff --git a/src/cpu/kernels/lut/generic/neon/u8.cpp b/src/cpu/kernels/lut/generic/neon/u8.cpp new file mode 100644 index 0000000000..5516f5b33d --- /dev/null +++ b/src/cpu/kernels/lut/generic/neon/u8.cpp @@ -0,0 +1,408 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "src/cpu/kernels/lut/list.h" + +namespace arm_compute +{ +namespace cpu +{ + +#ifdef __aarch64__ + +void lut_u8_neon( + const uint8_t *table, size_t num_strings, size_t string_length, const uint8_t *const *input, uint8_t *const *output) +{ + __asm__ __volatile__("ldr q16, [%x[table], #0x0]\n" + "ldr q17, [%x[table], #0x10]\n" + "mov x23, #0x0\n" + "ldr q18, [%x[table], #0x20]\n" + "ldr q19, [%x[table], #0x30]\n" + "ldr q20, [%x[table], #0x40]\n" + "ldr q21, [%x[table], #0x50]\n" + "ldr q22, [%x[table], #0x60]\n" + "ldr q23, [%x[table], #0x70]\n" + "ldr q24, [%x[table], #0x80]\n" + "ldr q25, [%x[table], #0x90]\n" + "ldr q26, [%x[table], #0xa0]\n" + "ldr q27, [%x[table], #0xb0]\n" + "ldr q28, [%x[table], #0xc0]\n" + "ldr q29, [%x[table], #0xd0]\n" + "ldr q30, [%x[table], #0xe0]\n" + "ldr q31, [%x[table], #0xf0]\n" + "1:" // string loop + "ldr x22, [%x[input], x23, LSL #0x3]\n" + "ldr x21, [%x[output], x23, LSL #0x3]\n" + "movi v11.16b, #0x40\n" + "movi v10.16b, #0x80\n" + "movi v9.16b, #0xc0\n" + "mov x20, %x[string_length]\n" + "2:" // 4 rounds: width loop + "cmp x20, #0x30\n" + "bge 27f\n" + "tbz x20, #5, 10f\n" + "ld1 { v8.16b }, [x22], #0x10\n" + "ld1 { v13.16b }, [x22], #0x10\n" + "tbz x20, #3, 6f\n" + "ldr d12, [x22], #0x8\n" + "tbz x20, #2, 4f\n" + "ld1 { v12.s }[2], [x22], #0x4\n" + "tbz x20, #1, 3f\n" + "ld1 { v12.h }[6], [x22], #0x2\n" + "tbz x20, #0, 26f\n" + "ld1 { v12.b }[14], [x22]\n" + "b 26f\n" + "3:" // 4 rounds: Partial load: partial_1_44 + "tbz x20, #0, 26f\n" + "ld1 { v12.b }[12], [x22]\n" + "b 26f\n" + "4:" // 4 rounds: Partial load: partial_2_40 + "tbz x20, #1, 5f\n" + "ld1 { v12.h }[4], [x22], #0x2\n" + "tbz x20, #0, 26f\n" + "ld1 { v12.b }[10], [x22]\n" + "b 26f\n" + "5:" // 4 rounds: Partial load: partial_1_40 + "tbz x20, #0, 26f\n" + "ld1 { v12.b }[8], [x22]\n" + "b 26f\n" + "6:" // 4 rounds: Partial load: partial_4_32 + "tbz x20, #2, 8f\n" + "ldr s12, [x22], #0x4\n" + "tbz x20, #1, 7f\n" + "ld1 { v12.h }[2], [x22], #0x2\n" + "tbz x20, #0, 26f\n" + "ld1 { v12.b }[6], [x22]\n" + "b 26f\n" + "7:" // 4 rounds: Partial load: partial_1_36 + "tbz x20, #0, 26f\n" + "ld1 { v12.b }[4], [x22]\n" + "b 26f\n" + "8:" // 4 rounds: Partial load: partial_2_32 + "tbz x20, #1, 9f\n" + "ldr h12, [x22], #0x2\n" + "tbz x20, #0, 26f\n" + "ld1 { v12.b }[2], [x22]\n" + "b 26f\n" + "9:" // 4 rounds: Partial load: partial_1_32 + "tbz x20, #0, 26f\n" + "ldr b12, [x22, #0x0]\n" + "b 26f\n" + "10:" // 4 rounds: Partial load: partial_16_0 + "tbz x20, #4, 18f\n" + "ld1 { v8.16b }, [x22], #0x10\n" + "tbz x20, #3, 14f\n" + "ldr d13, [x22], #0x8\n" + "tbz x20, #2, 12f\n" + "ld1 { v13.s }[2], [x22], #0x4\n" + "tbz x20, #1, 11f\n" + "ld1 { v13.h }[6], [x22], #0x2\n" + "tbz x20, #0, 26f\n" + "ld1 { v13.b }[14], [x22]\n" + "b 26f\n" + "11:" // 4 rounds: Partial load: partial_1_28 + "tbz x20, #0, 26f\n" + "ld1 { v13.b }[12], [x22]\n" + "b 26f\n" + "12:" // 4 rounds: Partial load: partial_2_24 + "tbz x20, #1, 13f\n" + "ld1 { v13.h }[4], [x22], #0x2\n" + "tbz x20, #0, 26f\n" + "ld1 { v13.b }[10], [x22]\n" + "b 26f\n" + "13:" // 4 rounds: Partial load: partial_1_24 + "tbz x20, #0, 26f\n" + "ld1 { v13.b }[8], [x22]\n" + "b 26f\n" + "14:" // 4 rounds: Partial load: partial_4_16 + "tbz x20, #2, 16f\n" + "ldr s13, [x22], #0x4\n" + "tbz x20, #1, 15f\n" + "ld1 { v13.h }[2], [x22], #0x2\n" + "tbz x20, #0, 26f\n" + "ld1 { v13.b }[6], [x22]\n" + "b 26f\n" + "15:" // 4 rounds: Partial load: partial_1_20 + "tbz x20, #0, 26f\n" + "ld1 { v13.b }[4], [x22]\n" + "b 26f\n" + "16:" // 4 rounds: Partial 
load: partial_2_16 + "tbz x20, #1, 17f\n" + "ldr h13, [x22], #0x2\n" + "tbz x20, #0, 26f\n" + "ld1 { v13.b }[2], [x22]\n" + "b 26f\n" + "17:" // 4 rounds: Partial load: partial_1_16 + "tbz x20, #0, 26f\n" + "ldr b13, [x22, #0x0]\n" + "b 26f\n" + "18:" // 4 rounds: Partial load: partial_8_0 + "tbz x20, #3, 22f\n" + "ldr d8, [x22], #0x8\n" + "tbz x20, #2, 20f\n" + "ld1 { v8.s }[2], [x22], #0x4\n" + "tbz x20, #1, 19f\n" + "ld1 { v8.h }[6], [x22], #0x2\n" + "tbz x20, #0, 26f\n" + "ld1 { v8.b }[14], [x22]\n" + "b 26f\n" + "19:" // 4 rounds: Partial load: partial_1_12 + "tbz x20, #0, 26f\n" + "ld1 { v8.b }[12], [x22]\n" + "b 26f\n" + "20:" // 4 rounds: Partial load: partial_2_8 + "tbz x20, #1, 21f\n" + "ld1 { v8.h }[4], [x22], #0x2\n" + "tbz x20, #0, 26f\n" + "ld1 { v8.b }[10], [x22]\n" + "b 26f\n" + "21:" // 4 rounds: Partial load: partial_1_8 + "tbz x20, #0, 26f\n" + "ld1 { v8.b }[8], [x22]\n" + "b 26f\n" + "22:" // 4 rounds: Partial load: partial_4_0 + "tbz x20, #2, 24f\n" + "ldr s8, [x22], #0x4\n" + "tbz x20, #1, 23f\n" + "ld1 { v8.h }[2], [x22], #0x2\n" + "tbz x20, #0, 26f\n" + "ld1 { v8.b }[6], [x22]\n" + "b 26f\n" + "23:" // 4 rounds: Partial load: partial_1_4 + "tbz x20, #0, 26f\n" + "ld1 { v8.b }[4], [x22]\n" + "b 26f\n" + "24:" // 4 rounds: Partial load: partial_2_0 + "tbz x20, #1, 25f\n" + "ldr h8, [x22], #0x2\n" + "tbz x20, #0, 26f\n" + "ld1 { v8.b }[2], [x22]\n" + "b 26f\n" + "25:" // 4 rounds: Partial load: partial_1_0 + "ldr b8, [x22, #0x0]\n" + "26:" // 4 rounds: Partial load: Done + "b 28f\n" + "27:" // 4 rounds: Full load + "ldr q8, [x22, #0x0]\n" + "ldr q13, [x22, #0x10]\n" + "ldr q12, [x22, #0x20]\n" + "add x22, x22, #0x30\n" + "28:" // 4 rounds: Load done + "sub v0.16b, v8.16b, v11.16b\n" + "sub v7.16b, v8.16b, v10.16b\n" + "tbl v0.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v0.16b\n" + "sub v6.16b, v8.16b, v9.16b\n" + "sub v5.16b, v13.16b, v11.16b\n" + "tbl v8.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v8.16b\n" + "sub v4.16b, v13.16b, v10.16b\n" + "sub v3.16b, v13.16b, v9.16b\n" + "tbl v7.16b, { v24.16b, v25.16b, v26.16b, v27.16b }, v7.16b\n" + "sub v2.16b, v12.16b, v11.16b\n" + "sub v1.16b, v12.16b, v10.16b\n" + "tbl v6.16b, { v28.16b, v29.16b, v30.16b, v31.16b }, v6.16b\n" + "tbl v13.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v13.16b\n" + "tbl v5.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v5.16b\n" + "orr v8.16b, v8.16b, v0.16b\n" + "sub v0.16b, v12.16b, v9.16b\n" + "tbl v4.16b, { v24.16b, v25.16b, v26.16b, v27.16b }, v4.16b\n" + "tbl v3.16b, { v28.16b, v29.16b, v30.16b, v31.16b }, v3.16b\n" + "tbl v12.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v12.16b\n" + "tbl v2.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v2.16b\n" + "orr v7.16b, v7.16b, v6.16b\n" + "tbl v1.16b, { v24.16b, v25.16b, v26.16b, v27.16b }, v1.16b\n" + "tbl v0.16b, { v28.16b, v29.16b, v30.16b, v31.16b }, v0.16b\n" + "orr v13.16b, v13.16b, v5.16b\n" + "orr v4.16b, v4.16b, v3.16b\n" + "orr v12.16b, v12.16b, v2.16b\n" + "cmp x20, #0x30\n" + "orr v1.16b, v1.16b, v0.16b\n" + "orr v8.16b, v8.16b, v7.16b\n" + "orr v13.16b, v13.16b, v4.16b\n" + "orr v12.16b, v12.16b, v1.16b\n" + "bge 53f\n" + "tbz x20, #5, 36f\n" + "st1 { v8.16b }, [x21], #0x10\n" + "st1 { v13.16b }, [x21], #0x10\n" + "tbz x20, #3, 32f\n" + "str d12, [x21], #0x8\n" + "tbz x20, #2, 30f\n" + "st1 { v12.s }[2], [x21], #0x4\n" + "tbz x20, #1, 29f\n" + "st1 { v12.h }[6], [x21], #0x2\n" + "tbz x20, #0, 52f\n" + "st1 { v12.b }[14], [x21]\n" + "b 52f\n" + "29:" // 4 rounds: Partial writeback: partial_1_44 + "tbz x20, #0, 52f\n" + "st1 { v12.b 
}[12], [x21]\n" + "b 52f\n" + "30:" // 4 rounds: Partial writeback: partial_2_40 + "tbz x20, #1, 31f\n" + "st1 { v12.h }[4], [x21], #0x2\n" + "tbz x20, #0, 52f\n" + "st1 { v12.b }[10], [x21]\n" + "b 52f\n" + "31:" // 4 rounds: Partial writeback: partial_1_40 + "tbz x20, #0, 52f\n" + "st1 { v12.b }[8], [x21]\n" + "b 52f\n" + "32:" // 4 rounds: Partial writeback: partial_4_32 + "tbz x20, #2, 34f\n" + "str s12, [x21], #0x4\n" + "tbz x20, #1, 33f\n" + "st1 { v12.h }[2], [x21], #0x2\n" + "tbz x20, #0, 52f\n" + "st1 { v12.b }[6], [x21]\n" + "b 52f\n" + "33:" // 4 rounds: Partial writeback: partial_1_36 + "tbz x20, #0, 52f\n" + "st1 { v12.b }[4], [x21]\n" + "b 52f\n" + "34:" // 4 rounds: Partial writeback: partial_2_32 + "tbz x20, #1, 35f\n" + "str h12, [x21], #0x2\n" + "tbz x20, #0, 52f\n" + "st1 { v12.b }[2], [x21]\n" + "b 52f\n" + "35:" // 4 rounds: Partial writeback: partial_1_32 + "tbz x20, #0, 52f\n" + "str b12, [x21, #0x0]\n" + "b 52f\n" + "36:" // 4 rounds: Partial writeback: partial_16_0 + "tbz x20, #4, 44f\n" + "st1 { v8.16b }, [x21], #0x10\n" + "tbz x20, #3, 40f\n" + "str d13, [x21], #0x8\n" + "tbz x20, #2, 38f\n" + "st1 { v13.s }[2], [x21], #0x4\n" + "tbz x20, #1, 37f\n" + "st1 { v13.h }[6], [x21], #0x2\n" + "tbz x20, #0, 52f\n" + "st1 { v13.b }[14], [x21]\n" + "b 52f\n" + "37:" // 4 rounds: Partial writeback: partial_1_28 + "tbz x20, #0, 52f\n" + "st1 { v13.b }[12], [x21]\n" + "b 52f\n" + "38:" // 4 rounds: Partial writeback: partial_2_24 + "tbz x20, #1, 39f\n" + "st1 { v13.h }[4], [x21], #0x2\n" + "tbz x20, #0, 52f\n" + "st1 { v13.b }[10], [x21]\n" + "b 52f\n" + "39:" // 4 rounds: Partial writeback: partial_1_24 + "tbz x20, #0, 52f\n" + "st1 { v13.b }[8], [x21]\n" + "b 52f\n" + "40:" // 4 rounds: Partial writeback: partial_4_16 + "tbz x20, #2, 42f\n" + "str s13, [x21], #0x4\n" + "tbz x20, #1, 41f\n" + "st1 { v13.h }[2], [x21], #0x2\n" + "tbz x20, #0, 52f\n" + "st1 { v13.b }[6], [x21]\n" + "b 52f\n" + "41:" // 4 rounds: Partial writeback: partial_1_20 + "tbz x20, #0, 52f\n" + "st1 { v13.b }[4], [x21]\n" + "b 52f\n" + "42:" // 4 rounds: Partial writeback: partial_2_16 + "tbz x20, #1, 43f\n" + "str h13, [x21], #0x2\n" + "tbz x20, #0, 52f\n" + "st1 { v13.b }[2], [x21]\n" + "b 52f\n" + "43:" // 4 rounds: Partial writeback: partial_1_16 + "tbz x20, #0, 52f\n" + "str b13, [x21, #0x0]\n" + "b 52f\n" + "44:" // 4 rounds: Partial writeback: partial_8_0 + "tbz x20, #3, 48f\n" + "str d8, [x21], #0x8\n" + "tbz x20, #2, 46f\n" + "st1 { v8.s }[2], [x21], #0x4\n" + "tbz x20, #1, 45f\n" + "st1 { v8.h }[6], [x21], #0x2\n" + "tbz x20, #0, 52f\n" + "st1 { v8.b }[14], [x21]\n" + "b 52f\n" + "45:" // 4 rounds: Partial writeback: partial_1_12 + "tbz x20, #0, 52f\n" + "st1 { v8.b }[12], [x21]\n" + "b 52f\n" + "46:" // 4 rounds: Partial writeback: partial_2_8 + "tbz x20, #1, 47f\n" + "st1 { v8.h }[4], [x21], #0x2\n" + "tbz x20, #0, 52f\n" + "st1 { v8.b }[10], [x21]\n" + "b 52f\n" + "47:" // 4 rounds: Partial writeback: partial_1_8 + "tbz x20, #0, 52f\n" + "st1 { v8.b }[8], [x21]\n" + "b 52f\n" + "48:" // 4 rounds: Partial writeback: partial_4_0 + "tbz x20, #2, 50f\n" + "str s8, [x21], #0x4\n" + "tbz x20, #1, 49f\n" + "st1 { v8.h }[2], [x21], #0x2\n" + "tbz x20, #0, 52f\n" + "st1 { v8.b }[6], [x21]\n" + "b 52f\n" + "49:" // 4 rounds: Partial writeback: partial_1_4 + "tbz x20, #0, 52f\n" + "st1 { v8.b }[4], [x21]\n" + "b 52f\n" + "50:" // 4 rounds: Partial writeback: partial_2_0 + "tbz x20, #1, 51f\n" + "str h8, [x21], #0x2\n" + "tbz x20, #0, 52f\n" + "st1 { v8.b }[2], [x21]\n" + "b 52f\n" + "51:" // 4 
rounds: Partial writeback: partial_1_0 + "str b8, [x21, #0x0]\n" + "52:" // 4 rounds: Partial writeback: Done + "b 54f\n" + "53:" // 4 rounds: Full writeback + "str q8, [x21, #0x0]\n" + "str q13, [x21, #0x10]\n" + "str q12, [x21, #0x20]\n" + "add x21, x21, #0x30\n" + "54:" // 4 rounds: Writeback done + "subs x20, x20, #0x30\n" + "bgt 2b\n" + "add x23, x23, #0x1\n" + "cmp x23, %x[num_strings]\n" + "bne 1b\n" + : + : [input] "r"(input), [num_strings] "r"(num_strings), [output] "r"(output), + [string_length] "r"(string_length), [table] "r"(table) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", + "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23"); +} + +#endif // __aarch64__ + +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/lut/generic/sve/u16.cpp b/src/cpu/kernels/lut/generic/sve/u16.cpp new file mode 100644 index 0000000000..75b8dcaae2 --- /dev/null +++ b/src/cpu/kernels/lut/generic/sve/u16.cpp @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/Error.h" + +#include "src/cpu/kernels/lut/list.h" + +#ifdef __aarch64__ +#ifdef ARM_COMPUTE_ENABLE_SVE + +#include <arm_sve.h> + +namespace arm_compute +{ +namespace cpu +{ +void lut_u16_sve(const uint16_t *table, size_t num_strings, size_t size, const uint16_t *input, uint16_t *output) +{ + int64_t cnth = svcnth(); + int64_t tail = size & (4 * cnth - 1); + int64_t count = size - tail; + int64_t pos = 0; + ARM_COMPUTE_UNUSED(num_strings); + __asm __volatile("cbz %[count], 2f\n" + "mov z31.s, #0\n" + "cnth x7, ALL, MUL #4\n" + "cntb x8, ALL, MUL #4\n" + "ptrue p0.b\n" + "1:" + "ld1h z0.h, p0/z, [%[input]]\n" + "ld1h z1.h, p0/z, [%[input], #1, MUL VL]\n" + "ld1h z2.h, p0/z, [%[input], #2, MUL VL]\n" + "ld1h z3.h, p0/z, [%[input], #3, MUL VL]\n" + "add %[input], %[input], x8\n" + + "zip1 z8.h, z0.h, z31.h\n" + "ld1h z8.s, p0/z, [%[table], z8.s, UXTW #1]\n" + "zip2 z0.h, z0.h, z31.h\n" + "ld1h z0.s, p0/z, [%[table], z0.s, UXTW #1]\n" + "uzp1 z0.h, z8.h, z0.h\n" + "st1h z0.h, p0, [%[output]]\n" + + "zip1 z10.h, z1.h, z31.h\n" + "ld1h z10.s, p0/z, [%[table], z10.s, UXTW #1]\n" + "zip2 z1.h, z1.h, z31.h\n" + "ld1h z1.s, p0/z, [%[table], z1.s, UXTW #1]\n" + "uzp1 z1.h, z10.h, z1.h\n" + "st1h z1.h, p0, [%[output], #1, MUL VL]\n" + + "zip1 z12.h, z2.h, z31.h\n" + "ld1h z12.s, p0/z, [%[table], z12.s, UXTW #1]\n" + "zip2 z2.h, z2.h, z31.h\n" + "ld1h z2.s, p0/z, [%[table], z2.s, UXTW #1]\n" + "uzp1 z2.h, z12.h, z2.h\n" + "st1h z2.h, p0, [%[output], #2, MUL VL]\n" + + "zip1 z14.h, z3.h, z31.h\n" + "ld1h z14.s, p0/z, [%[table], z14.s, UXTW #1]\n" + "zip2 z3.h, z3.h, z31.h\n" + "ld1h z3.s, p0/z, [%[table], z3.s, UXTW #1]\n" + "uzp1 z3.h, z14.h, z3.h\n" + "st1h z3.h, p0, [%[output], #3, MUL VL]\n" + + "add %[pos], %[pos], x7\n" + "add %[output], %[output], x8\n" + "cmp %[pos], %[count]\n" + "blt 1b\n" + "2:\n" + : [count] "+r"(count), [input] "+r"(input), [output] "+r"(output), [pos] "+r"(pos) + : [table] "r"(table) + : "memory", "cc", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", + "z14", "z31", "p0", "p1", "z2", "z3", "z4", "x7", "x8"); + for (int i = 0; i < tail; i++) + { + output[i] = table[input[i]]; + } +} + +} // namespace cpu +} // namespace arm_compute + +#endif // ARM_COMPUTE_ENABLE_SVE +#endif // __aarch64__ diff --git a/src/cpu/kernels/lut/generic/sve2/u8.cpp b/src/cpu/kernels/lut/generic/sve2/u8.cpp new file mode 100644 index 0000000000..ee8572703e --- /dev/null +++ b/src/cpu/kernels/lut/generic/sve2/u8.cpp @@ -0,0 +1,644 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/cpu/kernels/lut/list.h" + +#ifdef __aarch64__ +#ifdef ARM_COMPUTE_ENABLE_SVE + +namespace arm_compute +{ +namespace cpu +{ +void lut_u8_sve2( + const uint8_t *table, size_t num_strings, size_t string_length, const uint8_t *const *input, uint8_t *const *output) +{ + __asm__ __volatile__( + "ptrue p0.b\n" + "cntd x25\n" + "addvl %x[table], %x[table], #8\n" + "ld1b { z16.b }, p0/Z, [%x[table], #-8, MUL VL]\n" + "tbnz x25, #5, 1f\n" + "ld1b { z17.b }, p0/Z, [%x[table], #-7, MUL VL]\n" + "tbnz x25, #4, 1f\n" + "ld1b { z18.b }, p0/Z, [%x[table], #-6, MUL VL]\n" + "ld1b { z19.b }, p0/Z, [%x[table], #-5, MUL VL]\n" + "tbnz x25, #3, 1f\n" + "ld1b { z20.b }, p0/Z, [%x[table], #-4, MUL VL]\n" + "ld1b { z21.b }, p0/Z, [%x[table], #-3, MUL VL]\n" + "ld1b { z22.b }, p0/Z, [%x[table], #-2, MUL VL]\n" + "ld1b { z23.b }, p0/Z, [%x[table], #-1, MUL VL]\n" + "tbnz x25, #2, 1f\n" + "ld1b { z24.b }, p0/Z, [%x[table]]\n" + "ld1b { z25.b }, p0/Z, [%x[table], #1, MUL VL]\n" + "ld1b { z26.b }, p0/Z, [%x[table], #2, MUL VL]\n" + "ld1b { z27.b }, p0/Z, [%x[table], #3, MUL VL]\n" + "ld1b { z28.b }, p0/Z, [%x[table], #4, MUL VL]\n" + "ld1b { z29.b }, p0/Z, [%x[table], #5, MUL VL]\n" + "ld1b { z30.b }, p0/Z, [%x[table], #6, MUL VL]\n" + "ld1b { z31.b }, p0/Z, [%x[table], #7, MUL VL]\n" + "1:" // Table load done + "mov x24, #0x0\n" + "2:" // string loop + "ldr x23, [%x[input], x24, LSL #0x3]\n" + "ldr x22, [%x[output], x24, LSL #0x3]\n" + "tbnz x25, #5, 14f\n" + "tbnz x25, #4, 11f\n" + "tbnz x25, #3, 8f\n" + "tbnz x25, #2, 5f\n" + "mov z12.b, #0x10\n" + "mov x21, %x[string_length]\n" + "ptrue p5.b\n" + "ptrue p4.b\n" + "ptrue p3.b\n" + "ptrue p2.b\n" + "ptrue p1.b\n" + "ptrue p0.b\n" + "3:" // 16 rounds: width loop + "addvl x20, x21, #-6\n" + "cmp x20, XZR\n" + "bge 4f\n" + "mov x20, #0x0\n" + "addvl x20, x20, #1\n" + "whilelt p5.b, XZR, x21\n" + "whilelt p4.b, x20, x21\n" + "addvl x20, x20, #1\n" + "whilelt p3.b, x20, x21\n" + "addvl x20, x20, #1\n" + "whilelt p2.b, x20, x21\n" + "addvl x20, x20, #1\n" + "whilelt p1.b, x20, x21\n" + "addvl x20, x20, #1\n" + "whilelt p0.b, x20, x21\n" + "4:" // 16 rounds: predicate OK + "ld1b { z11.b }, p5/Z, [x23]\n" + "ld1b { z10.b }, p4/Z, [x23, #1, MUL VL]\n" + "tbl z9.b, { z16.b }, z11.b\n" + "ld1b { z8.b }, p3/Z, [x23, #2, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x23, #3, MUL VL]\n" + "sub z11.b, z11.b, z12.b\n" + "ld1b { z6.b }, p1/Z, [x23, #4, MUL VL]\n" + "ld1b { z5.b }, p0/Z, [x23, #5, MUL VL]\n" + "tbl z4.b, { z16.b }, z10.b\n" + "sub z10.b, z10.b, z12.b\n" + "tbl z3.b, { z16.b }, z8.b\n" + "sub z8.b, z8.b, z12.b\n" + "tbl z2.b, { z16.b }, z7.b\n" + "sub z7.b, z7.b, z12.b\n" + "tbl z1.b, { z16.b }, z6.b\n" + "sub z6.b, z6.b, z12.b\n" + "tbl z0.b, { z16.b }, z5.b\n" + "sub z5.b, z5.b, z12.b\n" + ".inst 0x052b2e29 // tbx z9.b, z17.b, z11.b\n" + "sub z11.b, z11.b, z12.b\n" + ".inst 0x052a2e24 // tbx z4.b, z17.b, z10.b\n" + "sub z10.b, z10.b, z12.b\n" + ".inst 0x05282e23 // tbx z3.b, z17.b, z8.b\n" + "sub z8.b, z8.b, z12.b\n" + ".inst 0x05272e22 // tbx z2.b, z17.b, z7.b\n" + "sub z7.b, z7.b, z12.b\n" + ".inst 0x05262e21 // tbx z1.b, z17.b, z6.b\n" + "sub z6.b, z6.b, z12.b\n" + ".inst 0x05252e20 // tbx z0.b, z17.b, z5.b\n" + "sub z5.b, z5.b, z12.b\n" + ".inst 0x052b2e49 // tbx z9.b, z18.b, 
z11.b\n" + "sub z11.b, z11.b, z12.b\n" + ".inst 0x052a2e44 // tbx z4.b, z18.b, z10.b\n" + "sub z10.b, z10.b, z12.b\n" + ".inst 0x05282e43 // tbx z3.b, z18.b, z8.b\n" + "sub z8.b, z8.b, z12.b\n" + ".inst 0x05272e42 // tbx z2.b, z18.b, z7.b\n" + "sub z7.b, z7.b, z12.b\n" + ".inst 0x05262e41 // tbx z1.b, z18.b, z6.b\n" + "sub z6.b, z6.b, z12.b\n" + ".inst 0x05252e40 // tbx z0.b, z18.b, z5.b\n" + "sub z5.b, z5.b, z12.b\n" + ".inst 0x052b2e69 // tbx z9.b, z19.b, z11.b\n" + "sub z11.b, z11.b, z12.b\n" + ".inst 0x052a2e64 // tbx z4.b, z19.b, z10.b\n" + "sub z10.b, z10.b, z12.b\n" + ".inst 0x05282e63 // tbx z3.b, z19.b, z8.b\n" + "sub z8.b, z8.b, z12.b\n" + ".inst 0x05272e62 // tbx z2.b, z19.b, z7.b\n" + "sub z7.b, z7.b, z12.b\n" + ".inst 0x05262e61 // tbx z1.b, z19.b, z6.b\n" + "sub z6.b, z6.b, z12.b\n" + ".inst 0x05252e60 // tbx z0.b, z19.b, z5.b\n" + "sub z5.b, z5.b, z12.b\n" + ".inst 0x052b2e89 // tbx z9.b, z20.b, z11.b\n" + "sub z11.b, z11.b, z12.b\n" + ".inst 0x052a2e84 // tbx z4.b, z20.b, z10.b\n" + "sub z10.b, z10.b, z12.b\n" + ".inst 0x05282e83 // tbx z3.b, z20.b, z8.b\n" + "sub z8.b, z8.b, z12.b\n" + ".inst 0x05272e82 // tbx z2.b, z20.b, z7.b\n" + "sub z7.b, z7.b, z12.b\n" + ".inst 0x05262e81 // tbx z1.b, z20.b, z6.b\n" + "sub z6.b, z6.b, z12.b\n" + ".inst 0x05252e80 // tbx z0.b, z20.b, z5.b\n" + "sub z5.b, z5.b, z12.b\n" + ".inst 0x052b2ea9 // tbx z9.b, z21.b, z11.b\n" + "sub z11.b, z11.b, z12.b\n" + ".inst 0x052a2ea4 // tbx z4.b, z21.b, z10.b\n" + "sub z10.b, z10.b, z12.b\n" + ".inst 0x05282ea3 // tbx z3.b, z21.b, z8.b\n" + "sub z8.b, z8.b, z12.b\n" + ".inst 0x05272ea2 // tbx z2.b, z21.b, z7.b\n" + "sub z7.b, z7.b, z12.b\n" + ".inst 0x05262ea1 // tbx z1.b, z21.b, z6.b\n" + "sub z6.b, z6.b, z12.b\n" + ".inst 0x05252ea0 // tbx z0.b, z21.b, z5.b\n" + "sub z5.b, z5.b, z12.b\n" + ".inst 0x052b2ec9 // tbx z9.b, z22.b, z11.b\n" + "sub z11.b, z11.b, z12.b\n" + ".inst 0x052a2ec4 // tbx z4.b, z22.b, z10.b\n" + "sub z10.b, z10.b, z12.b\n" + ".inst 0x05282ec3 // tbx z3.b, z22.b, z8.b\n" + "sub z8.b, z8.b, z12.b\n" + ".inst 0x05272ec2 // tbx z2.b, z22.b, z7.b\n" + "sub z7.b, z7.b, z12.b\n" + ".inst 0x05262ec1 // tbx z1.b, z22.b, z6.b\n" + "sub z6.b, z6.b, z12.b\n" + ".inst 0x05252ec0 // tbx z0.b, z22.b, z5.b\n" + "sub z5.b, z5.b, z12.b\n" + ".inst 0x052b2ee9 // tbx z9.b, z23.b, z11.b\n" + "sub z11.b, z11.b, z12.b\n" + ".inst 0x052a2ee4 // tbx z4.b, z23.b, z10.b\n" + "sub z10.b, z10.b, z12.b\n" + ".inst 0x05282ee3 // tbx z3.b, z23.b, z8.b\n" + "sub z8.b, z8.b, z12.b\n" + ".inst 0x05272ee2 // tbx z2.b, z23.b, z7.b\n" + "sub z7.b, z7.b, z12.b\n" + ".inst 0x05262ee1 // tbx z1.b, z23.b, z6.b\n" + "sub z6.b, z6.b, z12.b\n" + ".inst 0x05252ee0 // tbx z0.b, z23.b, z5.b\n" + "sub z5.b, z5.b, z12.b\n" + ".inst 0x052b2f09 // tbx z9.b, z24.b, z11.b\n" + "sub z11.b, z11.b, z12.b\n" + ".inst 0x052a2f04 // tbx z4.b, z24.b, z10.b\n" + "sub z10.b, z10.b, z12.b\n" + ".inst 0x05282f03 // tbx z3.b, z24.b, z8.b\n" + "sub z8.b, z8.b, z12.b\n" + ".inst 0x05272f02 // tbx z2.b, z24.b, z7.b\n" + "sub z7.b, z7.b, z12.b\n" + ".inst 0x05262f01 // tbx z1.b, z24.b, z6.b\n" + "sub z6.b, z6.b, z12.b\n" + ".inst 0x05252f00 // tbx z0.b, z24.b, z5.b\n" + "sub z5.b, z5.b, z12.b\n" + ".inst 0x052b2f29 // tbx z9.b, z25.b, z11.b\n" + "sub z11.b, z11.b, z12.b\n" + ".inst 0x052a2f24 // tbx z4.b, z25.b, z10.b\n" + "sub z10.b, z10.b, z12.b\n" + ".inst 0x05282f23 // tbx z3.b, z25.b, z8.b\n" + "sub z8.b, z8.b, z12.b\n" + ".inst 0x05272f22 // tbx z2.b, z25.b, z7.b\n" + "sub z7.b, z7.b, z12.b\n" + ".inst 0x05262f21 // tbx z1.b, z25.b, z6.b\n" + 
"sub z6.b, z6.b, z12.b\n" + ".inst 0x05252f20 // tbx z0.b, z25.b, z5.b\n" + "sub z5.b, z5.b, z12.b\n" + ".inst 0x052b2f49 // tbx z9.b, z26.b, z11.b\n" + "sub z11.b, z11.b, z12.b\n" + ".inst 0x052a2f44 // tbx z4.b, z26.b, z10.b\n" + "sub z10.b, z10.b, z12.b\n" + ".inst 0x05282f43 // tbx z3.b, z26.b, z8.b\n" + "sub z8.b, z8.b, z12.b\n" + ".inst 0x05272f42 // tbx z2.b, z26.b, z7.b\n" + "sub z7.b, z7.b, z12.b\n" + ".inst 0x05262f41 // tbx z1.b, z26.b, z6.b\n" + "sub z6.b, z6.b, z12.b\n" + ".inst 0x05252f40 // tbx z0.b, z26.b, z5.b\n" + "sub z5.b, z5.b, z12.b\n" + ".inst 0x052b2f69 // tbx z9.b, z27.b, z11.b\n" + "sub z11.b, z11.b, z12.b\n" + ".inst 0x052a2f64 // tbx z4.b, z27.b, z10.b\n" + "sub z10.b, z10.b, z12.b\n" + ".inst 0x05282f63 // tbx z3.b, z27.b, z8.b\n" + "sub z8.b, z8.b, z12.b\n" + ".inst 0x05272f62 // tbx z2.b, z27.b, z7.b\n" + "sub z7.b, z7.b, z12.b\n" + ".inst 0x05262f61 // tbx z1.b, z27.b, z6.b\n" + "sub z6.b, z6.b, z12.b\n" + ".inst 0x05252f60 // tbx z0.b, z27.b, z5.b\n" + "sub z5.b, z5.b, z12.b\n" + ".inst 0x052b2f89 // tbx z9.b, z28.b, z11.b\n" + "sub z11.b, z11.b, z12.b\n" + ".inst 0x052a2f84 // tbx z4.b, z28.b, z10.b\n" + "sub z10.b, z10.b, z12.b\n" + ".inst 0x05282f83 // tbx z3.b, z28.b, z8.b\n" + "sub z8.b, z8.b, z12.b\n" + ".inst 0x05272f82 // tbx z2.b, z28.b, z7.b\n" + "sub z7.b, z7.b, z12.b\n" + ".inst 0x05262f81 // tbx z1.b, z28.b, z6.b\n" + "sub z6.b, z6.b, z12.b\n" + ".inst 0x05252f80 // tbx z0.b, z28.b, z5.b\n" + "sub z5.b, z5.b, z12.b\n" + ".inst 0x052b2fa9 // tbx z9.b, z29.b, z11.b\n" + "sub z11.b, z11.b, z12.b\n" + ".inst 0x052a2fa4 // tbx z4.b, z29.b, z10.b\n" + "sub z10.b, z10.b, z12.b\n" + ".inst 0x05282fa3 // tbx z3.b, z29.b, z8.b\n" + "sub z8.b, z8.b, z12.b\n" + ".inst 0x05272fa2 // tbx z2.b, z29.b, z7.b\n" + "sub z7.b, z7.b, z12.b\n" + ".inst 0x05262fa1 // tbx z1.b, z29.b, z6.b\n" + "sub z6.b, z6.b, z12.b\n" + ".inst 0x05252fa0 // tbx z0.b, z29.b, z5.b\n" + "sub z5.b, z5.b, z12.b\n" + "addvl x21, x21, #-6\n" + ".inst 0x052b2fc9 // tbx z9.b, z30.b, z11.b\n" + "sub z11.b, z11.b, z12.b\n" + ".inst 0x052a2fc4 // tbx z4.b, z30.b, z10.b\n" + "sub z10.b, z10.b, z12.b\n" + ".inst 0x05282fc3 // tbx z3.b, z30.b, z8.b\n" + "sub z8.b, z8.b, z12.b\n" + ".inst 0x05272fc2 // tbx z2.b, z30.b, z7.b\n" + "sub z7.b, z7.b, z12.b\n" + ".inst 0x05262fc1 // tbx z1.b, z30.b, z6.b\n" + "sub z6.b, z6.b, z12.b\n" + ".inst 0x05252fc0 // tbx z0.b, z30.b, z5.b\n" + "sub z5.b, z5.b, z12.b\n" + "cmp x21, XZR\n" + ".inst 0x052b2fe9 // tbx z9.b, z31.b, z11.b\n" + ".inst 0x052a2fe4 // tbx z4.b, z31.b, z10.b\n" + ".inst 0x05282fe3 // tbx z3.b, z31.b, z8.b\n" + "st1b { z9.b }, p5, [x22]\n" + ".inst 0x05272fe2 // tbx z2.b, z31.b, z7.b\n" + ".inst 0x05262fe1 // tbx z1.b, z31.b, z6.b\n" + "st1b { z4.b }, p4, [x22, #1, MUL VL]\n" + ".inst 0x05252fe0 // tbx z0.b, z31.b, z5.b\n" + "st1b { z3.b }, p3, [x22, #2, MUL VL]\n" + "addvl x23, x23, #6\n" + "st1b { z2.b }, p2, [x22, #3, MUL VL]\n" + "st1b { z1.b }, p1, [x22, #4, MUL VL]\n" + "st1b { z0.b }, p0, [x22, #5, MUL VL]\n" + "addvl x22, x22, #6\n" + "bgt 3b\n" + "b 17f\n" + "5:" // 256 bits + "mov z12.b, #0x20\n" + "mov x21, %x[string_length]\n" + "ptrue p5.b\n" + "ptrue p4.b\n" + "ptrue p3.b\n" + "ptrue p2.b\n" + "ptrue p1.b\n" + "ptrue p0.b\n" + "6:" // 8 rounds: width loop + "addvl x20, x21, #-6\n" + "cmp x20, XZR\n" + "bge 7f\n" + "mov x20, #0x0\n" + "addvl x20, x20, #1\n" + "whilelt p5.b, XZR, x21\n" + "whilelt p4.b, x20, x21\n" + "addvl x20, x20, #1\n" + "whilelt p3.b, x20, x21\n" + "addvl x20, x20, #1\n" + "whilelt p2.b, x20, x21\n" + "addvl 
x20, x20, #1\n" + "whilelt p1.b, x20, x21\n" + "addvl x20, x20, #1\n" + "whilelt p0.b, x20, x21\n" + "7:" // 8 rounds: predicate OK + "ld1b { z11.b }, p5/Z, [x23]\n" + "ld1b { z10.b }, p4/Z, [x23, #1, MUL VL]\n" + "tbl z9.b, { z16.b }, z11.b\n" + "ld1b { z8.b }, p3/Z, [x23, #2, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x23, #3, MUL VL]\n" + "sub z11.b, z11.b, z12.b\n" + "ld1b { z6.b }, p1/Z, [x23, #4, MUL VL]\n" + "ld1b { z5.b }, p0/Z, [x23, #5, MUL VL]\n" + "tbl z4.b, { z16.b }, z10.b\n" + "sub z10.b, z10.b, z12.b\n" + "tbl z3.b, { z16.b }, z8.b\n" + "sub z8.b, z8.b, z12.b\n" + "tbl z2.b, { z16.b }, z7.b\n" + "sub z7.b, z7.b, z12.b\n" + "tbl z1.b, { z16.b }, z6.b\n" + "sub z6.b, z6.b, z12.b\n" + "tbl z0.b, { z16.b }, z5.b\n" + "sub z5.b, z5.b, z12.b\n" + ".inst 0x052b2e29 // tbx z9.b, z17.b, z11.b\n" + "sub z11.b, z11.b, z12.b\n" + ".inst 0x052a2e24 // tbx z4.b, z17.b, z10.b\n" + "sub z10.b, z10.b, z12.b\n" + ".inst 0x05282e23 // tbx z3.b, z17.b, z8.b\n" + "sub z8.b, z8.b, z12.b\n" + ".inst 0x05272e22 // tbx z2.b, z17.b, z7.b\n" + "sub z7.b, z7.b, z12.b\n" + ".inst 0x05262e21 // tbx z1.b, z17.b, z6.b\n" + "sub z6.b, z6.b, z12.b\n" + ".inst 0x05252e20 // tbx z0.b, z17.b, z5.b\n" + "sub z5.b, z5.b, z12.b\n" + ".inst 0x052b2e49 // tbx z9.b, z18.b, z11.b\n" + "sub z11.b, z11.b, z12.b\n" + ".inst 0x052a2e44 // tbx z4.b, z18.b, z10.b\n" + "sub z10.b, z10.b, z12.b\n" + ".inst 0x05282e43 // tbx z3.b, z18.b, z8.b\n" + "sub z8.b, z8.b, z12.b\n" + ".inst 0x05272e42 // tbx z2.b, z18.b, z7.b\n" + "sub z7.b, z7.b, z12.b\n" + ".inst 0x05262e41 // tbx z1.b, z18.b, z6.b\n" + "sub z6.b, z6.b, z12.b\n" + ".inst 0x05252e40 // tbx z0.b, z18.b, z5.b\n" + "sub z5.b, z5.b, z12.b\n" + ".inst 0x052b2e69 // tbx z9.b, z19.b, z11.b\n" + "sub z11.b, z11.b, z12.b\n" + ".inst 0x052a2e64 // tbx z4.b, z19.b, z10.b\n" + "sub z10.b, z10.b, z12.b\n" + ".inst 0x05282e63 // tbx z3.b, z19.b, z8.b\n" + "sub z8.b, z8.b, z12.b\n" + ".inst 0x05272e62 // tbx z2.b, z19.b, z7.b\n" + "sub z7.b, z7.b, z12.b\n" + ".inst 0x05262e61 // tbx z1.b, z19.b, z6.b\n" + "sub z6.b, z6.b, z12.b\n" + ".inst 0x05252e60 // tbx z0.b, z19.b, z5.b\n" + "sub z5.b, z5.b, z12.b\n" + ".inst 0x052b2e89 // tbx z9.b, z20.b, z11.b\n" + "sub z11.b, z11.b, z12.b\n" + ".inst 0x052a2e84 // tbx z4.b, z20.b, z10.b\n" + "sub z10.b, z10.b, z12.b\n" + ".inst 0x05282e83 // tbx z3.b, z20.b, z8.b\n" + "sub z8.b, z8.b, z12.b\n" + ".inst 0x05272e82 // tbx z2.b, z20.b, z7.b\n" + "sub z7.b, z7.b, z12.b\n" + ".inst 0x05262e81 // tbx z1.b, z20.b, z6.b\n" + "sub z6.b, z6.b, z12.b\n" + ".inst 0x05252e80 // tbx z0.b, z20.b, z5.b\n" + "sub z5.b, z5.b, z12.b\n" + ".inst 0x052b2ea9 // tbx z9.b, z21.b, z11.b\n" + "sub z11.b, z11.b, z12.b\n" + ".inst 0x052a2ea4 // tbx z4.b, z21.b, z10.b\n" + "sub z10.b, z10.b, z12.b\n" + ".inst 0x05282ea3 // tbx z3.b, z21.b, z8.b\n" + "sub z8.b, z8.b, z12.b\n" + ".inst 0x05272ea2 // tbx z2.b, z21.b, z7.b\n" + "sub z7.b, z7.b, z12.b\n" + ".inst 0x05262ea1 // tbx z1.b, z21.b, z6.b\n" + "sub z6.b, z6.b, z12.b\n" + ".inst 0x05252ea0 // tbx z0.b, z21.b, z5.b\n" + "sub z5.b, z5.b, z12.b\n" + "addvl x21, x21, #-6\n" + ".inst 0x052b2ec9 // tbx z9.b, z22.b, z11.b\n" + "sub z11.b, z11.b, z12.b\n" + ".inst 0x052a2ec4 // tbx z4.b, z22.b, z10.b\n" + "sub z10.b, z10.b, z12.b\n" + ".inst 0x05282ec3 // tbx z3.b, z22.b, z8.b\n" + "sub z8.b, z8.b, z12.b\n" + ".inst 0x05272ec2 // tbx z2.b, z22.b, z7.b\n" + "sub z7.b, z7.b, z12.b\n" + ".inst 0x05262ec1 // tbx z1.b, z22.b, z6.b\n" + "sub z6.b, z6.b, z12.b\n" + ".inst 0x05252ec0 // tbx z0.b, z22.b, z5.b\n" + "sub z5.b, z5.b, 
z12.b\n" + "cmp x21, XZR\n" + ".inst 0x052b2ee9 // tbx z9.b, z23.b, z11.b\n" + ".inst 0x052a2ee4 // tbx z4.b, z23.b, z10.b\n" + ".inst 0x05282ee3 // tbx z3.b, z23.b, z8.b\n" + "st1b { z9.b }, p5, [x22]\n" + ".inst 0x05272ee2 // tbx z2.b, z23.b, z7.b\n" + ".inst 0x05262ee1 // tbx z1.b, z23.b, z6.b\n" + "st1b { z4.b }, p4, [x22, #1, MUL VL]\n" + ".inst 0x05252ee0 // tbx z0.b, z23.b, z5.b\n" + "st1b { z3.b }, p3, [x22, #2, MUL VL]\n" + "addvl x23, x23, #6\n" + "st1b { z2.b }, p2, [x22, #3, MUL VL]\n" + "st1b { z1.b }, p1, [x22, #4, MUL VL]\n" + "st1b { z0.b }, p0, [x22, #5, MUL VL]\n" + "addvl x22, x22, #6\n" + "bgt 6b\n" + "b 17f\n" + "8:" // 512 bits + "mov z12.b, #0x40\n" + "mov x21, %x[string_length]\n" + "ptrue p5.b\n" + "ptrue p4.b\n" + "ptrue p3.b\n" + "ptrue p2.b\n" + "ptrue p1.b\n" + "ptrue p0.b\n" + "9:" // 4 rounds: width loop + "addvl x20, x21, #-6\n" + "cmp x20, XZR\n" + "bge 10f\n" + "mov x20, #0x0\n" + "addvl x20, x20, #1\n" + "whilelt p5.b, XZR, x21\n" + "whilelt p4.b, x20, x21\n" + "addvl x20, x20, #1\n" + "whilelt p3.b, x20, x21\n" + "addvl x20, x20, #1\n" + "whilelt p2.b, x20, x21\n" + "addvl x20, x20, #1\n" + "whilelt p1.b, x20, x21\n" + "addvl x20, x20, #1\n" + "whilelt p0.b, x20, x21\n" + "10:" // 4 rounds: predicate OK + "ld1b { z11.b }, p5/Z, [x23]\n" + "ld1b { z10.b }, p4/Z, [x23, #1, MUL VL]\n" + "tbl z9.b, { z16.b }, z11.b\n" + "ld1b { z8.b }, p3/Z, [x23, #2, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x23, #3, MUL VL]\n" + "sub z11.b, z11.b, z12.b\n" + "ld1b { z6.b }, p1/Z, [x23, #4, MUL VL]\n" + "ld1b { z5.b }, p0/Z, [x23, #5, MUL VL]\n" + "tbl z4.b, { z16.b }, z10.b\n" + "sub z10.b, z10.b, z12.b\n" + "tbl z3.b, { z16.b }, z8.b\n" + "sub z8.b, z8.b, z12.b\n" + "tbl z2.b, { z16.b }, z7.b\n" + "sub z7.b, z7.b, z12.b\n" + "tbl z1.b, { z16.b }, z6.b\n" + "sub z6.b, z6.b, z12.b\n" + "tbl z0.b, { z16.b }, z5.b\n" + "sub z5.b, z5.b, z12.b\n" + ".inst 0x052b2e29 // tbx z9.b, z17.b, z11.b\n" + "sub z11.b, z11.b, z12.b\n" + ".inst 0x052a2e24 // tbx z4.b, z17.b, z10.b\n" + "sub z10.b, z10.b, z12.b\n" + ".inst 0x05282e23 // tbx z3.b, z17.b, z8.b\n" + "sub z8.b, z8.b, z12.b\n" + ".inst 0x05272e22 // tbx z2.b, z17.b, z7.b\n" + "sub z7.b, z7.b, z12.b\n" + ".inst 0x05262e21 // tbx z1.b, z17.b, z6.b\n" + "sub z6.b, z6.b, z12.b\n" + ".inst 0x05252e20 // tbx z0.b, z17.b, z5.b\n" + "sub z5.b, z5.b, z12.b\n" + "addvl x21, x21, #-6\n" + ".inst 0x052b2e49 // tbx z9.b, z18.b, z11.b\n" + "sub z11.b, z11.b, z12.b\n" + ".inst 0x052a2e44 // tbx z4.b, z18.b, z10.b\n" + "sub z10.b, z10.b, z12.b\n" + ".inst 0x05282e43 // tbx z3.b, z18.b, z8.b\n" + "sub z8.b, z8.b, z12.b\n" + ".inst 0x05272e42 // tbx z2.b, z18.b, z7.b\n" + "sub z7.b, z7.b, z12.b\n" + ".inst 0x05262e41 // tbx z1.b, z18.b, z6.b\n" + "sub z6.b, z6.b, z12.b\n" + ".inst 0x05252e40 // tbx z0.b, z18.b, z5.b\n" + "sub z5.b, z5.b, z12.b\n" + "cmp x21, XZR\n" + ".inst 0x052b2e69 // tbx z9.b, z19.b, z11.b\n" + ".inst 0x052a2e64 // tbx z4.b, z19.b, z10.b\n" + ".inst 0x05282e63 // tbx z3.b, z19.b, z8.b\n" + "st1b { z9.b }, p5, [x22]\n" + ".inst 0x05272e62 // tbx z2.b, z19.b, z7.b\n" + ".inst 0x05262e61 // tbx z1.b, z19.b, z6.b\n" + "st1b { z4.b }, p4, [x22, #1, MUL VL]\n" + ".inst 0x05252e60 // tbx z0.b, z19.b, z5.b\n" + "st1b { z3.b }, p3, [x22, #2, MUL VL]\n" + "addvl x23, x23, #6\n" + "st1b { z2.b }, p2, [x22, #3, MUL VL]\n" + "st1b { z1.b }, p1, [x22, #4, MUL VL]\n" + "st1b { z0.b }, p0, [x22, #5, MUL VL]\n" + "addvl x22, x22, #6\n" + "bgt 9b\n" + "b 17f\n" + "11:" // 1024 bits + "mov z12.b, #0x80\n" + "mov x21, %x[string_length]\n" + "ptrue 
p5.b\n" + "ptrue p4.b\n" + "ptrue p3.b\n" + "ptrue p2.b\n" + "ptrue p1.b\n" + "ptrue p0.b\n" + "12:" // 2 rounds: width loop + "addvl x20, x21, #-6\n" + "cmp x20, XZR\n" + "bge 13f\n" + "mov x20, #0x0\n" + "addvl x20, x20, #1\n" + "whilelt p5.b, XZR, x21\n" + "whilelt p4.b, x20, x21\n" + "addvl x20, x20, #1\n" + "whilelt p3.b, x20, x21\n" + "addvl x20, x20, #1\n" + "whilelt p2.b, x20, x21\n" + "addvl x20, x20, #1\n" + "whilelt p1.b, x20, x21\n" + "addvl x20, x20, #1\n" + "whilelt p0.b, x20, x21\n" + "13:" // 2 rounds: predicate OK + "ld1b { z11.b }, p5/Z, [x23]\n" + "ld1b { z10.b }, p4/Z, [x23, #1, MUL VL]\n" + "addvl x21, x21, #-6\n" + "ld1b { z8.b }, p3/Z, [x23, #2, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x23, #3, MUL VL]\n" + "tbl z9.b, { z16.b }, z11.b\n" + "ld1b { z6.b }, p1/Z, [x23, #4, MUL VL]\n" + "ld1b { z5.b }, p0/Z, [x23, #5, MUL VL]\n" + "sub z11.b, z11.b, z12.b\n" + "tbl z4.b, { z16.b }, z10.b\n" + "sub z10.b, z10.b, z12.b\n" + "tbl z3.b, { z16.b }, z8.b\n" + "sub z8.b, z8.b, z12.b\n" + "tbl z2.b, { z16.b }, z7.b\n" + "sub z7.b, z7.b, z12.b\n" + "tbl z1.b, { z16.b }, z6.b\n" + "sub z6.b, z6.b, z12.b\n" + "tbl z0.b, { z16.b }, z5.b\n" + "sub z5.b, z5.b, z12.b\n" + "cmp x21, XZR\n" + ".inst 0x052b2e29 // tbx z9.b, z17.b, z11.b\n" + ".inst 0x052a2e24 // tbx z4.b, z17.b, z10.b\n" + ".inst 0x05282e23 // tbx z3.b, z17.b, z8.b\n" + "st1b { z9.b }, p5, [x22]\n" + ".inst 0x05272e22 // tbx z2.b, z17.b, z7.b\n" + ".inst 0x05262e21 // tbx z1.b, z17.b, z6.b\n" + "st1b { z4.b }, p4, [x22, #1, MUL VL]\n" + ".inst 0x05252e20 // tbx z0.b, z17.b, z5.b\n" + "st1b { z3.b }, p3, [x22, #2, MUL VL]\n" + "addvl x23, x23, #6\n" + "st1b { z2.b }, p2, [x22, #3, MUL VL]\n" + "st1b { z1.b }, p1, [x22, #4, MUL VL]\n" + "st1b { z0.b }, p0, [x22, #5, MUL VL]\n" + "addvl x22, x22, #6\n" + "bgt 12b\n" + "b 17f\n" + "14:" // 2048 bits + "mov x21, %x[string_length]\n" + "ptrue p5.b\n" + "ptrue p4.b\n" + "ptrue p3.b\n" + "ptrue p2.b\n" + "ptrue p1.b\n" + "ptrue p0.b\n" + "15:" // 1 rounds: width loop + "addvl x20, x21, #-6\n" + "cmp x20, XZR\n" + "bge 16f\n" + "mov x20, #0x0\n" + "addvl x20, x20, #1\n" + "whilelt p5.b, XZR, x21\n" + "whilelt p4.b, x20, x21\n" + "addvl x20, x20, #1\n" + "whilelt p3.b, x20, x21\n" + "addvl x20, x20, #1\n" + "whilelt p2.b, x20, x21\n" + "addvl x20, x20, #1\n" + "whilelt p1.b, x20, x21\n" + "addvl x20, x20, #1\n" + "whilelt p0.b, x20, x21\n" + "16:" // 1 rounds: predicate OK + "addvl x21, x21, #-6\n" + "ld1b { z11.b }, p5/Z, [x23]\n" + "ld1b { z10.b }, p4/Z, [x23, #1, MUL VL]\n" + "ld1b { z8.b }, p3/Z, [x23, #2, MUL VL]\n" + "ld1b { z7.b }, p2/Z, [x23, #3, MUL VL]\n" + "cmp x21, XZR\n" + "ld1b { z6.b }, p1/Z, [x23, #4, MUL VL]\n" + "ld1b { z5.b }, p0/Z, [x23, #5, MUL VL]\n" + "tbl z9.b, { z16.b }, z11.b\n" + "tbl z4.b, { z16.b }, z10.b\n" + "tbl z3.b, { z16.b }, z8.b\n" + "st1b { z9.b }, p5, [x22]\n" + "tbl z2.b, { z16.b }, z7.b\n" + "tbl z1.b, { z16.b }, z6.b\n" + "st1b { z4.b }, p4, [x22, #1, MUL VL]\n" + "tbl z0.b, { z16.b }, z5.b\n" + "st1b { z3.b }, p3, [x22, #2, MUL VL]\n" + "addvl x23, x23, #6\n" + "st1b { z2.b }, p2, [x22, #3, MUL VL]\n" + "st1b { z1.b }, p1, [x22, #4, MUL VL]\n" + "st1b { z0.b }, p0, [x22, #5, MUL VL]\n" + "addvl x22, x22, #6\n" + "bgt 15b\n" + "17:" // SVE body done + "add x24, x24, #0x1\n" + "cmp x24, %x[num_strings]\n" + "bne 2b\n" + : [table] "+&r"(table) + : [input] "r"(input), [num_strings] "r"(num_strings), [output] "r"(output), [string_length] "r"(string_length) + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x20", "x21", "x22", "x23", "x24", 
"x25", "z0", "z1", + "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z16", "z17", "z18", "z19", "z20", "z21", + "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"); +} + +} // namespace cpu +} // namespace arm_compute + +#endif // ARM_COMPUTE_ENABLE_SVE +#endif // __aarch64__ diff --git a/src/cpu/kernels/lut/list.h b/src/cpu/kernels/lut/list.h new file mode 100644 index 0000000000..9acfe97728 --- /dev/null +++ b/src/cpu/kernels/lut/list.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2023-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef ACL_SRC_CPU_KERNELS_LUT_LIST_H +#define ACL_SRC_CPU_KERNELS_LUT_LIST_H + +#include <cstddef> +#include <cstdint> + +namespace arm_compute +{ +namespace cpu +{ + +#ifdef __aarch64__ +#define DECLARE_LUT_U8_KERNEL(func_name) \ + void func_name(const uint8_t *table, size_t num_strings, size_t string_length, const uint8_t *const *input, \ + uint8_t *const *output) + +DECLARE_LUT_U8_KERNEL(lut_u8_neon); +DECLARE_LUT_U8_KERNEL(lut_u8_sve2); + +#undef DECLARE_LUT_U8_KERNEL + +#define DECLARE_LUT_U16_KERNEL(func_name) \ + void func_name(const uint16_t *table, size_t num_strings, size_t string_length, const uint16_t *input, \ + uint16_t *output) + +DECLARE_LUT_U16_KERNEL(lut_u16_neon); +DECLARE_LUT_U16_KERNEL(lut_u16_sve); + +#undef DECLARE_LUT_U16_KERNEL + +#endif // __aarch64__ + +} // namespace cpu +} // namespace arm_compute + +#endif // ACL_SRC_CPU_KERNELS_LUT_LIST_H diff --git a/src/cpu/kernels/maxunpool/generic/neon/fp16.cpp b/src/cpu/kernels/maxunpool/generic/neon/fp16.cpp new file mode 100644 index 0000000000..e81ff92311 --- /dev/null +++ b/src/cpu/kernels/maxunpool/generic/neon/fp16.cpp @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) +#include "src/cpu/kernels/maxunpool/generic/neon/impl.h" +namespace arm_compute +{ +namespace cpu +{ +void neon_fp16_maxunpooling(const ITensor *input, const ITensor *indices, ITensor *output, const Window &window) +{ + return max_unpooling<float16_t>(input, indices, output, window); +} +} // namespace cpu +} // namespace arm_compute +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/maxunpool/generic/neon/fp32.cpp b/src/cpu/kernels/maxunpool/generic/neon/fp32.cpp new file mode 100644 index 0000000000..ba0d7851a9 --- /dev/null +++ b/src/cpu/kernels/maxunpool/generic/neon/fp32.cpp @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/maxunpool/generic/neon/impl.h" +namespace arm_compute +{ +namespace cpu +{ +void neon_fp32_maxunpooling(const ITensor *input, const ITensor *indices, ITensor *output, const Window &window) +{ + return max_unpooling<float>(input, indices, output, window); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/maxunpool/generic/neon/impl.h b/src/cpu/kernels/maxunpool/generic/neon/impl.h new file mode 100644 index 0000000000..73a5b86a2f --- /dev/null +++ b/src/cpu/kernels/maxunpool/generic/neon/impl.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_CPU_KERNELS_MAXUNPOOL_GENERIC_NEON_IMPL_H +#define ACL_SRC_CPU_KERNELS_MAXUNPOOL_GENERIC_NEON_IMPL_H +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Window.h" + +#include "src/core/NEON/wrapper/wrapper.h" +namespace arm_compute +{ +namespace cpu +{ +template <typename T> +void max_unpooling(const ITensor *input, const ITensor *indices, ITensor *output, const Window &window) +{ + Iterator input_itr(input, window); + Iterator indices_itr(indices, window); + auto out_ptr = reinterpret_cast<T *>(output->buffer()); + const int out_stride_w = static_cast<int>(output->info()->strides_in_bytes()[3]); + execute_window_loop( + window, + [&](const Coordinates &id) + { + auto vindices = reinterpret_cast<uint32_t *>(indices_itr.ptr()); + auto vinput = reinterpret_cast<T *>(input_itr.ptr()); + out_ptr[id[3] * out_stride_w / sizeof(T) + *vindices] = *vinput; + }, + input_itr, indices_itr); +} +} // namespace cpu +} // namespace arm_compute +#endif // ACL_SRC_CPU_KERNELS_MAXUNPOOL_GENERIC_NEON_IMPL_H diff --git a/src/cpu/kernels/maxunpool/generic/neon/qasymm8.cpp b/src/cpu/kernels/maxunpool/generic/neon/qasymm8.cpp new file mode 100644 index 0000000000..53e601bba6 --- /dev/null +++ b/src/cpu/kernels/maxunpool/generic/neon/qasymm8.cpp @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/maxunpool/generic/neon/impl.h" +namespace arm_compute +{ +namespace cpu +{ +void neon_qs8_maxunpooling(const ITensor *input, const ITensor *indices, ITensor *output, const Window &window) +{ + return max_unpooling<int8_t>(input, indices, output, window); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/maxunpool/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/maxunpool/generic/neon/qasymm8_signed.cpp new file mode 100644 index 0000000000..a3c346fba7 --- /dev/null +++ b/src/cpu/kernels/maxunpool/generic/neon/qasymm8_signed.cpp @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/maxunpool/generic/neon/impl.h" +namespace arm_compute +{ +namespace cpu +{ +void neon_qu8_maxunpooling(const ITensor *input, const ITensor *indices, ITensor *output, const Window &window) +{ + return max_unpooling<uint8_t>(input, indices, output, window); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/maxunpool/list.h b/src/cpu/kernels/maxunpool/list.h new file mode 100644 index 0000000000..2c4fe940d9 --- /dev/null +++ b/src/cpu/kernels/maxunpool/list.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CORE_NEON_KERNELS_MAXUNPOOL_LIST_H +#define SRC_CORE_NEON_KERNELS_MAXUNPOOL_LIST_H +namespace arm_compute +{ +namespace cpu +{ +#define DECLARE_MAXUNPOOL_KERNEL(func_name) \ + void func_name(const ITensor *input, const ITensor *indices, ITensor *output, const Window &window) +DECLARE_MAXUNPOOL_KERNEL(neon_fp32_maxunpooling); +DECLARE_MAXUNPOOL_KERNEL(neon_fp16_maxunpooling); +DECLARE_MAXUNPOOL_KERNEL(neon_qs8_maxunpooling); +DECLARE_MAXUNPOOL_KERNEL(neon_qu8_maxunpooling); +#undef DECLARE_MAXUNPOOL_KERNEL +} // namespace cpu +} // namespace arm_compute +#endif //SRC_CORE_NEON_KERNELS_MAXUNPOOL_LIST_H diff --git a/src/cpu/kernels/meanstddevnorm/generic/neon/fp16.cpp b/src/cpu/kernels/meanstddevnorm/generic/neon/fp16.cpp new file mode 100644 index 0000000000..344b9df0c8 --- /dev/null +++ b/src/cpu/kernels/meanstddevnorm/generic/neon/fp16.cpp @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2022-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
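The max_unpooling template above scatters each pooled value back to the position recorded in the indices tensor, batch by batch. A hedged scalar sketch of the per-element contract (the flat-buffer view and helper name are assumptions for illustration; positions not referenced by any index are left unmodified here):

#include <cstddef>
#include <cstdint>

// Scalar view of max unpooling: every pooled input value is written to the
// flat output position that was recorded for it during max pooling.
template <typename T>
void max_unpool_scalar_reference(const T *input, const uint32_t *indices, T *output, size_t num_elements)
{
    for (size_t i = 0; i < num_elements; ++i)
    {
        output[indices[i]] = input[i];
    }
}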
+ */ +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/CpuTypes.h" +#include "src/cpu/kernels/meanstddevnorm/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +template <> +void mean_stddev_normalization<float16_t, 8>(ITensor *input, ITensor *output, float epsilon, const Window &window) +{ + // Set build options + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const int window_step_x = 8; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + Iterator input_itr(input, win); + Iterator output_itr(output, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + int x = window_start_x; + auto in_ptr = reinterpret_cast<const float16_t *>(input_itr.ptr()); + auto out_ptr = reinterpret_cast<float16_t *>(output_itr.ptr()); + + float16x8_t sum_vec = vdupq_n_f16(static_cast<float16_t>(0.0f)); + float32x4_t sum_sq_vec = vdupq_n_f32(0.0f); + + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + float16x8_t data = vld1q_f16(in_ptr + x); + sum_vec = vaddq_f16(sum_vec, data); + float32x4_t dl = vcvt_f32_f16(vget_low_f16(data)); + float32x4_t dh = vcvt_f32_f16(vget_high_f16(data)); + sum_sq_vec = vaddq_f32(sum_sq_vec, vmulq_f32(dl, dl)); + sum_sq_vec = vaddq_f32(sum_sq_vec, vmulq_f32(dh, dh)); + } + + float32x4_t sum_carry_res = + vpaddq_f32(vcvt_f32_f16(vget_high_f16(sum_vec)), vcvt_f32_f16(vget_low_f16(sum_vec))); + float sum = vaddvq_f32(sum_carry_res); + float sum_sq = vaddvq_f32(sum_sq_vec); + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const float fdata = static_cast<float>(*(in_ptr + x)); + sum += fdata; + sum_sq += fdata * fdata; + } + + float16_t mean = static_cast<float16_t>(sum / input->info()->dimension(0)); + float var = (sum_sq / input->info()->dimension(0)) - (mean * mean); + float16_t stddev_inv = static_cast<float16_t>(1.f / sqrt(var + epsilon)); + + float16x8_t mean_vec = vdupq_n_f16(mean); + float16x8_t stddev_inv_vec = vdupq_n_f16(stddev_inv); + + for (x = window_start_x; x <= (window_end_x - window_step_x); x += window_step_x) + { + float16x8_t data = vld1q_f16(in_ptr + x); + float16x8_t res = vmulq_f16(vsubq_f16(data, mean_vec), stddev_inv_vec); + // Store results + vst1q_f16(out_ptr + x, res); + } + for (; x < window_end_x; ++x) + { + *(out_ptr + x) = (*(in_ptr + x) - mean) * stddev_inv; + } + }, + input_itr, output_itr); +} + +void neon_fp16_meanstddevnorm(ITensor *input, ITensor *output, float epsilon, const Window &window) +{ + return mean_stddev_normalization<float16_t, 8>(input, output, epsilon, window); +} +} // namespace cpu +} // namespace arm_compute +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/meanstddevnorm/generic/neon/fp32.cpp b/src/cpu/kernels/meanstddevnorm/generic/neon/fp32.cpp new file mode 100644 index 0000000000..4bff26b036 --- /dev/null +++ b/src/cpu/kernels/meanstddevnorm/generic/neon/fp32.cpp @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2022 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/meanstddevnorm/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void neon_fp32_meanstddevnorm(ITensor *input, ITensor *output, float epsilon, const Window &window) +{ + return mean_stddev_normalization<float, 4>(input, output, epsilon, window); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/meanstddevnorm/generic/neon/impl.cpp b/src/cpu/kernels/meanstddevnorm/generic/neon/impl.cpp new file mode 100644 index 0000000000..11f6294a35 --- /dev/null +++ b/src/cpu/kernels/meanstddevnorm/generic/neon/impl.cpp @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2019-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
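Both the FP16 and FP32 kernels compute the same per-row statistics: a sum and a sum of squares over dimension 0, from which the mean, the variance and 1/sqrt(var + epsilon) are derived before the row is rescaled. A minimal scalar sketch of that arithmetic (illustrative helper, single row of length n):

#include <cmath>
#include <cstddef>

// Per-row mean/stddev normalisation: out[i] = (in[i] - mean) / sqrt(var + eps),
// with var computed as E[x^2] - (E[x])^2 over the row.
static void mean_stddev_normalize_row(const float *in, float *out, size_t n, float epsilon)
{
    float sum = 0.f, sum_sq = 0.f;
    for (size_t i = 0; i < n; ++i)
    {
        sum    += in[i];
        sum_sq += in[i] * in[i];
    }
    const float mean       = sum / n;
    const float var        = sum_sq / n - mean * mean;
    const float stddev_inv = 1.f / std::sqrt(var + epsilon);
    for (size_t i = 0; i < n; ++i)
    {
        out[i] = (in[i] - mean) * stddev_inv;
    }
}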
+ */ + +#include "src/cpu/kernels/meanstddevnorm/generic/neon/impl.h" + +#include "src/core/NEON/wrapper/wrapper.h" + +namespace arm_compute +{ +namespace cpu +{ +template <typename ScalarType, int size> +void mean_stddev_normalization(ITensor *input, ITensor *output, float epsilon, const Window &window) +{ + using ExactTagType = typename wrapper::traits::neon_vector<ScalarType, size>::tag_type; + + // Set build options + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const int window_step_x = size; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + Iterator input_itr(input, win); + Iterator output_itr(output, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + int x = window_start_x; + auto in_ptr = reinterpret_cast<const ScalarType *>(input_itr.ptr()); + auto out_ptr = reinterpret_cast<ScalarType *>(output_itr.ptr()); + + auto sum_vec = wrapper::vdup_n(static_cast<ScalarType>(0.f), ExactTagType{}); + auto sum_sq_vec = wrapper::vdup_n(static_cast<ScalarType>(0.f), ExactTagType{}); + + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + auto data = wrapper::vloadq(in_ptr + x); + sum_vec = wrapper::vadd(sum_vec, data); + sum_sq_vec = wrapper::vadd(sum_sq_vec, wrapper::vmul(data, data)); + } + + auto sum_carry_res = wrapper::vpadd(wrapper::vgethigh(sum_vec), wrapper::vgetlow(sum_vec)); + auto sum_sq_carry_res = wrapper::vpadd(wrapper::vgethigh(sum_sq_vec), wrapper::vgetlow(sum_sq_vec)); + for (int i = 0; i < size / 4; ++i) + { + sum_carry_res = wrapper::vpadd(sum_carry_res, sum_carry_res); + sum_sq_carry_res = wrapper::vpadd(sum_sq_carry_res, sum_sq_carry_res); + } + + auto sum = wrapper::vgetlane(sum_carry_res, 0); + auto sum_sq = wrapper::vgetlane(sum_sq_carry_res, 0); + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + ScalarType data = *(in_ptr + x); + sum += data; + sum_sq += data * data; + } + + ScalarType mean = sum / input->info()->dimension(0); + ScalarType var = (sum_sq / input->info()->dimension(0)) - (mean * mean); + ScalarType stddev_inv = 1.f / sqrt(var + epsilon); + + auto mean_vec = wrapper::vdup_n(mean, ExactTagType{}); + auto stddev_inv_vec = wrapper::vdup_n(stddev_inv, ExactTagType{}); + for (x = window_start_x; x <= (window_end_x - window_step_x); x += window_step_x) + { + auto data = wrapper::vloadq(in_ptr + x); + auto res = wrapper::vmul(wrapper::vsub(data, mean_vec), stddev_inv_vec); + // Store results + wrapper::vstore(out_ptr + x, res); + } + for (; x < window_end_x; ++x) + { + *(out_ptr + x) = (*(in_ptr + x) - mean) * stddev_inv; + } + }, + input_itr, output_itr); +} +template void mean_stddev_normalization<float, 4>(ITensor *input, ITensor *output, float epsilon, const Window &window); +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/meanstddevnorm/generic/neon/impl.h b/src/cpu/kernels/meanstddevnorm/generic/neon/impl.h new file mode 100644 index 0000000000..6466506f06 --- /dev/null +++ b/src/cpu/kernels/meanstddevnorm/generic/neon/impl.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2022 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CORE_SVE_KERNELS_MEANSTDDEVNORM_IMPL_H +#define SRC_CORE_SVE_KERNELS_MEANSTDDEVNORM_IMPL_H +#include "arm_compute/core/Helpers.h" + +namespace arm_compute +{ +namespace cpu +{ +template <typename ScalarType, int size> +void mean_stddev_normalization(ITensor *_input, ITensor *_output, float _epsilon, const Window &window); + +} // namespace cpu +} // namespace arm_compute + +#endif //define SRC_CORE_SVE_KERNELS_MEANSTDDEVNORM_IMPL_H diff --git a/src/cpu/kernels/meanstddevnorm/generic/neon/qasymm8.cpp b/src/cpu/kernels/meanstddevnorm/generic/neon/qasymm8.cpp new file mode 100644 index 0000000000..32654df5dc --- /dev/null +++ b/src/cpu/kernels/meanstddevnorm/generic/neon/qasymm8.cpp @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Window.h" + +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" + +#include <arm_neon.h> +namespace +{ +inline float32x4_t clamp_v4f32(float32x4_t block, float32x4_t quant_min_vec, float32x4_t quant_max_vec) +{ + return vminq_f32(vmaxq_f32(block, quant_min_vec), quant_max_vec); +} +inline uint16x8_t fuse_words_f32(float32x4_t fb1, float32x4_t fb2) +{ + return vcombine_u16(vmovn_u32(vcvtq_u32_f32(fb1)), vmovn_u32(vcvtq_u32_f32(fb2))); +} +inline uint8x16_t fuse_shorts_u16(uint16x8_t sb1, uint16x8_t sb2) +{ + return vcombine_u8(vmovn_u16(sb1), vmovn_u16(sb2)); +} +} // namespace + +namespace arm_compute +{ +namespace cpu +{ +void neon_qasymm8_meanstddevnorm(ITensor *input, ITensor *output, float epsilon, const Window &window) +{ + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const int window_step_x = 16; + const int window_start_x = static_cast<int>(window.x().start()); + const int window_end_x = static_cast<int>(window.x().end()); + + const UniformQuantizationInfo qi_out = output->info()->quantization_info().uniform(); + const float output_scale = qi_out.scale; + const int output_offset = qi_out.offset; + + Iterator input_itr(input, win); + Iterator output_itr(output, win); + + const float output_inv_scale = 1.0f / output_scale; + const float32x4_t quant_max_vec = vdupq_n_f32(255.0f); + const float32x4_t quant_min_vec = vdupq_n_f32(0.0f); + + execute_window_loop( + win, + [&](const Coordinates &) + { + int x = window_start_x; + auto in_ptr = reinterpret_cast<const uint8_t *>(input_itr.ptr()); + auto out_ptr = reinterpret_cast<uint8_t *>(output_itr.ptr()); + + uint32x4_t sum_vec = vdupq_n_u32(0); + uint32x4_t sum_sq_vec = vdupq_n_u32(0); + + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t data = vld1q_u8(in_ptr + x); + sum_vec = vaddq_u32(sum_vec, vpaddlq_u16(vpaddlq_u8(data))); + const uint16x8_t squares_low = vmull_u8(vget_low_u8(data), vget_low_u8(data)); + const uint16x8_t squares_high = vmull_u8(vget_high_u8(data), vget_high_u8(data)); + sum_sq_vec = vaddq_u32(sum_sq_vec, vaddq_u32(vpaddlq_u16(squares_low), vpaddlq_u16(squares_high))); + } + +#ifdef __aarch64__ + sum_vec = vpaddq_u32(sum_vec, sum_vec); + sum_vec = vpaddq_u32(sum_vec, sum_vec); + uint32_t sum = vgetq_lane_u32(sum_vec, 0); + sum_sq_vec = vpaddq_u32(sum_sq_vec, sum_sq_vec); + sum_sq_vec = vpaddq_u32(sum_sq_vec, sum_sq_vec); + uint32_t sum_sq = vgetq_lane_u32(sum_sq_vec, 0); +#elif __arm__ // #ifdef __aarch64__ + uint32_t sum = vgetq_lane_u32(sum_vec, 0) + vgetq_lane_u32(sum_vec, 1) + vgetq_lane_u32(sum_vec, 2) + + vgetq_lane_u32(sum_vec, 3); + + uint32_t sum_sq = vgetq_lane_u32(sum_sq_vec, 0) + vgetq_lane_u32(sum_sq_vec, 1) + + vgetq_lane_u32(sum_sq_vec, 2) + vgetq_lane_u32(sum_sq_vec, 3); +#endif // #ifdef __aarch64__ + for (; x < window_end_x; ++x) + { + auto data = static_cast<uint32_t>(*(in_ptr + x)); + sum += data; + sum_sq += (data * data); + } + + const float mean = (static_cast<float>(sum) / static_cast<float>(input->info()->dimension(0))); + const float var = + (static_cast<float>(sum_sq) / static_cast<float>(input->info()->dimension(0))) - (mean * mean); + const float stdev_inv = 1.0f / sqrtf(var + epsilon); + const float32x4_t v_scale = vdupq_n_f32(stdev_inv * output_inv_scale); + const float32x4_t v_offset = vdupq_n_f32(-mean * stdev_inv * output_inv_scale + output_offset); + for (x = window_start_x; x <= 
(window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t data = vld1q_u8(in_ptr + x); + float32x4_t db1 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(data))))); + float32x4_t db2 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(data))))); + float32x4_t db3 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(data))))); + float32x4_t db4 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(data))))); + db1 = clamp_v4f32(vaddq_f32(vmulq_f32(db1, v_scale), v_offset), quant_min_vec, quant_max_vec); + db2 = clamp_v4f32(vaddq_f32(vmulq_f32(db2, v_scale), v_offset), quant_min_vec, quant_max_vec); + db3 = clamp_v4f32(vaddq_f32(vmulq_f32(db3, v_scale), v_offset), quant_min_vec, quant_max_vec); + db4 = clamp_v4f32(vaddq_f32(vmulq_f32(db4, v_scale), v_offset), quant_min_vec, quant_max_vec); + const uint8x16_t out = fuse_shorts_u16(fuse_words_f32(db1, db2), fuse_words_f32(db3, db4)); + vst1q_u8(out_ptr + x, out); + } + + for (; x < window_end_x; ++x) + { + auto data = static_cast<float32_t>(*(in_ptr + x)); + const uint8_t res = + data * (stdev_inv * output_inv_scale) + (-mean * stdev_inv * output_inv_scale + output_offset); + *(out_ptr + x) = res; + } + }, + input_itr, output_itr); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/meanstddevnorm/list.h b/src/cpu/kernels/meanstddevnorm/list.h new file mode 100644 index 0000000000..6277d65884 --- /dev/null +++ b/src/cpu/kernels/meanstddevnorm/list.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
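The QASYMM8 variant above takes its statistics directly on the raw uint8 codes (the input quantisation cancels out of (x - mean)/stddev) and folds the output requantisation into the final scale and offset before clamping to [0, 255]. A scalar sketch of the same computation, with an illustrative helper name and truncating conversion as in the kernel:

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>

// Quantised mean/stddev normalisation: normalise the raw codes, then
// requantise with the output scale/offset and clamp to the uint8 range.
static void mean_stddev_normalize_qasymm8(
    const uint8_t *in, uint8_t *out, size_t n, float epsilon, float out_scale, int out_offset)
{
    float sum = 0.f, sum_sq = 0.f;
    for (size_t i = 0; i < n; ++i)
    {
        const float v = static_cast<float>(in[i]);
        sum    += v;
        sum_sq += v * v;
    }
    const float mean       = sum / n;
    const float var        = sum_sq / n - mean * mean;
    const float stddev_inv = 1.f / std::sqrt(var + epsilon);
    for (size_t i = 0; i < n; ++i)
    {
        const float normalised = (static_cast<float>(in[i]) - mean) * stddev_inv;
        const float requant    = normalised / out_scale + static_cast<float>(out_offset);
        out[i] = static_cast<uint8_t>(std::min(std::max(requant, 0.f), 255.f));
    }
}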
+ */ +#ifndef SRC_CORE_NEON_KERNELS_MEANSTDDEVNORM_LIST_H +#define SRC_CORE_NEON_KERNELS_MEANSTDDEVNORM_LIST_H +namespace arm_compute +{ +namespace cpu +{ +#define DECLARE_MEANSTDDEVNORM_KERNEL(func_name) \ + void func_name(ITensor *input, ITensor *output, float epsilon, const Window &window) + +DECLARE_MEANSTDDEVNORM_KERNEL(neon_fp32_meanstddevnorm); +DECLARE_MEANSTDDEVNORM_KERNEL(neon_fp16_meanstddevnorm); +DECLARE_MEANSTDDEVNORM_KERNEL(neon_qasymm8_meanstddevnorm); + +#undef DECLARE_MEANSTDDEVNORM_KERNEL +} // namespace cpu +} // namespace arm_compute +#endif //SRC_CORE_NEON_KERNELS_MEANSTDDEVNORM_LIST_H diff --git a/src/cpu/kernels/mul/generic/neon/fp16.cpp b/src/cpu/kernels/mul/generic/neon/fp16.cpp new file mode 100644 index 0000000000..920f298527 --- /dev/null +++ b/src/cpu/kernels/mul/generic/neon/fp16.cpp @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" + +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/CpuTypes.h" + +namespace arm_compute +{ +namespace cpu +{ +void mul_F16_F16_F16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale) +{ + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + constexpr int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x(); + if (is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? 
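+ // Broadcast path: the input whose X window step collapsed to zero supplies a single
+ // value per outer-window iteration; that value is duplicated across a vector and
+ // multiplied element-wise with the other input, then scaled by 'scale'.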
src2 : src1; + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator dst(out, win); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast<const float16_t *>(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast<float16_t *>(dst.ptr()); + const auto broadcast_value = *reinterpret_cast<const float16_t *>(broadcast_input.ptr()); + const float16x8x2_t broadcast_value_vec = {{ + vdupq_n_f16(broadcast_value), + vdupq_n_f16(broadcast_value), + }}; + const auto scale_vec = vdupq_n_f16(scale); + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const float16x8x2_t non_broadcast_v = {{ + vld1q_f16(non_broadcast_input_ptr + x), + vld1q_f16(non_broadcast_input_ptr + x + 8), + }}; + const float16x8x2_t result = {{ + vmulq_f16(vmulq_f16(broadcast_value_vec.val[0], non_broadcast_v.val[0]), scale_vec), + vmulq_f16(vmulq_f16(broadcast_value_vec.val[1], non_broadcast_v.val[1]), scale_vec), + }}; + vst1q_f16(output_ptr + x, result.val[0]); + vst1q_f16(output_ptr + x + 8, result.val[1]); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto non_broadcast_v = *(non_broadcast_input_ptr + x); + *(output_ptr + x) = broadcast_value * non_broadcast_v * scale; + } + }, + broadcast_input, non_broadcast_input, dst); + } + else + { + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + Iterator input1(src1, input1_win); + Iterator input2(src2, input2_win); + Iterator dst(out, win); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast<const float16_t *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const float16_t *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<float16_t *>(dst.ptr()); + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const float16x8x2_t ta1 = {{ + vld1q_f16(input1_ptr + x), + vld1q_f16(input1_ptr + x + 8), + }}; + const float16x8x2_t ta2 = {{ + vld1q_f16(input2_ptr + x), + vld1q_f16(input2_ptr + x + 8), + }}; + const float16x8_t scale_vec = vdupq_n_f16(scale); + const float16x8x2_t result = {{ + vmulq_f16(vmulq_f16(ta1.val[0], ta2.val[0]), scale_vec), + vmulq_f16(vmulq_f16(ta1.val[1], ta2.val[1]), scale_vec), + }}; + vst1q_f16(output_ptr + x, result.val[0]); + vst1q_f16(output_ptr + x + 8, result.val[1]); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto ta1 = *(input1_ptr + x); + const auto ta2 = *(input2_ptr + x); + *(output_ptr + x) = ta1 * ta2 * scale; + } + }, + input1, input2, dst); + } +} +} // namespace cpu +} // namespace arm_compute +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/mul/generic/neon/fp32.cpp b/src/cpu/kernels/mul/generic/neon/fp32.cpp new file mode 100644 index 0000000000..3001eb5110 --- /dev/null +++ b/src/cpu/kernels/mul/generic/neon/fp32.cpp @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2023 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" + +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/CpuTypes.h" + +namespace arm_compute +{ +namespace cpu +{ +void mul_F32_F32_F32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale) +{ + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + constexpr int window_step_x = 16 / sizeof(float); + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x(); + + using ExactTagType = typename wrapper::traits::neon_vector<float, window_step_x>::tag_type; + + if (is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? 
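+ // Same structure as the F16 kernel, but written with the wrapper:: intrinsics;
+ // ExactTagType selects the 128-bit NEON vector type holding 4 floats at compile time.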
src2 : src1; + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator dst(out, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast<const float *>(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast<float *>(dst.ptr()); + + const float broadcast_value = *reinterpret_cast<const float *>(broadcast_input.ptr()); + const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{}); + const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{}); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x); + auto res = wrapper::vmul(wrapper::vmul(broadcast_value_vec, non_broadcast_v), scale_vec); + wrapper::vstore(output_ptr + x, res); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto non_broadcast_v = *(non_broadcast_input_ptr + x); + *(output_ptr + x) = broadcast_value * non_broadcast_v * scale; + } + }, + broadcast_input, non_broadcast_input, dst); + } + else + { + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(src1, input1_win); + Iterator input2(src2, input2_win); + Iterator dst(out, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast<const float *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const float *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<float *>(dst.ptr()); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto ta1 = wrapper::vloadq(input1_ptr + x); + const auto ta2 = wrapper::vloadq(input2_ptr + x); + const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{}); + const auto res = wrapper::vmul(wrapper::vmul(ta1, ta2), scale_vec); + wrapper::vstore(output_ptr + x, res); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto ta1 = *(input1_ptr + x); + const auto ta2 = *(input2_ptr + x); + *(output_ptr + x) = ta1 * ta2 * scale; + } + }, + input1, input2, dst); + } +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/mul/generic/neon/list.h b/src/cpu/kernels/mul/generic/neon/list.h new file mode 100644 index 0000000000..710cb68b72 --- /dev/null +++ b/src/cpu/kernels/mul/generic/neon/list.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2023 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_CPU_KERNELS_MUL_GENERIC_NEON_LIST_H +#define ACL_SRC_CPU_KERNELS_MUL_GENERIC_NEON_LIST_H +namespace arm_compute +{ +namespace cpu +{ +#define DECLARE_MUL_KERNEL(func_name) \ + void func_name(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale) + +DECLARE_MUL_KERNEL(mul_F32_F32_F32); +DECLARE_MUL_KERNEL(mul_F16_F16_F16); +#undef DECLARE_MUL_KERNEL +} // namespace cpu +} // namespace arm_compute +#endif // ACL_SRC_CPU_KERNELS_MUL_GENERIC_NEON_LIST_H diff --git a/src/cpu/kernels/norm_layer/generic/neon/fp16.cpp b/src/cpu/kernels/norm_layer/generic/neon/fp16.cpp new file mode 100644 index 0000000000..f85fe7a31a --- /dev/null +++ b/src/cpu/kernels/norm_layer/generic/neon/fp16.cpp @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/CpuTypes.h" +#include "src/cpu/kernels/norm_layer/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ + +void neon_normalize_float16_8_0_2D( + const Window &window, const ITensor *in, const ITensor *in_squared, ITensor *out, NormalizationLayerInfo ninfo) +{ + arm_compute::normalize_float<float16_t, 8, 0, true>(window, in, in_squared, out, ninfo); +} + +void neon_normalize_float16_8_0( + const Window &window, const ITensor *in, const ITensor *in_squared, ITensor *out, NormalizationLayerInfo ninfo) +{ + arm_compute::normalize_float<float16_t, 8, 0, false>(window, in, in_squared, out, ninfo); +} + +void neon_normalize_float16_8_1_2D( + const Window &window, const ITensor *in, const ITensor *in_squared, ITensor *out, NormalizationLayerInfo ninfo) +{ + arm_compute::normalize_float<float16_t, 8, 1, true>(window, in, in_squared, out, ninfo); +} + +void neon_normalize_float16_8_1( + const Window &window, const ITensor *in, const ITensor *in_squared, ITensor *out, NormalizationLayerInfo ninfo) +{ + arm_compute::normalize_float<float16_t, 8, 1, false>(window, in, in_squared, out, ninfo); +} + +void neon_normalize_float16_8_2( + const Window &window, const ITensor *in, const ITensor *in_squared, ITensor *out, NormalizationLayerInfo ninfo) +{ + arm_compute::normalize_float<float16_t, 8, 2, false>(window, in, in_squared, out, ninfo); +} + +} // namespace cpu +} // namespace arm_compute +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/norm_layer/generic/neon/fp32.cpp b/src/cpu/kernels/norm_layer/generic/neon/fp32.cpp new file mode 100644 index 0000000000..0b64f46956 --- /dev/null +++ b/src/cpu/kernels/norm_layer/generic/neon/fp32.cpp @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/CpuTypes.h" +#include "src/cpu/kernels/norm_layer/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void neon_normalize_float32_4_0_2D( + const Window &window, const ITensor *in, const ITensor *in_squared, ITensor *out, NormalizationLayerInfo ninfo) +{ + arm_compute::normalize_float<float, 4, 0, true>(window, in, in_squared, out, ninfo); +} + +void neon_normalize_float32_4_0( + const Window &window, const ITensor *in, const ITensor *in_squared, ITensor *out, NormalizationLayerInfo ninfo) +{ + arm_compute::normalize_float<float, 4, 0, false>(window, in, in_squared, out, ninfo); +} + +void neon_normalize_float32_4_1_2D( + const Window &window, const ITensor *in, const ITensor *in_squared, ITensor *out, NormalizationLayerInfo ninfo) +{ + arm_compute::normalize_float<float, 4, 1, true>(window, in, in_squared, out, ninfo); +} + +void neon_normalize_float32_4_1( + const Window &window, const ITensor *in, const ITensor *in_squared, ITensor *out, NormalizationLayerInfo ninfo) +{ + arm_compute::normalize_float<float, 4, 1, false>(window, in, in_squared, out, ninfo); +} + +void neon_normalize_float32_4_2( + const Window &window, const ITensor *in, const ITensor *in_squared, ITensor *out, NormalizationLayerInfo ninfo) +{ + arm_compute::normalize_float<float, 4, 2, false>(window, in, in_squared, out, ninfo); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/norm_layer/generic/neon/impl.h b/src/cpu/kernels/norm_layer/generic/neon/impl.h new file mode 100644 index 0000000000..6103165679 --- /dev/null +++ b/src/cpu/kernels/norm_layer/generic/neon/impl.h @@ -0,0 +1,177 @@ +/* + * Copyright (c) 2017-2021, 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_CPU_KERNELS_NORM_LAYER_GENERIC_NEON_IMPL_H +#define ACL_SRC_CPU_KERNELS_NORM_LAYER_GENERIC_NEON_IMPL_H + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include "src/core/helpers/NormalizationHelpers.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" + +namespace arm_compute +{ +/** Function to perform normalization depending on the given template + * dimension. 
The second template parameter specifies whether the + * normalization has to be 1D or 2D. + * + * @note Only supported normalizations are: + * - 1D over X or Z + * - 2D over X and Y + * + * @param[in] window Region on which to execute the kernel. + * @param[in] in Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM], + * and an optional 4th dimension for batch of inputs. Data types supported: FP16/F32. Data layouts supported: NCHW/NHWC. + * @param[in] in_squared Source with each element has been squared. 3 lower dims represent a single input with dimensions [width, height, IFM], + * Data type and layout supported: same as @p input. + * @param[in] out Destination tensor. Output will have the same number of dimensions as input. Data type and layout supported: same as @p input. + * @param[in] ninfo Normalization layer information like the normalization type, normalization size and other parameters. + */ +template <typename T, unsigned int S, unsigned int dim, bool do_2D_norm> +void normalize_float( + const Window &window, const ITensor *in, const ITensor *in_squared, ITensor *out, NormalizationLayerInfo ninfo) +{ + /** SIMD vector tag type. */ + using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type; + + Window win(window); + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const int window_step_x = S; + + Iterator input(in, win); + Iterator input_squared(in_squared, win); + Iterator output(out, win); + + const int dim_y = in->info()->data_layout() == DataLayout::NCHW ? 1 : 2; + const int radius = ninfo.norm_size() / 2; + const int input_squared_stride_x = in_squared->info()->strides_in_bytes()[0]; + const int input_squared_stride_slice = in_squared->info()->strides_in_bytes()[dim]; + const int input_squared_stride_row = in_squared->info()->strides_in_bytes()[dim_y]; + + const int max_right = in->info()->dimension(dim) - 1; + const int max_bottom = in->info()->dimension(dim_y) - 1; + + const auto coeff_vec = wrapper::vdup_n(static_cast<T>(ninfo.scale_coeff()), ExactTagType{}); + const auto beta_vec = wrapper::vdup_n(static_cast<T>(ninfo.beta()), ExactTagType{}); + const auto kappa_vec = wrapper::vdup_n(static_cast<T>(ninfo.kappa()), ExactTagType{}); + + auto sequential_normalization = [&](const int x, const Coordinates &id, const int current_row, const int first_row, + const int last_row, const T *input_ptr, const uint8_t *input_squared_start_ptr, + T *output_ptr) + { + const int current_slice = dim == 0 ? 
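+ // For normalization along X (dim == 0) the slice index is the element position x itself;
+ // otherwise it comes from the window coordinates. The [first_slice, last_slice] and
+ // [first_row, last_row] ranges clamp the norm_size window to the tensor borders.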
x : id[dim]; + const int first_slice = std::max(current_slice - radius, 0); + const int last_slice = std::min(current_slice + radius, max_right); + + const uint8_t *const input_squared_x_ptr = input_squared_start_ptr + x * input_squared_stride_x; + // Accumulate 2D In-Map values + auto accu = static_cast<T>(0.f); + for (int j = first_row; j <= last_row; ++j) + { + // Compute row displacement + const uint8_t *const input_squared_ptr = input_squared_x_ptr + (j - current_row) * input_squared_stride_row; + for (int i = first_slice; i <= last_slice; ++i) + { + accu += + *reinterpret_cast<const T *>(input_squared_ptr + (i - current_slice) * input_squared_stride_slice); + } + } + + // Normalize + const auto normalized = + std::pow(accu * static_cast<T>(ninfo.scale_coeff()) + static_cast<T>(ninfo.kappa()), ninfo.beta()); + const auto normalized_pixel = (*(input_ptr + x)) / normalized; + *(output_ptr + x) = normalized_pixel; + }; + + execute_window_loop( + win, + [&](const Coordinates &id) + { + const auto input_ptr = reinterpret_cast<const T *>(input.ptr()); + auto output_ptr = reinterpret_cast<T *>(output.ptr()); + + // Get range to normalize + const int current_row = do_2D_norm ? id[dim_y] : 0; + const int first_row = do_2D_norm ? std::max(current_row - radius, 0) : 0; + const int last_row = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0; + + int x = window_start_x; + // Compute serially starting elements for the case x dimension is width + for (; x < radius && x < window_end_x && dim == 0; ++x) + { + sequential_normalization(x, id, current_row, first_row, last_row, input_ptr, input_squared.ptr(), + output_ptr); + } + + // Compute vectorized + for (; x <= window_end_x - window_step_x - radius; x += window_step_x) + { + const int current_slice = dim == 0 ? x : id[dim]; + const int first_slice = std::max(current_slice - radius, 0); + const int last_slice = std::min(current_slice + radius, max_right); + + const uint8_t *const input_squared_x_ptr = input_squared.ptr() + x * input_squared_stride_x; + // Accumulate 2D In-Map values + auto accu = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{}); + for (int j = first_row; j <= last_row; ++j) + { + // Compute row displacement + const uint8_t *const input_squared_ptr = + input_squared_x_ptr + (j - current_row) * input_squared_stride_row; + for (int i = first_slice; i <= last_slice; ++i) + { + accu = wrapper::vadd( + accu, wrapper::vloadq(reinterpret_cast<const T *>( + input_squared_ptr + (i - current_slice) * input_squared_stride_slice))); + } + } + + // Normalize + const auto normalized = wrapper::vpow(wrapper::vmla(kappa_vec, coeff_vec, accu), beta_vec); + const auto normalized_pixel = wrapper::vmul(wrapper::vloadq(input_ptr + x), wrapper::vinv(normalized)); + wrapper::vstore(reinterpret_cast<T *>(output_ptr + x), normalized_pixel); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + sequential_normalization(x, id, current_row, first_row, last_row, input_ptr, input_squared.ptr(), + output_ptr); + } + }, + input, input_squared, output); +} + +} // namespace arm_compute +#endif // ACL_SRC_CPU_KERNELS_NORM_LAYER_GENERIC_NEON_IMPL_H diff --git a/src/cpu/kernels/norm_layer/generic/neon/list.h b/src/cpu/kernels/norm_layer/generic/neon/list.h new file mode 100644 index 0000000000..f2e83d7af1 --- /dev/null +++ b/src/cpu/kernels/norm_layer/generic/neon/list.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2023 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_CPU_KERNELS_NORM_LAYER_GENERIC_NEON_LIST_H +#define ACL_SRC_CPU_KERNELS_NORM_LAYER_GENERIC_NEON_LIST_H +namespace arm_compute +{ +namespace cpu +{ + +#define DECLARE_NORMALIZATION_KERNEL(func_name) \ + void func_name(const Window &window, const ITensor *in, const ITensor *in_squared, ITensor *out, \ + NormalizationLayerInfo ninfo) + +DECLARE_NORMALIZATION_KERNEL(neon_normalize_float32_4_0_2D); +DECLARE_NORMALIZATION_KERNEL(neon_normalize_float32_4_0); +DECLARE_NORMALIZATION_KERNEL(neon_normalize_float32_4_1_2D); +DECLARE_NORMALIZATION_KERNEL(neon_normalize_float32_4_1); +DECLARE_NORMALIZATION_KERNEL(neon_normalize_float32_4_2); +DECLARE_NORMALIZATION_KERNEL(neon_normalize_float16_8_0_2D); +DECLARE_NORMALIZATION_KERNEL(neon_normalize_float16_8_0); +DECLARE_NORMALIZATION_KERNEL(neon_normalize_float16_8_1_2D); +DECLARE_NORMALIZATION_KERNEL(neon_normalize_float16_8_1); +DECLARE_NORMALIZATION_KERNEL(neon_normalize_float16_8_2); + +#undef DECLARE_NORMALIZATION_KERNEL +} // namespace cpu +} // namespace arm_compute +#endif // ACL_SRC_CPU_KERNELS_NORM_LAYER_GENERIC_NEON_LIST_H diff --git a/src/cpu/kernels/pool2d/neon/fp16.cpp b/src/cpu/kernels/pool2d/neon/fp16.cpp new file mode 100644 index 0000000000..9d24d79afb --- /dev/null +++ b/src/cpu/kernels/pool2d/neon/fp16.cpp @@ -0,0 +1,659 @@ +/* + * Copyright (c) 2021-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/Traits.h" + +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" +#include "src/cpu/kernels/pool2d/neon/impl.h" +#include "src/cpu/kernels/pool2d/neon/list.h" + +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + +namespace arm_compute +{ +namespace cpu +{ +#ifdef ENABLE_NCHW_KERNELS + +namespace +{ +float16x4_t +read_4_boundary_aware_fp16(int srcw, int srch, int pad_l, int pad_t, int x, int y, const float16_t *ptr, float16_t fval) +{ + float16_t vec[4]; + const bool row_in_bounds((y >= pad_t) && (y < (srch + pad_t))); + for (int i = 0; i < 4; i++) + { + if (row_in_bounds && (x + i >= pad_l) && (x + i < (srcw + pad_l))) + { + vec[i] = *(ptr + i); + } + else + { + vec[i] = fval; + } + } + return wrapper::vload(vec); +} +} // namespace + +void pooling3_fp16_neon_nchw(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) +{ + ARM_COMPUTE_UNUSED(dst1); + + Iterator in(src, window_src); + Iterator out(dst0, window); + + constexpr const int pool_size = 3; + const int pool_pad_right = pool_info.pad_stride_info.pad_right(); + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); + int pool_stride_x = 0; + int pool_stride_y = 0; + std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); + const int src_w = src->info()->dimension(0); + const int src_h = src->info()->dimension(1); + const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right); + const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom); + const float16_t fp16_min = get_initial_min<half_float::half>(pool_info.use_inf_as_limit); + const float16_t fill_value = (pool_info.pool_type == PoolingType::MAX) ? 
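+ // Out-of-bounds reads are replaced by this fill value: the smallest representable value
+ // for max pooling (so padding never wins the max) and 0 for average/L2 pooling
+ // (so padding does not contribute to the sum).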
fp16_min : 0.f; + const unsigned char *const src_top_ptr = + src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))); + const unsigned char *const src_middle_ptr = + src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)); + const unsigned char *const src_bottom_ptr = + src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2)); + + execute_window_loop( + window, + [&](const Coordinates &id) + { + const auto x_val = id.x() * pool_stride_x; + const auto y_val_0 = id.y() * pool_stride_y; + const auto y_val_1 = (id.y() * pool_stride_y) + 1; + const auto y_val_2 = (id.y() * pool_stride_y) + 2; + float16x4_t top_data = + read_4_boundary_aware_fp16(src_w, src_h, pool_pad_left, pool_pad_top, x_val, y_val_0, + reinterpret_cast<const float16_t *>(src_top_ptr + in.offset()), fill_value); + float16x4_t middle_data = read_4_boundary_aware_fp16( + src_w, src_h, pool_pad_left, pool_pad_top, x_val, y_val_1, + reinterpret_cast<const float16_t *>(src_middle_ptr + in.offset()), fill_value); + float16x4_t bottom_data = read_4_boundary_aware_fp16( + src_w, src_h, pool_pad_left, pool_pad_top, x_val, y_val_2, + reinterpret_cast<const float16_t *>(src_bottom_ptr + in.offset()), fill_value); + float16x4_t res = {}; + + // Get power of 2 in case of l2 pooling + if (pool_info.pool_type == PoolingType::L2) + { + top_data = vmul_f16(top_data, top_data); + middle_data = vmul_f16(middle_data, middle_data); + bottom_data = vmul_f16(bottom_data, bottom_data); + } + + if (pool_info.pool_type != PoolingType::MAX) + { + // Calculate scale + const float scale = calculate_avg_scale_pool2d( + pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, + pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + const float16x4_t scale_v = vdup_n_f16(scale); + // Perform pooling + const float16x4_t sum_data = vadd_f16(vadd_f16(top_data, bottom_data), middle_data); + res = vpadd_f16(vset_lane_f16(0.f, sum_data, 3), sum_data); + res = vmul_f16(vpadd_f16(res, res), scale_v); + } + else + { + const float16x4_t max_data = vmax_f16(vmax_f16(top_data, bottom_data), middle_data); + res = vpmax_f16(vset_lane_f16(fp16_min, max_data, 3), max_data); + res = vpmax_f16(res, res); + } + + // Calculate square-root in case of l2 pooling + if (pool_info.pool_type == PoolingType::L2) + { + res = vsqrt_f16(res); + } + + *(reinterpret_cast<float16_t *>(out.ptr())) = vget_lane_f16(res, 0); + }, + in, out); +} +#endif // ENABLE_NCHW_KERNELS + +void pooling2_f16_maxpool_indices(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) +{ + const int window_start_x = window.x().start(); + const int window_end_x = window.x().end(); + const int window_step_x = 8; + + Window window_out = window; + window_out.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator in(src, window_src); + Iterator out(dst0, window_out); + Iterator indices(dst1, window_out); + + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + + int pool_stride_x = 0; + int pool_stride_y = 0; + std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); + + const int pad_right = src->info()->padding().right; + const int pad_left = src->info()->padding().left; + const int pad_horizontal = pad_right + pad_left; + const int in_stride_y = 
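+ // Besides the pooled maxima, this kernel emits, for every output element, the linear
+ // index of the winning input element in the unpadded source tensor; in_stride_y,
+ // in_stride_z and pad_horizontal are used below to turn iterator offsets into those indices.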
static_cast<int>(src->info()->strides_in_bytes().y()); + const int in_stride_z = static_cast<int>(src->info()->strides_in_bytes().z()); + + execute_window_loop( + window_out, + [&](const Coordinates &id) + { + const int idx_width = id.y() * pool_stride_x; + const int idx_height = id.z() * pool_stride_y; + const int pool_limit_y = pool_pad_top - idx_height; + const int pool_limit_x = pool_pad_left - idx_width; + + const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y); + const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x); + const int in_x0_offset = + (pool_start_x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + + (pool_start_y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z()); + const int in_x1_offset = + (pool_start_x + 1 - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + + (pool_start_y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z()); + const int in_x2_offset = + (pool_start_x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + + (pool_start_y + 1 - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z()); + const int in_x3_offset = + (pool_start_x + 1 - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + + (pool_start_y + 1 - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z()); + + int x_off = window_start_x; + for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) + { + const auto in_x0_ptr = reinterpret_cast<const float16_t *>(in.ptr() + in_x0_offset) + x_off; + const auto in_x1_ptr = reinterpret_cast<const float16_t *>(in.ptr() + in_x1_offset) + x_off; + const auto in_x2_ptr = reinterpret_cast<const float16_t *>(in.ptr() + in_x2_offset) + x_off; + const auto in_x3_ptr = reinterpret_cast<const float16_t *>(in.ptr() + in_x3_offset) + x_off; + const auto v_x0 = vld1q_f16(in_x0_ptr); + const auto v_x1 = vld1q_f16(in_x1_ptr); + const auto v_x2 = vld1q_f16(in_x2_ptr); + const auto v_x3 = vld1q_f16(in_x3_ptr); + float16x8_t vres = vmaxq_f16(vmaxq_f16(v_x2, v_x3), vmaxq_f16(v_x0, v_x1)); + // Store result + vst1q_f16(reinterpret_cast<float16_t *>(out.ptr()) + x_off, vres); + + const uint32_t offset_base = offset_no_padding<float16_t>(in.offset(), id, *src->info(), pool_stride_x, + pool_stride_y, DataLayout::NHWC); + const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float16_t) + x_off; + const uint32_t offset_x1 = (uint32_t)offset_x0 + in_stride_y / sizeof(float16_t) - pad_horizontal; + const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float16_t) - + pad_horizontal * src->info()->tensor_shape()[1]; + const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float16_t) - pad_horizontal; + const uint32x4_t voffset_x0_0 = {offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3}; + const uint32x4_t voffset_x0_1 = {offset_x0 + 4, offset_x0 + 5, offset_x0 + 6, offset_x0 + 7}; + const uint16x8_t voffset_x0 = vcombine_u16(vmovn_u32(voffset_x0_0), vmovn_u32(voffset_x0_1)); + const uint32x4_t voffset_x1_0 = {offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3}; + const uint32x4_t voffset_x1_1 = {offset_x1 + 4, offset_x1 + 5, offset_x1 + 6, offset_x1 + 7}; + const uint16x8_t voffset_x1 = vcombine_u16(vmovn_u32(voffset_x1_0), vmovn_u32(voffset_x1_1)); + const uint32x4_t voffset_x2_0 = {offset_x2, offset_x2 + 1, offset_x2 + 2, offset_x2 + 3}; + const uint32x4_t voffset_x2_1 = {offset_x2 + 4, offset_x2 + 5, offset_x2 + 6, offset_x2 + 
7}; + const uint16x8_t voffset_x2 = vcombine_u16(vmovn_u32(voffset_x2_0), vmovn_u32(voffset_x2_1)); + const uint32x4_t voffset_x3_0 = {offset_x3, offset_x3 + 1, offset_x3 + 2, offset_x3 + 3}; + const uint32x4_t voffset_x3_1 = {offset_x3 + 4, offset_x3 + 5, offset_x3 + 6, offset_x3 + 7}; + const uint16x8_t voffset_x3 = vcombine_u16(vmovn_u32(voffset_x3_0), vmovn_u32(voffset_x3_1)); + const uint16x8_t tmp_indices0 = vbslq_u16(vcgeq_f16(v_x0, v_x1), voffset_x0, voffset_x1); + const uint16x8_t tmp_indices1 = vbslq_u16(vcgeq_f16(v_x2, v_x3), voffset_x2, voffset_x3); + const uint16x8_t tmp_indices2 = + vbslq_u16(vcgeq_f16(vmaxq_f16(v_x0, v_x1), vmaxq_f16(v_x2, v_x3)), tmp_indices0, tmp_indices1); + const uint32x4_t tmp_indeces3_0 = vmovl_u16(vget_low_u16(tmp_indices2)); + const uint32x4_t tmp_indeces3_1 = vmovl_u16(vget_high_u16(tmp_indices2)); + // Store indicies + vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off, tmp_indeces3_0); + vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr() + 16) + x_off, tmp_indeces3_1); + } + + // Left-overs loop + for (; x_off < window_end_x; ++x_off) + { + const auto x0 = *(reinterpret_cast<const float16_t *>(in.ptr() + in_x0_offset) + x_off); + const auto x1 = *(reinterpret_cast<const float16_t *>(in.ptr() + in_x1_offset) + x_off); + const auto x2 = *(reinterpret_cast<const float16_t *>(in.ptr() + in_x2_offset) + x_off); + const auto x3 = *(reinterpret_cast<const float16_t *>(in.ptr() + in_x3_offset) + x_off); + float16_t res = std::max(std::max(x2, x3), std::max(x0, x1)); + + // Store result + *(reinterpret_cast<float16_t *>(out.ptr()) + x_off) = res; + + const uint32_t offset_base = offset_no_padding<float16_t>(in.offset(), id, *src->info(), pool_stride_x, + pool_stride_y, DataLayout::NHWC); + const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float16_t) + x_off; + const uint32_t offset_x1 = (uint32_t)offset_x0 + in_stride_y / sizeof(float16_t) - pad_horizontal; + const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float16_t) - + pad_horizontal * src->info()->tensor_shape()[1]; + const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float16_t) - pad_horizontal; + const uint32_t tmp_idx0 = (x0 >= x1) ? offset_x0 : offset_x1; + const uint32_t tmp_idx1 = (x2 >= x3) ? offset_x2 : offset_x3; + const uint32_t tmp_idx2 = (std::max(x0, x1) >= std::max(x2, x3)) ? tmp_idx0 : tmp_idx1; + + // Store indices + *(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off) = tmp_idx2; + } + }, + in, out, indices); +} +#ifdef ENABLE_NCHW_KERNELS + +void pooling2_fp16_neon_nchw(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) +{ + if (pool_info.pool_type == PoolingType::MAX && dst1) + { + pooling2_nchw_maxpool_indices<float16_t>(src, dst0, dst1, pool_info, window_src, window); + } + else + { + Iterator in(src, window_src); + Iterator out(dst0, window); + constexpr int pool_size = 2; + const int pool_pad_right = pool_info.pad_stride_info.pad_right(); + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); + int pool_stride_x, pool_stride_y = 0; + std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); + const int src_w = src->info()->dimension(0); + const int src_h = src->info()->dimension(1); + const int upper_bound_w = src_w + (pool_info.exclude_padding ? 
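+ // upper_bound_w/h feed calculate_avg_scale_pool2d: with exclude_padding the averaging
+ // denominator counts only elements inside the source, otherwise padded positions are
+ // counted as well.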
0 : pool_pad_right); + const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom); + const float16_t fp16_min = get_initial_min<half_float::half>(pool_info.use_inf_as_limit); + const float16_t fill_value = (pool_info.pool_type == PoolingType::MAX) ? fp16_min : 0.0f; + + const unsigned char *const src_top_ptr = + src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))); + const unsigned char *const src_bottom_ptr = + src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)); + + execute_window_loop( + window, + [&](const Coordinates &id) + { + const auto in_top_ptr = reinterpret_cast<const float16_t *>(src_top_ptr + in.offset()); + const auto in_bottom_ptr = reinterpret_cast<const float16_t *>(src_bottom_ptr + in.offset()); + + const auto x_val = id.x() * pool_stride_x; + const auto y_val_0 = id.y() * pool_stride_y; + const auto y_val_1 = (id.y() * pool_stride_y) + 1; + float16x4_t top_data = read_4_boundary_aware_fp16(src_w, src_h, pool_pad_left, pool_pad_top, x_val, + y_val_0, in_top_ptr, fill_value); + float16x4_t bottom_data = read_4_boundary_aware_fp16(src_w, src_h, pool_pad_left, pool_pad_top, x_val, + y_val_1, in_bottom_ptr, fill_value); + float16x4_t res = {}; + + // Get power of 2 in case of l2 pooling + if (pool_info.pool_type == PoolingType::L2) + { + top_data = vmul_f16(top_data, top_data); + bottom_data = vmul_f16(bottom_data, bottom_data); + } + + if (pool_info.pool_type != PoolingType::MAX) + { + const float scale = calculate_avg_scale_pool2d( + pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, + upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + const float16x4_t scale_v = vdup_n_f16(scale); + + const float16x4_t sum_data = vadd_f16(top_data, bottom_data); + res = vmul_f16(vpadd_f16(sum_data, sum_data), scale_v); + } + else + { + const float16x4_t max_data = vmax_f16(top_data, bottom_data); + res = vpmax_f16(max_data, max_data); + } + + // Calculate square-root in case of l2 pooling + if (pool_info.pool_type == PoolingType::L2) + { + res = vsqrt_f16(res); + } + + // Store result + *(reinterpret_cast<float16_t *>(out.ptr())) = vget_lane_f16(res, 0); + }, + in, out); + } +} + +void poolingMxN_fp16_neon_nchw(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) +{ + ARM_COMPUTE_UNUSED(dst1); + Iterator in(src, window_src); + Iterator out(dst0, window); + + const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width; + const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.height; + const int pool_pad_right = pool_info.pad_stride_info.pad_right(); + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); + int pool_stride_x = 0; + int pool_stride_y = 0; + std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); + const int src_w = src->info()->dimension(0); + const int src_h = src->info()->dimension(1); + const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right); + const int upper_bound_h = src_h + (pool_info.exclude_padding ? 
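+ // Generic MxN NCHW kernel: each output element is produced by scalar accumulation over
+ // the whole pool window, squaring inputs first for L2 pooling and taking the square
+ // root of the result at the end.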
0 : pool_pad_bottom); + const float16_t fp16_min = get_initial_min<half_float::half>(pool_info.use_inf_as_limit); + const float16_t fill_value = (pool_info.pool_type == PoolingType::MAX) ? fp16_min : 0.0f; + + execute_window_loop( + window, + [&](const Coordinates &id) + { + float16_t res = 0.0f; + + if (pool_info.pool_type != PoolingType::MAX) + { + // Calculate scale + const float16_t scale = calculate_avg_scale_pool2d( + pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, + upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + + // Perform pooling + for (int y = 0; y < pool_size_y; ++y) + { + for (int x = 0; x < pool_size_x; ++x) + { + const auto ptr = reinterpret_cast<const float16_t *>( + in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x()) + + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y())); + + const int idx = x + id.x() * pool_stride_x - pool_pad_left; + const int idy = y + id.y() * pool_stride_y - pool_pad_top; + float16_t data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *ptr; + + if (pool_info.pool_type == PoolingType::L2) + { + data *= data; + } + + res += data; + } + } + + // Divide by scale + res *= scale; + } + else // if max pooling + { + res = fp16_min; + + for (int y = 0; y < pool_size_y; ++y) + { + for (int x = 0; x < pool_size_x; ++x) + { + const auto ptr = reinterpret_cast<const float16_t *>( + in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x()) + + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y())); + + const int idx = x + id.x() * pool_stride_x - pool_pad_left; + const int idy = y + id.y() * pool_stride_y - pool_pad_top; + float16_t data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *ptr; + res = std::max(res, data); + } + } + } + + // Calculate square-root in case of l2 pooling + if (pool_info.pool_type == PoolingType::L2) + { + res = std::sqrt(res); + } + + // Store result + *(reinterpret_cast<float16_t *>(out.ptr())) = res; + }, + in, out); +} +#endif // ENABLE_NCHW_KERNELS + +void poolingMxN_fp16_neon_nhwc(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) +{ + if (pool_info.pool_size == Size2D(2, 2) && pool_info.pool_type == PoolingType::MAX && dst1) + { + pooling2_f16_maxpool_indices(src, dst0, dst1, pool_info, window_src, window); + } + const int window_start_x = window.x().start(); + const int window_end_x = window.x().end(); + const int window_step_x = 8; + + Window window_out = window; + window_out.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator in(src, window_src); + Iterator out(dst0, window_out); + + const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width; + const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height; + const int pool_pad_right = pool_info.pad_stride_info.pad_right(); + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); + int pool_stride_x = 0; + int pool_stride_y = 0; + std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); + const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 
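+ // NHWC layout: the window's Y/Z dimensions walk the width and height of the source while
+ // X walks channels, so each vector iteration below pools 8 fp16 channels at once.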
0 : pool_pad_right); + const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 0 : pool_pad_bottom); + const float16_t min_value = get_initial_min<half_float::half>(pool_info.use_inf_as_limit); + float16x8_t vres; + + execute_window_loop( + window_out, + [&](const Coordinates &id) + { + const int idx_width = id.y() * pool_stride_x; + const int idx_height = id.z() * pool_stride_y; + const int pool_limit_y = pool_pad_top - idx_height; + const int pool_limit_x = pool_pad_left - idx_width; + + const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y); + const int pool_end_y = std::min(pool_size_y, window_src.z().end() + pool_limit_y); + const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x); + const int pool_end_x = std::min(pool_size_x, window_src.y().end() + pool_limit_x); + + int x_off = window_start_x; + for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) + { + if (pool_info.pool_type != PoolingType::MAX) + { + // Calculate scale + const float scale = calculate_avg_scale_pool2d( + pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, + upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + const float16x8_t scale_v = vdupq_n_f16(scale); + + // Perform pooling + vres = vdupq_n_f16(0.0f); + for (int y = pool_start_y; y < pool_end_y; ++y) + { + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const float16x8_t data = vld1q_f16( + reinterpret_cast<const float16_t *>( + in.ptr() + + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) + + x_off); + + // Get power of 2 in case of l2 pooling and accumulate + if (pool_info.pool_type == PoolingType::L2) + { + vres = vaddq_f16(vres, vmulq_f16(data, data)); + } + else + { + vres = vaddq_f16(vres, data); + } + } + } + // Divide by scale + vres = vmulq_f16(vres, scale_v); + } + else + { + vres = vdupq_n_f16(min_value); + + for (int y = pool_start_y; y < pool_end_y; ++y) + { + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const float16x8_t data = vld1q_f16( + reinterpret_cast<const float16_t *>( + in.ptr() + + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) + + x_off); + vres = vmaxq_f16(vres, data); + } + } + } + + // Calculate square-root in case of l2 pooling + if (pool_info.pool_type == PoolingType::L2) + { + float16x8_t sqrt_reciprocal = vrsqrteq_f16(vres); + vres = vmulq_f16(vres, vmulq_f16(vrsqrtsq_f16(vmulq_f16(vres, sqrt_reciprocal), sqrt_reciprocal), + sqrt_reciprocal)); + } + + // Store result + vst1q_f16(reinterpret_cast<float16_t *>(out.ptr()) + x_off, vres); + } + + // Left-overs loop + for (; x_off < window_end_x; ++x_off) + { + float16_t res = 0.0f; + + if (pool_info.pool_type != PoolingType::MAX) + { + // Calculate scale + const float16_t scale = calculate_avg_scale_pool2d( + pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, + upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + + for (int y = pool_start_y; y < pool_end_y; ++y) + { + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const float data = + *(reinterpret_cast<const float16_t *>( + in.ptr() + + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) 
+ + x_off); + + // Get power of 2 in case of l2 pooling and accumulate + if (pool_info.pool_type == PoolingType::L2) + { + res += data * data; + } + else + { + res += data; + } + } + } + + // Divide by scale + res *= scale; + } + else + { + res = min_value; + for (int y = pool_start_y; y < pool_end_y; ++y) + { + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const float16_t data = + *(reinterpret_cast<const float16_t *>( + in.ptr() + + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) + + x_off); + res = std::max(res, data); + } + } + } + + // Calculate square-root in case of l2 pooling + if (pool_info.pool_type == PoolingType::L2) + { + res = std::sqrt(res); + } + + // Store result + *(reinterpret_cast<float16_t *>(out.ptr()) + x_off) = res; + } + }, + in, out); +} +} // namespace cpu +} // namespace arm_compute + +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/pool2d/neon/fp32.cpp b/src/cpu/kernels/pool2d/neon/fp32.cpp new file mode 100644 index 0000000000..aaa37863cb --- /dev/null +++ b/src/cpu/kernels/pool2d/neon/fp32.cpp @@ -0,0 +1,481 @@ +/* + * Copyright (c) 2021-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" + +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" +#include "src/cpu/kernels/pool2d/neon/list.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace +{ +void pooling2_f32_maxpool_indices(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) +{ + const int window_start_x = window.x().start(); + const int window_end_x = window.x().end(); + const int window_step_x = 4; + + Window window_out = window; + window_out.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator in(src, window_src); + Iterator out(dst0, window_out); + Iterator indices(dst1, window_out); + + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + + int pool_stride_x = 0; + int pool_stride_y = 0; + std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); + + float32x4_t vres; + float res; + + const int pad_right = src->info()->padding().right; + const int pad_left = src->info()->padding().left; + const int pad_horizontal = pad_right + pad_left; + const int in_stride_y = static_cast<int>(src->info()->strides_in_bytes().y()); + const int in_stride_z = static_cast<int>(src->info()->strides_in_bytes().z()); + + execute_window_loop( + window_out, + [&](const Coordinates &id) + { + const int idx_width = id.y() * pool_stride_x; + const int idx_height = id.z() * pool_stride_y; + const int pool_limit_y = pool_pad_top - idx_height; + const int pool_limit_x = pool_pad_left - idx_width; + + const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y); + const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x); + + const int in_x0_offset = + (pool_start_x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + + (pool_start_y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z()); + const int in_x1_offset = + (pool_start_x + 1 - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + + (pool_start_y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z()); + const int in_x2_offset = + (pool_start_x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + + (pool_start_y + 1 - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z()); + const int in_x3_offset = + (pool_start_x + 1 - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + + (pool_start_y + 1 - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z()); + + int x_off = window_start_x; + for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) + { + const auto in_x0_ptr = reinterpret_cast<const float *>(in.ptr() + in_x0_offset); + const auto in_x1_ptr = reinterpret_cast<const float *>(in.ptr() + in_x1_offset); + const auto in_x2_ptr = reinterpret_cast<const float *>(in.ptr() + in_x2_offset); + const auto in_x3_ptr = reinterpret_cast<const float *>(in.ptr() + in_x3_offset); + const auto v_x0 = vld1q_f32(in_x0_ptr + x_off); + const auto v_x1 = vld1q_f32(in_x1_ptr + x_off); + const auto v_x2 = vld1q_f32(in_x2_ptr + x_off); + const auto v_x3 = vld1q_f32(in_x3_ptr + x_off); + vres = vmaxq_f32(vmaxq_f32(v_x2, v_x3), vmaxq_f32(v_x0, v_x1)); + // Store result + vst1q_f32(reinterpret_cast<float *>(out.ptr()) + x_off, vres); + + const uint32_t offset_base 
= offset_no_padding<float>(in.offset(), id, *src->info(), pool_stride_x, + pool_stride_y, DataLayout::NHWC); + const uint32_t offset_x0 = offset_base / sizeof(float) + x_off; + const uint32_t offset_x1 = offset_x0 + in_stride_y / sizeof(float) - pad_horizontal; + const uint32_t offset_x2 = + offset_x0 + in_stride_z / sizeof(float) - pad_horizontal * src->info()->tensor_shape()[1]; + const uint32_t offset_x3 = offset_x2 + in_stride_y / sizeof(float) - pad_horizontal; + const uint32x4_t voffset_x0 = {offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3}; + const uint32x4_t voffset_x1 = {offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3}; + const uint32x4_t voffset_x2 = {offset_x2, offset_x2 + 1, offset_x2 + 2, offset_x2 + 3}; + const uint32x4_t voffset_x3 = {offset_x3, offset_x3 + 1, offset_x3 + 2, offset_x3 + 3}; + const uint32x4_t tmp_indices0 = vbslq_u32(vcgeq_f32(v_x0, v_x1), voffset_x0, voffset_x1); + const uint32x4_t tmp_indices1 = vbslq_u32(vcgeq_f32(v_x2, v_x3), voffset_x2, voffset_x3); + const uint32x4_t tmp_indices2 = + vbslq_u32(vcgeq_f32(vmaxq_f32(v_x0, v_x1), vmaxq_f32(v_x2, v_x3)), tmp_indices0, tmp_indices1); + + // Store indices + vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off, tmp_indices2); + } + + // Left-overs loop + for (; x_off < window_end_x; ++x_off) + { + const auto x0 = *(reinterpret_cast<const float *>(in.ptr() + in_x0_offset) + x_off); + const auto x1 = *(reinterpret_cast<const float *>(in.ptr() + in_x1_offset) + x_off); + const auto x2 = *(reinterpret_cast<const float *>(in.ptr() + in_x2_offset) + x_off); + const auto x3 = *(reinterpret_cast<const float *>(in.ptr() + in_x3_offset) + x_off); + res = std::max(std::max(x2, x3), std::max(x0, x1)); + + // Store result + *(reinterpret_cast<float *>(out.ptr()) + x_off) = res; + + const uint32_t offset_base = offset_no_padding<float>(in.offset(), id, *src->info(), pool_stride_x, + pool_stride_y, DataLayout::NHWC); + const uint32_t offset_x0 = offset_base / sizeof(float) + x_off; + const uint32_t offset_x1 = offset_x0 + in_stride_y / sizeof(float) - pad_horizontal; + const uint32_t offset_x2 = + offset_x0 + in_stride_z / sizeof(float) - pad_horizontal * src->info()->tensor_shape()[1]; + const uint32_t offset_x3 = offset_x2 + in_stride_y / sizeof(float) - pad_horizontal; + const uint32_t tmp_idx0 = (x0 >= x1) ? offset_x0 : offset_x1; + const uint32_t tmp_idx1 = (x2 >= x3) ? offset_x2 : offset_x3; + const uint32_t tmp_idx2 = (std::max(x0, x1) >= std::max(x2, x3)) ? tmp_idx0 : tmp_idx1; + + // Store indices + *(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off) = tmp_idx2; + } + }, + in, out, indices); +} +} // namespace + +void poolingMxN_fp32_neon_nhwc_kernel_indices( + const ITensor *src, ITensor *dst0, ITensor *dst1, const PoolingLayerInfo &pool_info, const Window &window) +{ + const int window_start_x = window.x().start(); + const int window_end_x = window.x().end(); + constexpr int window_step_x = 4; + + Window window_out = window; + window_out.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator out(dst0, window_out); + Iterator indices(dst1, window_out); + + const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width; + const int pool_size_y = pool_info.is_global_pooling ? 
src->info()->tensor_shape().z() : pool_info.pool_size.height; + + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + + int pool_stride_x = 0; + int pool_stride_y = 0; + std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); + + const float min_value = get_initial_min<float>(pool_info.use_inf_as_limit); + + float32x4_t vres; + uint32x4_t vidx; + + constexpr int idx_width = 1; + constexpr int idx_height = 2; + constexpr int idx_batch = 3; + + const int y_stride = static_cast<int>(src->info()->strides_in_bytes().y()); + const int z_stride = static_cast<int>(src->info()->strides_in_bytes().z()); + const int n_stride = static_cast<int>(src->info()->strides_in_bytes()[idx_batch]); + + const int input_dim_w = src->info()->dimension(idx_width); + const int input_dim_h = src->info()->dimension(idx_height); + + const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); + + execute_window_loop( + window_out, + [&](const Coordinates &id) + { + const int idx_width = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left; + const int idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top; + + const int pool_start_x = std::max(0, -idx_width); + const int pool_start_y = std::max(0, -idx_height); + + const int pool_end_x = std::min(pool_size_x, input_dim_w - idx_width); + const int pool_end_y = std::min(pool_size_y, input_dim_h - idx_height); + + const uint8_t *in_ptr_n = in_ptr_start + id[idx_batch] * n_stride; + + const int in_ptr_y_offset = (z_stride * idx_height) + (pool_start_y * z_stride); + const int in_ptr_x_offset = (y_stride * idx_width) + (pool_start_x * y_stride); + + int x_off = window_start_x; + + for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) + { + vres = vdupq_n_f32(min_value); + vidx = vdupq_n_u32(0U); + const uint8_t *in_ptr_y = in_ptr_n + in_ptr_y_offset + in_ptr_x_offset; + uint32_t curr_kernel_index = pool_size_x * pool_start_y; + for (int y = pool_start_y; y < pool_end_y; ++y) + { + const uint8_t *in_ptr_x = in_ptr_y + (x_off * sizeof(float)); + curr_kernel_index += pool_start_x; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(in_ptr_x)); + const uint32x4_t vidx_curr = vdupq_n_u32(curr_kernel_index); + const uint32x4_t idxMask = vcgtq_f32(data, vres); + vidx = vbslq_u32(idxMask, vidx_curr, vidx); + vres = vmaxq_f32(vres, data); + in_ptr_x += y_stride; + curr_kernel_index++; + } + curr_kernel_index += (pool_size_x - pool_end_x); + in_ptr_y += z_stride; + } + // Store result + vst1q_f32(reinterpret_cast<float *>(out.ptr()) + x_off, vres); + vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off, vidx); + } + + // Left-overs loop + for (; x_off < window_end_x; ++x_off) + { + float res = min_value; + uint32_t idx = 0U; + const uint8_t *in_ptr_y = in_ptr_n + in_ptr_y_offset + in_ptr_x_offset; + for (int y = pool_start_y; y < pool_end_y; ++y) + { + const uint8_t *in_ptr_x = in_ptr_y + (x_off * sizeof(float)); + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const float data = *(reinterpret_cast<const float *>(in_ptr_x)); + if (data > res) + { + idx = pool_size_x * y + x; + res = data; + } + in_ptr_x += y_stride; + } + in_ptr_y += z_stride; + } + + // Store result + *(reinterpret_cast<float *>(out.ptr()) + x_off) = res; + *(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off) = idx; + } + }, + out, indices); +} + +void 
poolingMxN_fp32_neon_nhwc(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) +{ + if ((pool_info.pool_type == PoolingType::MAX) && pool_info.use_kernel_indices && (dst1 != nullptr)) + { + poolingMxN_fp32_neon_nhwc_kernel_indices(src, dst0, dst1, pool_info, window); + } + else if (pool_info.pool_size == Size2D(2, 2) && pool_info.pool_type == PoolingType::MAX && + !pool_info.pad_stride_info.has_padding() && (dst1 != nullptr)) + { + pooling2_f32_maxpool_indices(src, dst0, dst1, pool_info, window_src, window); + } + else + { + const int window_start_x = window.x().start(); + const int window_end_x = window.x().end(); + const int window_step_x = 4; + + Window window_out = window; + window_out.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator in(src, window_src); + Iterator out(dst0, window_out); + + const int pool_size_x = + pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width; + const int pool_size_y = + pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height; + const int pool_pad_right = pool_info.pad_stride_info.pad_right(); + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); + int pool_stride_x = 0; + int pool_stride_y = 0; + std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); + const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right); + const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 0 : pool_pad_bottom); + const float min_value = get_initial_min<float>(pool_info.use_inf_as_limit); + float32x4_t vres; + + execute_window_loop( + window_out, + [&](const Coordinates &id) + { + const int idx_width = id.y() * pool_stride_x; + const int idx_height = id.z() * pool_stride_y; + const int pool_limit_y = pool_pad_top - idx_height; + const int pool_limit_x = pool_pad_left - idx_width; + + const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y); + const int pool_end_y = std::min(pool_size_y, window_src.z().end() + pool_limit_y); + const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x); + const int pool_end_x = std::min(pool_size_x, window_src.y().end() + pool_limit_x); + + int x_off = window_start_x; + for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) + { + if (pool_info.pool_type != PoolingType::MAX) + { + // Calculate scale + const float scale = calculate_avg_scale_pool2d( + pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, + upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + const float32x4_t scale_v = vdupq_n_f32(scale); + + // Perform pooling + vres = vdupq_n_f32(0.0f); + + for (int y = pool_start_y; y < pool_end_y; ++y) + { + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const float32x4_t data = vld1q_f32( + reinterpret_cast<const float *>( + in.ptr() + + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) + + x_off); + + // Get power of 2 in case of l2 pooling and accumulate + if (pool_info.pool_type == PoolingType::L2) + { + vres = vmlaq_f32(vres, data, data); + } + else + { + vres = vaddq_f32(vres, data); + } + } + } + // Divide by scale + 
vres = vmulq_f32(vres, scale_v); + } + else + { + vres = vdupq_n_f32(min_value); + for (int y = pool_start_y; y < pool_end_y; ++y) + { + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const float32x4_t data = vld1q_f32( + reinterpret_cast<const float *>( + in.ptr() + + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) + + x_off); + vres = vmaxq_f32(vres, data); + } + } + } + + // Calculate square-root in case of l2 pooling + if (pool_info.pool_type == PoolingType::L2) + { + float32x4_t l2_res = {static_cast<float>(sqrt(vgetq_lane_f32(vres, 0))), + static_cast<float>(sqrt(vgetq_lane_f32(vres, 1))), + static_cast<float>(sqrt(vgetq_lane_f32(vres, 2))), + static_cast<float>(sqrt(vgetq_lane_f32(vres, 3)))}; + vres = l2_res; + } + + // Store result + vst1q_f32(reinterpret_cast<float *>(out.ptr()) + x_off, vres); + } + + // Left-overs loop + for (; x_off < window_end_x; ++x_off) + { + float res = 0.0f; + + if (pool_info.pool_type != PoolingType::MAX) + { + // Calculate scale + const float scale = calculate_avg_scale_pool2d( + pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, + upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + + for (int y = pool_start_y; y < pool_end_y; ++y) + { + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const float data = + *(reinterpret_cast<const float *>( + in.ptr() + + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) + + x_off); + + // Get power of 2 in case of l2 pooling and accumulate + if (pool_info.pool_type == PoolingType::L2) + { + res += data * data; + } + else + { + res += data; + } + } + } + + // Divide by scale + res *= scale; + } + else + { + res = min_value; + for (int y = pool_start_y; y < pool_end_y; ++y) + { + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const float data = + *(reinterpret_cast<const float *>( + in.ptr() + + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) + + x_off); + res = std::max(res, data); + } + } + } + + // Calculate square-root in case of l2 pooling + if (pool_info.pool_type == PoolingType::L2) + { + res = std::sqrt(res); + } + + // Store result + *(reinterpret_cast<float *>(out.ptr()) + x_off) = res; + } + }, + in, out); + } +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/pool2d/neon/impl.h b/src/cpu/kernels/pool2d/neon/impl.h new file mode 100644 index 0000000000..008cf651e1 --- /dev/null +++ b/src/cpu/kernels/pool2d/neon/impl.h @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ACL_SRC_CPU_KERNELS_POOL2D_NEON_IMPL_H
+#define ACL_SRC_CPU_KERNELS_POOL2D_NEON_IMPL_H
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/Traits.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+#include "src/cpu/kernels/pool2d/neon/list.h"
+
+#include <limits>
+
+#ifdef ENABLE_NCHW_KERNELS
+namespace arm_compute
+{
+namespace cpu
+{
+
+namespace
+{
+template <typename T>
+auto read_2_boundary_aware_as_f32(int srcw, int srch, int pad_l, int pad_t, int x, int y, const T *ptr, T fval)
+{
+    T vec[2];
+    const bool row_in_bounds((y >= pad_t) && (y < (srch + pad_t)));
+    for (int i = 0; i < 2; i++)
+    {
+        if (row_in_bounds && (x + i >= pad_l) && (x + i < (srcw + pad_l)))
+        {
+            vec[i] = *(ptr + i);
+        }
+        else
+        {
+            vec[i] = fval;
+        }
+    }
+    float32_t vec_f32[2] = {vec[0], vec[1]};
+    return wrapper::vload(vec_f32);
+}
+} // namespace
+
+template <typename T>
+void pooling2_nchw_maxpool_indices(const ITensor    *src,
+                                   ITensor          *dst0,
+                                   ITensor          *dst1,
+                                   PoolingLayerInfo &pool_info,
+                                   const Window     &window_src,
+                                   const Window     &window)
+{
+    Iterator in(src, window_src);
+    Iterator out(dst0, window);
+    Iterator indices(dst1, window);
+    const int pool_pad_top  = pool_info.pad_stride_info.pad_top();
+    const int pool_pad_left = pool_info.pad_stride_info.pad_left();
+    int pool_stride_x = 0;
+    int pool_stride_y = 0;
+    std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
+    const int src_w = src->info()->dimension(0);
+    const int src_h = src->info()->dimension(1);
+    const uint8_t *const src_top_ptr =
+        src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
+    const uint8_t *const src_bottom_ptr =
+        src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
+    const int pad_left    = src->info()->padding().left;
+    const int pad_right   = src->info()->padding().right;
+    const int in_stride_y = static_cast<int>(src->info()->strides_in_bytes().y());
+    const T float_min  = get_initial_min<T>(pool_info.use_inf_as_limit);
+    const T fill_value = (pool_info.pool_type == PoolingType::MAX) ? float_min : 0.f;
+
+    execute_window_loop(
+        window,
+        [&](const Coordinates &id)
+        {
+            const auto x_val   = id.x() * pool_stride_x;
+            const auto y_val_0 = id.y() * pool_stride_y;
+            const auto y_val_1 = (id.y() * pool_stride_y) + 1;
+            auto top_data =
+                read_2_boundary_aware_as_f32(src_w, src_h, pool_pad_left, pool_pad_top, x_val, y_val_0,
+                                             reinterpret_cast<const T *>(src_top_ptr + in.offset()), fill_value);
+            auto bottom_data =
+                read_2_boundary_aware_as_f32(src_w, src_h, pool_pad_left, pool_pad_top, x_val, y_val_1,
+                                             reinterpret_cast<const T *>(src_bottom_ptr + in.offset()), fill_value);
+
+            // Calculate max data, compare top first, then bottom, to make sure the first max is recorded.
+            const float32x2_t max_data_top    = vpmax_f32(top_data, top_data);
+            const float32x2_t max_data_bottom = vpmax_f32(bottom_data, bottom_data);
+            const float32x2_t max_data        = vmax_f32(max_data_top, max_data_bottom);
+            *(reinterpret_cast<T *>(out.ptr())) = static_cast<T>(vget_lane_f32(max_data, 0));
+
+            // Calculate max data index, which will be used in max unpool.
+            const uint32_t offset_base =
+                offset_no_padding<T>(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NCHW);
+            const uint32_t offset_top    = (uint32_t)(offset_base / sizeof(T));
+            const uint32_t offset_bottom = offset_top + in_stride_y / sizeof(T) - pad_right - pad_left;
+            const uint32x2_t voffset_top    = {offset_top, offset_top + 1u};
+            const uint32x2_t voffset_bottom = {offset_bottom, offset_bottom + 1u};
+            const uint32x2_t tmp_indices_top =
+                vbsl_u32(vcge_f32(top_data, vrev64_f32(top_data)), voffset_top, vrev64_u32(voffset_top));
+            const uint32x2_t tmp_indices_bottom =
+                vbsl_u32(vcge_f32(bottom_data, vrev64_f32(bottom_data)), voffset_bottom, vrev64_u32(voffset_bottom));
+            *(reinterpret_cast<int *>(indices.ptr())) = vget_lane_u32(
+                vbsl_u32(vcge_f32(max_data_top, max_data_bottom), tmp_indices_top, tmp_indices_bottom), 0);
+        },
+        in, out, indices);
+}
+
+} // namespace cpu
+} // namespace arm_compute
+
+#endif // ENABLE_NCHW_KERNELS
+
+#endif // ACL_SRC_CPU_KERNELS_POOL2D_NEON_IMPL_H
diff --git a/src/cpu/kernels/pool2d/neon/list.h b/src/cpu/kernels/pool2d/neon/list.h
new file mode 100644
index 0000000000..5db843d56b
--- /dev/null
+++ b/src/cpu/kernels/pool2d/neon/list.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */ +#ifndef ACL_SRC_CPU_KERNELS_POOL2D_NEON_LIST_H +#define ACL_SRC_CPU_KERNELS_POOL2D_NEON_LIST_H + +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/Traits.h" + +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/kernels/pool2d/neon/quantized.h" + +#include <arm_neon.h> + +namespace arm_compute +{ +namespace cpu +{ +#define DECLARE_POOLING_KERNEL(func_name) \ + void func_name(const ITensor *src0, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &, const Window &window_src, \ + const Window &window) + +DECLARE_POOLING_KERNEL(poolingMxN_qasymm8_neon_nhwc); +DECLARE_POOLING_KERNEL(poolingMxN_qasymm8_signed_neon_nhwc); +DECLARE_POOLING_KERNEL(poolingMxN_fp16_neon_nhwc); +DECLARE_POOLING_KERNEL(poolingMxN_fp32_neon_nhwc); + +#if defined(ENABLE_NCHW_KERNELS) + +#if defined(ENABLE_FP16_KERNELS) +DECLARE_POOLING_KERNEL(pooling2_fp16_neon_nchw); +DECLARE_POOLING_KERNEL(pooling3_fp16_neon_nchw); +DECLARE_POOLING_KERNEL(poolingMxN_fp16_neon_nchw); +#endif /* defined(ENABLE_FP16_KERNELS) */ + +DECLARE_POOLING_KERNEL(pooling2_fp32_neon_nchw); +DECLARE_POOLING_KERNEL(pooling3_fp32_neon_nchw); +DECLARE_POOLING_KERNEL(pooling7_fp32_neon_nchw); +DECLARE_POOLING_KERNEL(poolingMxN_fp32_neon_nchw); +#endif /* defined(ENABLE_NCHW_KERNELS) */ + +#undef DECLARE_POOLING_KERNEL + +template <typename T> +T get_initial_min(bool use_inf_as_limit) +{ + return use_inf_as_limit ? -std::numeric_limits<T>::infinity() : std::numeric_limits<T>::lowest(); +} + +template <typename T> +inline uint32_t offset_no_padding(uint32_t padded_offset, + const Coordinates &id, + const ITensorInfo &info, + int pool_stride_x, + int pool_stride_y, + DataLayout data_layout) +{ + const int pad_left = info.padding().left; + const int pad_right = info.padding().right; + const int pad_top = info.padding().top; + const int pad_bottom = info.padding().bottom; + const int in_stride_y = static_cast<int>(info.strides_in_bytes().y()); + const int in_stride_w = static_cast<int>(info.strides_in_bytes()[3]); + const int pad_horiz = pad_left + pad_right; + const int pad_vert = pad_top + pad_bottom; + + if (data_layout == DataLayout::NCHW) + { + const uint32_t offset_base = + padded_offset - sizeof(T) * pad_horiz * id.y() * pool_stride_y /* subtract padding elems per row */ + - pad_top * sizeof(T) /* top padding */ + - sizeof(T) * pad_horiz * info.tensor_shape()[1] * id.z() - + pad_vert * in_stride_y * id.z() /* for each Z plane there are height*pad_right padding elems */ + - in_stride_w * id[3]; + + return offset_base; + } + else + { + const uint32_t offset_base = padded_offset - + sizeof(T) * pad_horiz * id.y() * pool_stride_x // subtract padding elems per row + - pad_top * sizeof(T) // top padding + - sizeof(T) * pad_horiz * info.tensor_shape()[1] * id.z() * + pool_stride_y // for each Z plane there are width*pad_right padding elems + - in_stride_w * id[3]; + + return offset_base; + } +} +} // namespace cpu +} // namespace arm_compute + +#endif // ACL_SRC_CPU_KERNELS_POOL2D_NEON_LIST_H diff --git a/src/cpu/kernels/pool2d/neon/nchw/all.cpp b/src/cpu/kernels/pool2d/neon/nchw/all.cpp new file mode 100644 index 0000000000..0602bea667 --- /dev/null +++ b/src/cpu/kernels/pool2d/neon/nchw/all.cpp @@ -0,0 +1,462 @@ +/* + * Copyright (c) 2021-2023 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/Traits.h" + +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" +#include "src/cpu/kernels/pool2d/neon/impl.h" +#include "src/cpu/kernels/pool2d/neon/list.h" + +#include <limits> + +#ifdef ENABLE_NCHW_KERNELS +namespace arm_compute +{ +namespace cpu +{ +#define READ_2_RIGHT_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval) \ + (x == width + pad_left - 1) ? vset_lane_f32(*(ptr), vdup_n_f32(fval), 0) : vld1_f32(ptr) +#define READ_2_LEFT_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval) \ + (x == pad_left - 1) ? vset_lane_f32(*(1 + ptr), vdup_n_f32(fval), 1) \ + : READ_2_RIGHT_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval) +#define READ_2_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval) \ + ((y < pad_top) || (x < pad_left - 1) || (y >= height + pad_top) || (x > width + pad_left - 1)) \ + ? vdup_n_f32(fval) \ + : READ_2_LEFT_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval) + +#define READ_4_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval) \ + vcombine_f32(READ_2_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval), \ + READ_2_BOUNDARY_AWARE(height, width, pad_left, pad_top, (x + 2), y, (ptr + 2), fval)) + +float32x4x2_t +read_8_boundary_aware(int height, int width, int pad_left, int pad_top, int x, int y, const float *ptr, float fval) +{ + float32x4x2_t vec; + vec.val[0] = READ_4_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval); + vec.val[1] = READ_4_BOUNDARY_AWARE(height, width, pad_left, pad_top, (x + 4), y, (ptr + 4), fval); + return vec; +} + +void poolingMxN_fp32_neon_nchw(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) +{ + ARM_COMPUTE_UNUSED(dst1); + Iterator in(src, window_src); + Iterator out(dst0, window); + + const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width; + const int pool_size_y = pool_info.is_global_pooling ? 
src->info()->tensor_shape().y() : pool_info.pool_size.height; + const int pool_pad_right = pool_info.pad_stride_info.pad_right(); + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); + int pool_stride_x = 0; + int pool_stride_y = 0; + std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); + const int src_w = src->info()->dimension(0); + const int src_h = src->info()->dimension(1); + const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right); + const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom); + const float min_value = get_initial_min<float>(pool_info.use_inf_as_limit); + const float fill_value = (pool_info.pool_type == PoolingType::MAX) ? min_value : 0.0f; + + execute_window_loop( + window, + [&](const Coordinates &id) + { + float res = 0.0f; + + if (pool_info.pool_type != PoolingType::MAX) + { + // Calculate scale + const float scale = calculate_avg_scale_pool2d( + pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, + upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + + // Perform pooling + for (int y = 0; y < pool_size_y; ++y) + { + for (int x = 0; x < pool_size_x; ++x) + { + const auto ptr = reinterpret_cast<const float *>( + in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x()) + + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y())); + + const int idx = x + id.x() * pool_stride_x - pool_pad_left; + const int idy = y + id.y() * pool_stride_y - pool_pad_top; + float data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *ptr; + + if (pool_info.pool_type == PoolingType::L2) + { + data *= data; + } + + res += data; + } + } + + // Divide by scale + res *= scale; + } + else // if max pooling + { + res = min_value; + + for (int y = 0; y < pool_size_y; ++y) + { + for (int x = 0; x < pool_size_x; ++x) + { + const auto ptr = reinterpret_cast<const float *>( + in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x()) + + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y())); + + const int idx = x + id.x() * pool_stride_x - pool_pad_left; + const int idy = y + id.y() * pool_stride_y - pool_pad_top; + float data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? 
fill_value : *ptr; + res = std::max(res, data); + } + } + } + + // Calculate square-root in case of l2 pooling + if (pool_info.pool_type == PoolingType::L2) + { + res = std::sqrt(res); + } + + // Store result + *(reinterpret_cast<float *>(out.ptr())) = res; + }, + in, out); +} + +void pooling2_fp32_neon_nchw(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) +{ + if (pool_info.pool_type == PoolingType::MAX && dst1) + { + pooling2_nchw_maxpool_indices<float>(src, dst0, dst1, pool_info, window_src, window); + } + else + { + Iterator in(src, window_src); + Iterator out(dst0, window); + constexpr int pool_size = 2; + const int pool_pad_right = pool_info.pad_stride_info.pad_right(); + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); + int pool_stride_x = 0; + int pool_stride_y = 0; + std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); + const int src_w = src->info()->dimension(0); + const int src_h = src->info()->dimension(1); + const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right); + const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom); + const float min_value = get_initial_min<float>(pool_info.use_inf_as_limit); + const float fill_value = (pool_info.pool_type == PoolingType::MAX) ? min_value : 0.0f; + + const uint8_t *const src_top_ptr = + src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))); + const uint8_t *const src_bottom_ptr = + src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)); + + execute_window_loop( + window, + [&](const Coordinates &id) + { + const auto in_top_ptr = reinterpret_cast<const float *>(src_top_ptr + in.offset()); + const auto in_bottom_ptr = reinterpret_cast<const float *>(src_bottom_ptr + in.offset()); + + const auto x_val = id.x() * pool_stride_x; + const auto y_val_0 = id.y() * pool_stride_y; + const auto y_val_1 = (id.y() * pool_stride_y) + 1; + auto top_data = READ_2_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_0, + in_top_ptr, fill_value); + auto bottom_data = READ_2_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_1, + in_bottom_ptr, fill_value); + float32x2_t res = {}; + float final_res = 0; + + // Get power of 2 in case of l2 pooling + if (pool_info.pool_type == PoolingType::L2) + { + top_data = vmul_f32(top_data, top_data); + bottom_data = vmul_f32(bottom_data, bottom_data); + } + + if (pool_info.pool_type != PoolingType::MAX) + { + // Calculate scale + float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, + pool_size, upper_bound_w, upper_bound_h, pool_pad_left, + pool_pad_top, pool_stride_x, pool_stride_y); + const float32x2_t scale_v = vdup_n_f32(scale); + + // Perform pooling + const float32x2_t sum_data = vadd_f32(top_data, bottom_data); + res = vmul_f32(vpadd_f32(sum_data, sum_data), scale_v); + } + else + { + const float32x2_t max_data = vmax_f32(top_data, bottom_data); + res = vpmax_f32(max_data, max_data); + } + final_res = vget_lane_f32(res, 0); + + // Calculate square-root in case of l2 pooling + if (pool_info.pool_type == PoolingType::L2) + { + final_res = sqrt(final_res); + } + + // Store result + *(reinterpret_cast<float *>(out.ptr())) = 
final_res; + }, + in, out); + } +} + +void pooling3_fp32_neon_nchw(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) +{ + ARM_COMPUTE_UNUSED(dst1); + Iterator in(src, window_src); + Iterator out(dst0, window); + + constexpr const int pool_size = 3; + const int pool_pad_right = pool_info.pad_stride_info.pad_right(); + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); + int pool_stride_x = 0; + int pool_stride_y = 0; + std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); + const int src_w = src->info()->dimension(0); + const int src_h = src->info()->dimension(1); + const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right); + const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom); + const float min_value = get_initial_min<float>(pool_info.use_inf_as_limit); + const float fill_value = (pool_info.pool_type == PoolingType::MAX) ? min_value : 0.0f; + + const uint8_t *const src_top_ptr = + src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))); + const uint8_t *const src_middle_ptr = + src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)); + const uint8_t *const src_bottom_ptr = + src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2)); + + execute_window_loop( + window, + [&](const Coordinates &id) + { + const auto in_top_ptr = reinterpret_cast<const float *>(src_top_ptr + in.offset()); + const auto in_middle_ptr = reinterpret_cast<const float *>(src_middle_ptr + in.offset()); + const auto in_bottom_ptr = reinterpret_cast<const float *>(src_bottom_ptr + in.offset()); + + const auto x_val = id.x() * pool_stride_x; + const auto y_val_0 = id.y() * pool_stride_y; + const auto y_val_1 = (id.y() * pool_stride_y) + 1; + const auto y_val_2 = (id.y() * pool_stride_y) + 2; + auto top_data = READ_4_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_0, in_top_ptr, + fill_value); + auto middle_data = READ_4_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_1, + in_middle_ptr, fill_value); + auto bottom_data = READ_4_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_2, + in_bottom_ptr, fill_value); + + float32x2_t res = {}; + float final_res = 0; + + // Get power of 2 in case of l2 pooling + if (pool_info.pool_type == PoolingType::L2) + { + top_data = vmulq_f32(top_data, top_data); + middle_data = vmulq_f32(middle_data, middle_data); + bottom_data = vmulq_f32(bottom_data, bottom_data); + } + + if (pool_info.pool_type != PoolingType::MAX) + { + // Calculate scale + float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, + pool_size, upper_bound_w, upper_bound_h, pool_pad_left, + pool_pad_top, pool_stride_x, pool_stride_y); + const float32x2_t scale_v = vdup_n_f32(scale); + + // Perform pooling + const float32x4_t sum_data = vaddq_f32(vaddq_f32(top_data, bottom_data), middle_data); + res = vpadd_f32(vget_high_f32(vsetq_lane_f32(0.f, sum_data, 3)), vget_low_f32(sum_data)); + res = vmul_f32(vpadd_f32(res, res), scale_v); + } + else + { + const float32x4_t max_data = vmaxq_f32(vmaxq_f32(top_data, bottom_data), middle_data); + res = 
vpmax_f32(vget_high_f32(vsetq_lane_f32(min_value, max_data, 3)), vget_low_f32(max_data)); + res = vpmax_f32(res, res); + } + final_res = vget_lane_f32(res, 0); + + // Calculate square-root in case of l2 pooling + if (pool_info.pool_type == PoolingType::L2) + { + final_res = sqrt(final_res); + } + + // Store result + *(reinterpret_cast<float *>(out.ptr())) = final_res; + }, + in, out); +} + +void pooling7_fp32_neon_nchw(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) +{ + ARM_COMPUTE_UNUSED(dst1); + Iterator in(src, window_src); + Iterator out(dst0, window); + + constexpr const int pool_size = 7; + const int pool_pad_right = pool_info.pad_stride_info.pad_right(); + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); + int pool_stride_x = 0; + int pool_stride_y = 0; + std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); + const int src_w = src->info()->dimension(0); + const int src_h = src->info()->dimension(1); + const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right); + const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom); + const float min_value = get_initial_min<float>(pool_info.use_inf_as_limit); + const float fill_value = (pool_info.pool_type == PoolingType::MAX) ? min_value : 0.0f; + + std::array<const uint8_t *, pool_size> src_ptrs{{}}; + for (int i = 0; i < pool_size; ++i) + { + src_ptrs[i] = + src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + i)); + } + + execute_window_loop( + window, + [&](const Coordinates &id) + { + auto in_ptr = reinterpret_cast<const float *>(src_ptrs[0] + in.offset()); + + auto x_val = id.x() * pool_stride_x; + auto y_val = id.y() * pool_stride_y; + float32x4x2_t data = + read_8_boundary_aware(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val, in_ptr, fill_value); + + float32x2_t res = {}; + float final_res = 0.f; + + if (pool_info.pool_type != PoolingType::MAX) + { + // Calculate scale + float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, + pool_size, upper_bound_w, upper_bound_h, pool_pad_left, + pool_pad_top, pool_stride_x, pool_stride_y); + const float32x2_t scale_v = vdup_n_f32(scale); + + // Get power of 2 in case of l2 pooling + if (pool_info.pool_type == PoolingType::L2) + { + data.val[0] = vmulq_f32(data.val[0], data.val[0]); + data.val[1] = vmulq_f32(data.val[1], data.val[1]); + } + float32x4_t sum_data = vaddq_f32(data.val[0], vsetq_lane_f32(0.f, data.val[1], 3)); + for (int i = 1; i < pool_size; ++i) + { + in_ptr = reinterpret_cast<const float *>(src_ptrs[i] + in.offset()); + + x_val = id.x() * pool_stride_x; + y_val = (id.y() * pool_stride_y) + i; + data = read_8_boundary_aware(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val, in_ptr, + fill_value); + // Get power of 2 in case of l2 pooling + if (pool_info.pool_type == PoolingType::L2) + { + data.val[0] = vmulq_f32(data.val[0], data.val[0]); + data.val[1] = vmulq_f32(data.val[1], data.val[1]); + } + sum_data = vaddq_f32(sum_data, data.val[0]); + sum_data = vaddq_f32(sum_data, vsetq_lane_f32(0.f, data.val[1], 3)); + } + res = vpadd_f32(vget_high_f32(sum_data), vget_low_f32(sum_data)); + res = vmul_f32(vpadd_f32(res, res), scale_v); + } + else + { + for (int i = 1; i < 
pool_size; ++i) + { + in_ptr = reinterpret_cast<const float *>(src_ptrs[i] + in.offset()); + + x_val = id.x() * pool_stride_x; + y_val = (id.y() * pool_stride_y) + i; + float32x4x2_t temp = read_8_boundary_aware(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val, + in_ptr, fill_value); + data = vmax2q_f32(data, temp); + } + res = vpmax_f32(vget_high_f32(vsetq_lane_f32(min_value, data.val[1], 3)), vget_low_f32(data.val[1])); + res = vpmax_f32(res, vpmax_f32(vget_high_f32(data.val[0]), vget_low_f32(data.val[0]))); + res = vpmax_f32(res, res); + } + final_res = vget_lane_f32(res, 0); + + // Calculate square-root in case of l2 pooling + if (pool_info.pool_type == PoolingType::L2) + { + final_res = sqrt(final_res); + } + + // Store result + *(reinterpret_cast<float *>(out.ptr())) = final_res; + }, + in, out); +} +} // namespace cpu +} // namespace arm_compute + +#endif // ENABLE_NCHW_KERNELS diff --git a/src/cpu/kernels/pool2d/neon/qasymm8.cpp b/src/cpu/kernels/pool2d/neon/qasymm8.cpp new file mode 100644 index 0000000000..44675b5394 --- /dev/null +++ b/src/cpu/kernels/pool2d/neon/qasymm8.cpp @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/Traits.h" + +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" +#include "src/cpu/kernels/pool2d/neon/list.h" + +namespace arm_compute +{ +namespace cpu +{ +void poolingMxN_qasymm8_neon_nhwc(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) +{ + poolingMxN_q8_neon_nhwc<uint8_t>(src, dst0, dst1, pool_info, window_src, window); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/pool2d/neon/qasymm8_signed.cpp b/src/cpu/kernels/pool2d/neon/qasymm8_signed.cpp new file mode 100644 index 0000000000..d434323e89 --- /dev/null +++ b/src/cpu/kernels/pool2d/neon/qasymm8_signed.cpp @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/Traits.h" + +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" +#include "src/cpu/kernels/pool2d/neon/list.h" + +namespace arm_compute +{ +namespace cpu +{ +void poolingMxN_qasymm8_signed_neon_nhwc(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) +{ + poolingMxN_q8_neon_nhwc<int8_t>(src, dst0, dst1, pool_info, window_src, window); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/pool2d/neon/quantized.h b/src/cpu/kernels/pool2d/neon/quantized.h new file mode 100644 index 0000000000..38f1b2f1f9 --- /dev/null +++ b/src/cpu/kernels/pool2d/neon/quantized.h @@ -0,0 +1,832 @@ +/* + * Copyright (c) 2021-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef SRC_CORE_NEON_KERNELS_QUANTIZED_H +#define SRC_CORE_NEON_KERNELS_QUANTIZED_H + +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/Traits.h" + +#include "src/core/helpers/PoolingHelpers.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/NEFixedPoint.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" + +#include <arm_neon.h> + +namespace arm_compute +{ +namespace cpu +{ +template <typename T> +void poolingMxN_q8_neon_nhwc(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) +{ + ARM_COMPUTE_UNUSED(dst1); + + const int window_start_x = window.x().start(); + const int window_end_x = window.x().end(); + const int window_step_x = 16; + const int window_half_step_x = window_step_x / 2; + + Window window_out = window; + window_out.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator in(src, window_src); + Iterator out(dst0, window_out); + + using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type; + using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type; + using q16_t = typename wrapper::traits::promote_t<T>; + using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type; + using q32_t = typename wrapper::traits::promote_t<q16_t>; + using q32x4_t = typename wrapper::traits::neon_vector<q32_t, 4>::type; + + const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width; + const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height; + const int pool_pad_right = pool_info.pad_stride_info.pad_right(); + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); + + int pool_stride_x = 0; + int pool_stride_y = 0; + std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); + const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right); + const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 
0 : pool_pad_bottom); + + const float32x4_t half_scale_v = vdupq_n_f32(0.5f); + const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform(); + const UniformQuantizationInfo dst_qinfo = dst0->info()->quantization_info().uniform(); + + const float quant_rescale = dst_qinfo.scale / src_qinfo.scale; + // "new_offset" doesn't have to consider the "half_scale_v" in its computation + // With a requantization performed in a single step there won't be uncertainties introduced + const int32_t new_offset = + dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / quant_rescale); + + const float requant_scale = dst_qinfo.scale / src_qinfo.scale; + const int32_t requant_offset = + dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale); + const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset); + + execute_window_loop( + window_out, + [&](const Coordinates &id) + { + const int idx_width = id.y() * pool_stride_x; + const int idx_height = id.z() * pool_stride_y; + const int pool_limit_y = pool_pad_top - idx_height; + const int pool_limit_x = pool_pad_left - idx_width; + + const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y); + const int pool_end_y = std::min(pool_size_y, window_src.z().end() + pool_limit_y); + const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x); + const int pool_end_x = std::min(pool_size_x, window_src.y().end() + pool_limit_x); + + int x_off = window_start_x; + for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) + { + if (pool_info.pool_type != PoolingType::MAX) + { + q32x4_t vres1 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{}); + q32x4_t vres2 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{}); + q32x4_t vres3 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{}); + q32x4_t vres4 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{}); + + // Calculate scale + const float scale = calculate_avg_scale_pool2d( + pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, + upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + + // Perform pooling + for (int y = pool_start_y; y < pool_end_y; ++y) + { + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const q8x16_t data = wrapper::vloadq( + reinterpret_cast<const T *>( + in.ptr() + + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) + + x_off); + + const q16x8_t data_q16 = wrapper::vmovl(wrapper::vgetlow(data)); + const q16x8_t data2_q16 = wrapper::vmovl(wrapper::vgethigh(data)); + vres1 = wrapper::vadd(vres1, wrapper::vmovl(wrapper::vgetlow(data_q16))); + vres2 = wrapper::vadd(vres2, wrapper::vmovl(wrapper::vgethigh(data_q16))); + vres3 = wrapper::vadd(vres3, wrapper::vmovl(wrapper::vgetlow(data2_q16))); + vres4 = wrapper::vadd(vres4, wrapper::vmovl(wrapper::vgethigh(data2_q16))); + } + } + + if (src_qinfo != dst_qinfo) + { + const float32x4x4_t vres = {{ + vcvtq_f32_q32(vres1), + vcvtq_f32_q32(vres2), + vcvtq_f32_q32(vres3), + vcvtq_f32_q32(vres4), + }}; + const auto requantized_dst = + vrequantize_pooling_with_scale<q8x16_t>(vres, quant_rescale, scale, new_offset); + // Store result + wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, wrapper::vgetlow(requantized_dst)); + 
wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, + wrapper::vgethigh(requantized_dst)); + } + else + { + const float32x4_t scale_v = vdupq_n_f32(scale); + // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero + vres1 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v)); + vres2 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v)); + vres3 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v)); + vres4 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v)); + + const q8x8_t res1 = + wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2))); + const q8x8_t res2 = + wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4))); + // Store result + wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, res1); + wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, res2); + } + } + else + { + q8x16_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_128_tag{}); + + for (int y = pool_start_y; y < pool_end_y; ++y) + { + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const q8x16_t data = wrapper::vloadq( + reinterpret_cast<const T *>( + in.ptr() + + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) + + x_off); + vres = wrapper::vmax(vres, data); + } + } + + // Store result + wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, + (src_qinfo != dst_qinfo) + ? vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(vres), + wrapper::vgethigh(vres), requant_qinfo) + : vres); + } + } + + if (pool_info.pool_type == PoolingType::MAX) + { + for (; x_off <= (window_end_x - window_half_step_x); x_off += window_half_step_x) + { + q8x8_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_64_tag{}); + for (int y = pool_start_y; y < pool_end_y; ++y) + { + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const q8x8_t data = wrapper::vload( + reinterpret_cast<const T *>( + in.ptr() + + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) + + x_off); + vres = wrapper::vmax(vres, data); + } + } + + // Store result + wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, + (src_qinfo != dst_qinfo) ? 
vrequantize_pooling<q8x8_t>(vres, requant_qinfo) : vres); + } + } + + // Left-overs loop + for (; x_off < window_end_x; ++x_off) + { + if (pool_info.pool_type != PoolingType::MAX) + { + q32_t res = static_cast<q32_t>(0.f); + + // Calculate scale + const float scale = calculate_avg_scale_pool2d( + pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, + upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + + // Perform pooling + for (int y = pool_start_y; y < pool_end_y; ++y) + { + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const T data = + *(reinterpret_cast<const T *>( + in.ptr() + + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) + + x_off); + res += data; + } + } + + if (src_qinfo != dst_qinfo) + { + const float res_f = static_cast<float>(res); + const float new_scale = quant_rescale / scale; + const auto requantized_dst = quantize<T>(res_f, UniformQuantizationInfo(new_scale, new_offset)); + + // Store result + *(reinterpret_cast<T *>(out.ptr()) + x_off) = requantized_dst; + } + else + { + // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero + res = static_cast<T>(0.5f + static_cast<float>(res) * scale); + + // Store result + *(reinterpret_cast<T *>(out.ptr()) + x_off) = res; + } + } + else + { + T res = std::numeric_limits<T>::min(); + + for (int y = pool_start_y; y < pool_end_y; ++y) + { + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const T data = + *(reinterpret_cast<const T *>( + in.ptr() + + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) + + x_off); + res = std::max(res, data); + } + } + + // Store result + if (src_qinfo != dst_qinfo) + { + const float res_f = static_cast<float>(res); + *(reinterpret_cast<T *>(out.ptr()) + x_off) = quantize<T>(res_f, requant_qinfo); + } + else + { + *(reinterpret_cast<T *>(out.ptr()) + x_off) = res; + } + } + } + }, + in, out); +} + +#if defined(ENABLE_NCHW_KERNELS) +template <typename T, typename TVec> +inline void scale_vector_q16x8(bool exclude_padding, + TVec &v, + const Coordinates &id, + int id_offset, + int step, + const int pool_size, + const int upper_bound_w, + const int upper_bound_h, + const int pad_x, + const int pad_y, + const int stride_x, + const int stride_y) +{ + int start_x = (id.x() + id_offset) * stride_x - pad_x; + int start_y = id.y() * stride_y - pad_y; + const int end_y = std::min(start_y + pool_size, upper_bound_h); + if (exclude_padding) + { + start_y = std::max(0, start_y); + } + + std::array<T, 8> elems = {{ + wrapper::vgetlane(v, 0), + wrapper::vgetlane(v, 1), + wrapper::vgetlane(v, 2), + wrapper::vgetlane(v, 3), + wrapper::vgetlane(v, 4), + wrapper::vgetlane(v, 5), + wrapper::vgetlane(v, 6), + wrapper::vgetlane(v, 7), + }}; + + for (auto &el : elems) + { + int c_start_x = start_x; + const int end_x = std::min(c_start_x + pool_size, upper_bound_w); + if (exclude_padding) + { + c_start_x = std::max(0, c_start_x); + } + float scale = 1.f / ((end_y - start_y) * (end_x - c_start_x)); + el *= scale; + start_x += step * stride_x; + } + + v = wrapper::vsetlane(elems[0], v, 0); + v = wrapper::vsetlane(elems[1], v, 1); + v = wrapper::vsetlane(elems[2], v, 2); + v = wrapper::vsetlane(elems[3], v, 3); + v = wrapper::vsetlane(elems[4], v, 4); + v = wrapper::vsetlane(elems[5], v, 5); + v = 
wrapper::vsetlane(elems[6], v, 6); + v = wrapper::vsetlane(elems[7], v, 7); +} + +template <typename T> +auto load16_boundary_aware( + int srcw, int srch, int pad_l, int pad_r, int pad_t, int pad_b, int x, int y, const T *ptr, T fval) +{ + ARM_COMPUTE_UNUSED(pad_b, pad_r); + T vec[16]; + //handle reading a row out of the tensor + const bool row_in_bounds((y >= pad_t) && (y < (srch + pad_t))); + for (int i = 0; i < 16; i++) + { + if (row_in_bounds && (x + i >= pad_l) && (x + i < (srcw + pad_l))) + { + vec[i] = *(ptr + i); + } + else + { + vec[i] = fval; + } + } + return wrapper::vloadq(vec); +} + +template <typename T, typename V, bool deinterleave> +inline void write16_boundary_aware(int x, int dst_w, const V &lower, const V &upper, T *ptr) +{ + if (deinterleave) + { + for (int i = 0; i < 8 && (i * 2 + x) < dst_w; ++i) + { + *(ptr + i * 2) = lower[i]; + } + for (int i = 0; i < 8 && (i * 2 + x + 1) < dst_w; ++i) + { + *(ptr + 1 + i * 2) = upper[i]; + } + } + else + { + for (int i = 0; i < 8 && (i + x) < dst_w; ++i) + { + *(ptr + i) = lower[i]; + } + for (int i = 0; i < 8 && (i + x + 8) < dst_w; ++i) + { + *(ptr + i + 8) = upper[i]; + } + } +} + +template <typename T, typename V> +inline void write8_boundary_aware(int x, int dst_w, const V &v, T *ptr) +{ + for (int i = 0; i < 8 && (i + x) < dst_w; ++i) + { + *(ptr + i) = v[i]; + } +} + +template <typename T> +void pooling2_quantized_neon_nchw(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) +{ + ARM_COMPUTE_UNUSED(dst1); + Iterator in(src, window_src); + Iterator out(dst0, window); + + /** SIMD vector types */ + using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type; + using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type; + using q16_t = typename wrapper::traits::promote_t<T>; + using q16x4_t = typename wrapper::traits::neon_vector<q16_t, 4>::type; + using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type; + using q16x8x2_t = typename wrapper::traits::neon_vector<q16_t, 16>::type; + + constexpr int pool_size = 2; + int pool_stride_x = 0; + int pool_stride_y = 0; + const int pool_pad_right = pool_info.pad_stride_info.pad_right(); + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); + std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); + const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right); + const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom); + const T *const src_top_ptr = reinterpret_cast<const T *>( + src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)))); + const T *const src_bottom_ptr = reinterpret_cast<const T *>( + src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1))); + const int scale_step_x = (pool_stride_x == 1) ? 
2 : 1; + const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform(); + const UniformQuantizationInfo dst_qinfo = dst0->info()->quantization_info().uniform(); + const bool have_different_qinfo = src_qinfo != dst_qinfo; + + const float requant_scale = dst_qinfo.scale / src_qinfo.scale; + const int32_t requant_offset = + dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale); + const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset); + const int src_w = src->info()->dimension(0); + const int src_h = src->info()->dimension(1); + const int dst_w = dst0->info()->dimension(0); + + const T fill_value = (pool_info.pool_type == PoolingType::MAX) ? std::numeric_limits<T>::min() : T(0); + + execute_window_loop( + window, + [&](const Coordinates &id) + { + const auto x_val = id.x() * pool_stride_x; + const auto y_val_0 = id.y() * pool_stride_y; + const auto y_val_1 = (id.y() * pool_stride_y) + 1; + + auto top_data = + load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom, x_val, + y_val_0, reinterpret_cast<const T *>(src_top_ptr + in.offset()), fill_value); + auto bottom_data = + load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom, x_val, + y_val_1, reinterpret_cast<const T *>(src_bottom_ptr + in.offset()), fill_value); + + q8x8_t lower_res = {}; + q8x8_t upper_res = {}; + + if (pool_info.pool_type != PoolingType::MAX) + { + const q16x8x2_t top_data_q16 = { + {wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data))}}; + const q16x8x2_t bottom_data_q16 = { + {wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data))}}; + + // Add rows + const q16x8x2_t vrsum = {{ + wrapper::vadd(top_data_q16.val[0], bottom_data_q16.val[0]), + wrapper::vadd(top_data_q16.val[1], bottom_data_q16.val[1]), + }}; + + // Pair-wise add row data + const q16x4_t vpsum_1 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[0]), wrapper::vgethigh(vrsum.val[0])); + const q16x4_t vpsum_2 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[1]), wrapper::vgethigh(vrsum.val[1])); + + q16x8_t res_lower = wrapper::vcombine(vpsum_1, vpsum_2); + + // Scale lower result + scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, res_lower, id, 0, scale_step_x, pool_size, + upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, + pool_stride_x, pool_stride_y); + lower_res = wrapper::vmovn(res_lower); + + // Compute upper result for stride_x == 1 + if (pool_stride_x == 1) + { + // Shifted row sum + const q16x8x2_t vrsum_shifted = { + {wrapper::vext_1(vrsum.val[0], vrsum.val[1]), wrapper::vext_1(vrsum.val[1], vrsum.val[1])}}; + + // Pair-wise add shifted row + q16x8_t res_upper = wrapper::vcombine( + wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[0]), wrapper::vgethigh(vrsum_shifted.val[0])), + wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[1]), + wrapper::vgethigh(vrsum_shifted.val[1]))); + + // Scale upper result + scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, res_upper, id, 1, 2, pool_size, + upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, + pool_stride_x, pool_stride_y); + upper_res = wrapper::vmovn(res_upper); + } + } + else + { + const q8x16_t max_data = wrapper::vmax(top_data, bottom_data); + lower_res = wrapper::vpmax(wrapper::vgetlow(max_data), wrapper::vgethigh(max_data)); + if (pool_stride_x == 1) + { + const q8x16_t max_data_shifted = 
wrapper::vext_1(max_data, max_data); + upper_res = wrapper::vpmax(wrapper::vgetlow(max_data_shifted), wrapper::vgethigh(max_data_shifted)); + } + } + + if (have_different_qinfo) + { + const auto requantized_dst = vrequantize_pooling<q8x8_t, q8x16_t>(lower_res, upper_res, requant_qinfo); + lower_res = wrapper::vgetlow(requantized_dst); + upper_res = wrapper::vgethigh(requantized_dst); + } + auto out_ptr = reinterpret_cast<T *>(out.ptr()); + // Store result + if (pool_stride_x == 1) + { + write16_boundary_aware<T, q8x8_t, true>(id.x(), dst_w, lower_res, upper_res, out_ptr); + } + else + { + write8_boundary_aware<T, q8x8_t>(id.x(), dst_w, lower_res, out_ptr); + } + }, + in, out); +} + +template <typename T> +void pooling3_quantized_neon_nchw(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) +{ + ARM_COMPUTE_UNUSED(dst1); + Iterator in(src, window_src); + Iterator out(dst0, window); + + /** SIMD vector types */ + using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type; + using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type; + using q8x8x2_t = typename std::conditional<std::is_same<T, uint8_t>::value, uint8x8x2_t, int8x8x2_t>::type; + using q16_t = typename wrapper::traits::promote_t<T>; + using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type; + using q16x8x2_t = typename wrapper::traits::neon_vector<q16_t, 16>::type; + + constexpr int pool_size = 3; + const int pool_pad_right = pool_info.pad_stride_info.pad_right(); + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); + int pool_stride_x = 0; + int pool_stride_y = 0; + std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); + const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right); + const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom); + + const UniformQuantizationInfo &src_qinfo = src->info()->quantization_info().uniform(); + const UniformQuantizationInfo &dst_qinfo = dst0->info()->quantization_info().uniform(); + + const float requant_scale = dst_qinfo.scale / src_qinfo.scale; + const int32_t requant_offset = + dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale); + const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset); + + const T *const src_top_ptr = reinterpret_cast<const T *>( + src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)))); + const T *const src_middle_ptr = reinterpret_cast<const T *>( + src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1))); + const T *const src_bottom_ptr = reinterpret_cast<const T *>( + src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2))); + + const int src_w = src->info()->dimension(0); + const int src_h = src->info()->dimension(1); + const T fill_value = (pool_info.pool_type == PoolingType::AVG) ? 
T(0) : std::numeric_limits<T>::min(); + const int dst_w = dst0->info()->dimension(0); + + execute_window_loop( + window, + [&](const Coordinates &id) + { + const auto x_val = id.x() * pool_stride_x; + const auto y_val_0 = id.y() * pool_stride_y; + const auto y_val_1 = (id.y() * pool_stride_y) + 1; + const auto y_val_2 = (id.y() * pool_stride_y) + 2; + + auto top_data = + load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom, x_val, + y_val_0, reinterpret_cast<const T *>(src_top_ptr + in.offset()), fill_value); + auto middle_data = + load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom, x_val, + y_val_1, reinterpret_cast<const T *>(src_middle_ptr + in.offset()), fill_value); + auto bottom_data = + load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom, x_val, + y_val_2, reinterpret_cast<const T *>(src_bottom_ptr + in.offset()), fill_value); + + q8x8_t fres = {}; + q8x16_t fqres = {}; + + if (pool_info.pool_type == PoolingType::AVG) + { + // Convert data to u16 + const q16x8x2_t top_data_q16 = { + {wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data))}}; + const q16x8x2_t middle_data_q16 = { + {wrapper::vmovl(wrapper::vgetlow(middle_data)), wrapper::vmovl(wrapper::vgethigh(middle_data))}}; + const q16x8x2_t bottom_data_q16 = { + {wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data))}}; + + // Calculate row sums + const q16x8x2_t vrsum = {{ + wrapper::vadd(wrapper::vadd(top_data_q16.val[0], bottom_data_q16.val[0]), middle_data_q16.val[0]), + wrapper::vadd(wrapper::vadd(top_data_q16.val[1], bottom_data_q16.val[1]), middle_data_q16.val[1]), + }}; + const q16x8x2_t vrsum_shifted_1 = { + {wrapper::vext_1(vrsum.val[0], vrsum.val[1]), wrapper::vext_1(vrsum.val[1], vrsum.val[1])}}; + const q16x8x2_t vrsum_shifted_2 = { + {wrapper::vext_2(vrsum.val[0], vrsum.val[1]), wrapper::vext_2(vrsum.val[1], vrsum.val[1])}}; + // Calculate final sum + q16x8x2_t final_sum = {{ + wrapper::vadd(wrapper::vadd(vrsum.val[0], vrsum_shifted_1.val[0]), vrsum_shifted_2.val[0]), + wrapper::vadd(wrapper::vadd(vrsum.val[1], vrsum_shifted_1.val[1]), vrsum_shifted_2.val[1]), + }}; + if (pool_stride_x == 2) + { + q16x8_t res = { + wrapper::vgetlane(final_sum.val[0], 0), wrapper::vgetlane(final_sum.val[0], 2), + wrapper::vgetlane(final_sum.val[0], 4), wrapper::vgetlane(final_sum.val[0], 6), + wrapper::vgetlane(final_sum.val[1], 0), wrapper::vgetlane(final_sum.val[1], 2), + wrapper::vgetlane(final_sum.val[1], 4), wrapper::vgetlane(final_sum.val[1], 6), + }; + + scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, res, id, 0, 1, pool_size, + upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, + pool_stride_x, pool_stride_y); + fres = wrapper::vmovn(res); + } + else + { + // Scale lower result + scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, final_sum.val[0], id, 0, 1, pool_size, + upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, + pool_stride_x, pool_stride_y); + // Scale lower result + scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, final_sum.val[1], id, 8, 1, pool_size, + upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, + pool_stride_x, pool_stride_y); + fqres = wrapper::vcombine(wrapper::vmovn(final_sum.val[0]), wrapper::vmovn(final_sum.val[1])); + } + } + else + { + const q8x16_t max_data = wrapper::vmax(wrapper::vmax(top_data, bottom_data), middle_data); + const 
q8x16_t max_data_shift1 = wrapper::vext_1(max_data, max_data); + const q8x16_t max_data_shift2 = wrapper::vext_2(max_data, max_data); + const q8x16_t final_max = wrapper::vmax(wrapper::vmax(max_data, max_data_shift1), max_data_shift2); + + if (pool_stride_x == 2) + { + const q8x8x2_t table = {{wrapper::vgetlow(final_max), wrapper::vgethigh(final_max)}}; + static const q8x8_t lookup_val = {0, 2, 4, 6, 8, 10, 12, 14}; + fres = wrapper::vtbl(table, lookup_val); + } + else + { + fqres = final_max; + } + } + + // Store result + if (pool_stride_x == 1) + { + if (src_qinfo != dst_qinfo) + { + fqres = vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(fqres), wrapper::vgethigh(fqres), + requant_qinfo); + } + write16_boundary_aware<T, q8x8_t, false>(id.x(), dst_w, wrapper::vgetlow(fqres), + wrapper::vgethigh(fqres), reinterpret_cast<T *>(out.ptr())); + } + else + { + if (src_qinfo != dst_qinfo) + { + fres = vrequantize_pooling<q8x8_t>(fres, requant_qinfo); + } + write8_boundary_aware<T, q8x8_t>(id.x(), dst_w, fres, reinterpret_cast<T *>(out.ptr())); + } + }, + in, out); +} + +template <typename T> +void poolingMxN_quantized_neon_nchw(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) +{ + ARM_COMPUTE_UNUSED(dst1); + Iterator in(src, window_src); + Iterator out(dst0, window); + + /** SIMD vector types */ + using q16_t = typename wrapper::traits::promote_t<T>; + using q32_t = typename wrapper::traits::promote_t<q16_t>; + + const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width; + const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.height; + const int pool_pad_right = pool_info.pad_stride_info.pad_right(); + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); + int pool_stride_x = 0; + int pool_stride_y = 0; + std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); + const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right); + const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom); + + const UniformQuantizationInfo &src_qinfo = src->info()->quantization_info().uniform(); + const UniformQuantizationInfo &dst_qinfo = dst0->info()->quantization_info().uniform(); + const int src_w = src->info()->dimension(0); + const int src_h = src->info()->dimension(1); + const T fill_value = (pool_info.pool_type == PoolingType::AVG) ? 
T(0) : std::numeric_limits<T>::min(); + const int stridex_in_bytes = static_cast<int>(src->info()->strides_in_bytes().x()); + const int stridey_in_bytes = static_cast<int>(src->info()->strides_in_bytes().y()); + + execute_window_loop( + window, + [&](const Coordinates &id) + { + T res = std::numeric_limits<T>::min(); + + if (pool_info.pool_type != PoolingType::MAX) + { + q32_t sres = 0; + + // Calculate scale + const float scale = calculate_avg_scale_pool2d( + pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, + upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + + // Perform pooling + for (int y = 0; y < pool_size_y; ++y) + { + for (int x = 0; x < pool_size_x; ++x) + { + const auto in_ptr = reinterpret_cast<const T *>( + in.ptr() + (x - pool_pad_left) * stridex_in_bytes + (y - pool_pad_top) * stridey_in_bytes); + + const int idx = x + id.x() * pool_stride_x - pool_pad_left; + const int idy = y + id.y() * pool_stride_y - pool_pad_top; + const T data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *in_ptr; + sres += data; + } + } + // Divide by scale + res = static_cast<T>(support::cpp11::round(sres * scale)); + } + else + { + for (int y = 0; y < pool_size_y; ++y) + { + for (int x = 0; x < pool_size_x; ++x) + { + const auto in_ptr = reinterpret_cast<const T *>( + in.ptr() + (x - pool_pad_left) * stridex_in_bytes + (y - pool_pad_top) * stridey_in_bytes); + + const int idx = x + id.x() * pool_stride_x - pool_pad_left; + const int idy = y + id.y() * pool_stride_y - pool_pad_top; + const T data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *in_ptr; + res = std::max(res, data); + } + } + } + // Store result + res = (src_qinfo != dst_qinfo) ? Qasymm8QuantizationHelper<T>::quantize( + Qasymm8QuantizationHelper<T>::dequantize(res, src_qinfo), dst_qinfo) + : res; + *(reinterpret_cast<T *>(out.ptr())) = res; + }, + in, out); +} +#endif /* defined(ENABLE_NCHW_KERNELS) */ +} // namespace cpu +} // namespace arm_compute + +#endif // SRC_CORE_NEON_KERNELS_QUANTIZED_H diff --git a/src/cpu/kernels/pool3d/list.h b/src/cpu/kernels/pool3d/list.h new file mode 100644 index 0000000000..3426360f93 --- /dev/null +++ b/src/cpu/kernels/pool3d/list.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef SRC_CORE_NEON_KERNELS_POOLING3D_LIST_H +#define SRC_CORE_NEON_KERNELS_POOLING3D_LIST_H + +namespace arm_compute +{ +namespace cpu +{ +#define DECLARE_POOLING_KERNEL(func_name) \ + void func_name(const ITensor *src0, ITensor *dst0, Pooling3dLayerInfo &, const Window &window) + +DECLARE_POOLING_KERNEL(neon_q8_pool3d); +DECLARE_POOLING_KERNEL(neon_q8_signed_pool3d); +DECLARE_POOLING_KERNEL(neon_fp16_pool3d); +DECLARE_POOLING_KERNEL(neon_fp32_pool3d); + +#undef DECLARE_POOLING_KERNEL + +} // namespace cpu +} // namespace arm_compute + +#endif // SRC_CORE_NEON_KERNELS_POOLING3D_LIST_H
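For orientation while reading the rest of the diff: the DECLARE_POOLING_KERNEL helper above only stamps out one forward declaration per data type. Expanded by hand (a sketch of the preprocessor output, assuming the arm_compute ITensor, Pooling3dLayerInfo and Window types are in scope), the first entry becomes:

    void neon_q8_pool3d(const ITensor *src0, ITensor *dst0, Pooling3dLayerInfo &, const Window &window);

The per-type .cpp files added later in this patch define the same signatures, with the Pooling3dLayerInfo parameter named pool_info.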
\ No newline at end of file diff --git a/src/cpu/kernels/pool3d/neon/fp16.cpp b/src/cpu/kernels/pool3d/neon/fp16.cpp new file mode 100644 index 0000000000..0130a96098 --- /dev/null +++ b/src/cpu/kernels/pool3d/neon/fp16.cpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) +#include "src/cpu/CpuTypes.h" +#include "src/cpu/kernels/pool3d/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void neon_fp16_pool3d(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window) +{ + return poolingMxNxD_fp_neon_ndhwc<float16_t>(src, dst0, pool_info, window); +} +} // namespace cpu +} // namespace arm_compute +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/pool3d/neon/fp32.cpp b/src/cpu/kernels/pool3d/neon/fp32.cpp new file mode 100644 index 0000000000..2c06a9d57a --- /dev/null +++ b/src/cpu/kernels/pool3d/neon/fp32.cpp @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/cpu/kernels/pool3d/neon/impl.h" +namespace arm_compute +{ +namespace cpu +{ +void neon_fp32_pool3d(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window) +{ + return poolingMxNxD_fp_neon_ndhwc<float>(src, dst0, pool_info, window); +} +} // namespace cpu +} // namespace arm_compute
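The quantized kernels in this diff (both the 2D NCHW/NHWC paths above and the 3D NDHWC paths below) derive their requantization parameters the same way: the two scales are combined into a single ratio and the source offset is folded into a corrected destination offset. A minimal standalone sketch of that computation, using a hypothetical UniformQInfo stand-in for the library's UniformQuantizationInfo:

    #include <cstdint>

    // Stand-in for arm_compute::UniformQuantizationInfo (an assumption for this sketch,
    // not a type introduced by the patch).
    struct UniformQInfo
    {
        float   scale;
        int32_t offset;
    };

    // Mirrors the requant_scale / requant_offset computation used by the pooling kernels:
    // after this, one multiply by requant_scale plus the offset requantizes a pooled value.
    static UniformQInfo make_requant_qinfo(const UniformQInfo &src, const UniformQInfo &dst)
    {
        const float   requant_scale  = dst.scale / src.scale;
        const int32_t requant_offset =
            dst.offset - static_cast<int32_t>(static_cast<float>(src.offset) / requant_scale);
        return {requant_scale, requant_offset};
    }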
\ No newline at end of file diff --git a/src/cpu/kernels/pool3d/neon/impl.h b/src/cpu/kernels/pool3d/neon/impl.h new file mode 100644 index 0000000000..ce89199b5d --- /dev/null +++ b/src/cpu/kernels/pool3d/neon/impl.h @@ -0,0 +1,484 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CORE_POOLING_3D_LAYER_IMPL_H +#define SRC_CORE_POOLING_3D_LAYER_IMPL_H + +#include "arm_compute/core/Helpers.h" + +#include "src/core/helpers/PoolingHelpers.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" +#include "src/cpu/kernels/pool3d/neon/quantized.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace +{ +template <typename T> +void max_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, + ITensor *dst0, + Pooling3dLayerInfo &pool_info, + const Window &window_out, + const int window_start_x, + const int window_end_x, + const int window_step_x) + +{ + using vtype = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>; + using vector_type = typename vtype::type; + using tag_type = typename vtype::tag_type; + + int pool_stride_x = static_cast<int>(pool_info.stride.width); + int pool_stride_y = static_cast<int>(pool_info.stride.height); + int pool_stride_z = static_cast<int>(pool_info.stride.depth); + + const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width; + const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height; + const int pool_size_z = pool_info.is_global_pooling ? 
src->info()->tensor_shape()[3] : pool_info.pool_size.depth; + + const int pool_pad_top = static_cast<int>(pool_info.padding.top); + const int pool_pad_left = static_cast<int>(pool_info.padding.left); + const int pool_pad_front = static_cast<int>(pool_info.padding.front); + + const int input_dim_w = src->info()->dimension(1); + const int input_dim_h = src->info()->dimension(2); + const int input_dim_d = src->info()->dimension(3); + + const int y_stride = static_cast<int>(src->info()->strides_in_bytes().y()); + const int z_stride = static_cast<int>(src->info()->strides_in_bytes().z()); + const int w_stride = static_cast<int>(src->info()->strides_in_bytes()[3]); + const int n_stride = static_cast<int>(src->info()->strides_in_bytes()[4]); + + const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); + + Iterator out(dst0, window_out); + + vector_type vres; + execute_window_loop( + window_out, + [&](const Coordinates &id) + { + // Computing the theoretical input starting/ending points + const int in_idx_width = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left; + const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top; + const int in_idx_depth = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front; + + const int pool_start_x = std::max(0, -in_idx_width); + const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x); + const int pool_start_y = std::max(0, -in_idx_height); + const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y); + + const int pool_start_z = std::max(0, -in_idx_depth); + const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z); + + // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z + const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width); + const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height); + const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth); + + const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride; + + int x_off = window_start_x; + + for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C + { + vres = wrapper::vdup_n(static_cast<T>(-std::numeric_limits<float>::infinity()), tag_type()); + for (int z = pool_start_z; z < pool_end_z; ++z) + { + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) + { + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const vector_type data = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off); + vres = wrapper::vmax(vres, data); + } + } + } + // Store result + wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, vres); + } + + // Left-overs loop + for (; x_off < window_end_x; ++x_off) + { + T res(0); + res = -std::numeric_limits<float>::infinity(); + for (int z = pool_start_z; z < pool_end_z; ++z) + { + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) + { + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const T data = *(reinterpret_cast<const T *>(in_ptr_x) + x_off); + res = std::max(res, data); + } + } + } + // Store 
result + *(reinterpret_cast<T *>(out.ptr()) + x_off) = res; + } + }, + out); +} + +template <typename T> +void avg_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, + ITensor *dst0, + Pooling3dLayerInfo &pool_info, + const Window &window_out, + const int window_start_x, + const int window_end_x, + const int window_step_x) +{ + using vtype = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>; + using vector_type = typename vtype::type; + using tag_type = typename vtype::tag_type; + + int pool_stride_x = static_cast<int>(pool_info.stride.width); + int pool_stride_y = static_cast<int>(pool_info.stride.height); + int pool_stride_z = static_cast<int>(pool_info.stride.depth); + + const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width; + const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height; + const int pool_size_z = pool_info.is_global_pooling ? src->info()->tensor_shape()[3] : pool_info.pool_size.depth; + + const int pool_pad_top = static_cast<int>(pool_info.padding.top); + const int pool_pad_bottom = static_cast<int>(pool_info.padding.bottom); + const int pool_pad_left = static_cast<int>(pool_info.padding.left); + const int pool_pad_right = static_cast<int>(pool_info.padding.right); + const int pool_pad_front = static_cast<int>(pool_info.padding.front); + const int pool_pad_back = static_cast<int>(pool_info.padding.back); + + const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right); + const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 0 : pool_pad_bottom); + const int upper_bound_d = src->info()->dimension(3) + (pool_info.exclude_padding ? 0 : pool_pad_back); + + const int input_dim_w = src->info()->dimension(1); + const int input_dim_h = src->info()->dimension(2); + const int input_dim_d = src->info()->dimension(3); + + const int y_stride = static_cast<int>(src->info()->strides_in_bytes().y()); + const int z_stride = static_cast<int>(src->info()->strides_in_bytes().z()); + const int w_stride = static_cast<int>(src->info()->strides_in_bytes()[3]); + const int n_stride = static_cast<int>(src->info()->strides_in_bytes()[4]); + + const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); + + Iterator out(dst0, window_out); + + vector_type vres; + execute_window_loop( + window_out, + [&](const Coordinates &id) + { + // Computing the theoretical input starting/ending points + const int in_idx_width = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left; + const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top; + const int in_idx_depth = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front; + + const int pool_start_x = std::max(0, -in_idx_width); + const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x); + const int pool_start_y = std::max(0, -in_idx_height); + const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y); + + const int pool_start_z = std::max(0, -in_idx_depth); + const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z); + + // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z + const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width); + const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height); + const int pool_end_z = std::min(pool_end_z_t, 
input_dim_d - in_idx_depth); + + const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride; + + // Calculate scale + const float scale = + calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z, + upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left, pool_pad_top, + pool_pad_front, pool_stride_x, pool_stride_y, pool_stride_z); + const vector_type scale_v = wrapper::vdup_n(static_cast<T>(scale), tag_type()); + + int x_off = window_start_x; + + for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C + { + // Perform pooling + vres = wrapper::vdup_n(static_cast<T>(0.0f), tag_type()); + for (int z = pool_start_z; z < pool_end_z; ++z) + { + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) + { + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const vector_type data = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off); + vres = wrapper::vadd(vres, data); + } + } + } + + // Divide by scale + vres = wrapper::vmul(vres, scale_v); + + // Store result + wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, vres); + } + + // Left-overs loop + for (; x_off < window_end_x; ++x_off) + { + T res(0); + + for (int z = pool_start_z; z < pool_end_z; ++z) + { + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) + { + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const T data = *(reinterpret_cast<const T *>(in_ptr_x) + x_off); + res += data; + } + } + } + + // Divide by scale + res *= scale; + + // Store result + *(reinterpret_cast<T *>(out.ptr()) + x_off) = res; + } + }, + out); +} + +template <typename T> +void l2_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, + ITensor *dst0, + Pooling3dLayerInfo &pool_info, + const Window &window_out, + const int window_start_x, + const int window_end_x, + const int window_step_x) +{ + using vtype = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>; + using vector_type = typename vtype::type; + using tag_type = typename vtype::tag_type; + + int pool_stride_x = static_cast<int>(pool_info.stride.width); + int pool_stride_y = static_cast<int>(pool_info.stride.height); + int pool_stride_z = static_cast<int>(pool_info.stride.depth); + + const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width; + const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height; + const int pool_size_z = pool_info.is_global_pooling ? src->info()->tensor_shape()[3] : pool_info.pool_size.depth; + + const int pool_pad_top = static_cast<int>(pool_info.padding.top); + const int pool_pad_bottom = static_cast<int>(pool_info.padding.bottom); + const int pool_pad_left = static_cast<int>(pool_info.padding.left); + const int pool_pad_right = static_cast<int>(pool_info.padding.right); + const int pool_pad_front = static_cast<int>(pool_info.padding.front); + const int pool_pad_back = static_cast<int>(pool_info.padding.back); + + const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 
0 : pool_pad_right); + const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 0 : pool_pad_bottom); + const int upper_bound_d = src->info()->dimension(3) + (pool_info.exclude_padding ? 0 : pool_pad_back); + + const int input_dim_w = src->info()->dimension(1); + const int input_dim_h = src->info()->dimension(2); + const int input_dim_d = src->info()->dimension(3); + + const int y_stride = static_cast<int>(src->info()->strides_in_bytes().y()); + const int z_stride = static_cast<int>(src->info()->strides_in_bytes().z()); + const int w_stride = static_cast<int>(src->info()->strides_in_bytes()[3]); + const int n_stride = static_cast<int>(src->info()->strides_in_bytes()[4]); + + const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); + + Iterator out(dst0, window_out); + + vector_type vres; + execute_window_loop( + window_out, + [&](const Coordinates &id) + { + // Computing the theoretical input starting/ending points + const int in_idx_width = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left; + const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top; + const int in_idx_depth = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front; + + const int pool_start_x = std::max(0, -in_idx_width); + const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x); + const int pool_start_y = std::max(0, -in_idx_height); + const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y); + + const int pool_start_z = std::max(0, -in_idx_depth); + const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z); + + // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z + const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width); + const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height); + const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth); + + const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride; + + // Calculate scale + const float scale = + calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z, + upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left, pool_pad_top, + pool_pad_front, pool_stride_x, pool_stride_y, pool_stride_z); + + int x_off = window_start_x; + + for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C + { + // Perform pooling + vres = wrapper::vdup_n(static_cast<T>(0.0f), tag_type()); + for (int z = pool_start_z; z < pool_end_z; ++z) + { + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) + { + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const vector_type data = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off); + vres = wrapper::vmla(vres, data, data); + } + } + } + + const vector_type scale_v = wrapper::vdup_n(static_cast<T>(scale), tag_type()); + + // Divide by scale + vres = wrapper::vmul(vres, scale_v); + + // Calculate square-root + vres = wrapper::vinv(wrapper::vinvsqrt(vres)); + + // Store result + wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, vres); + } + + // Left-overs loop + for (; x_off < window_end_x; ++x_off) + { + T res(0); + + for (int z = pool_start_z; z < pool_end_z; ++z) + { + const uint8_t 
*in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) + { + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const T data = *(reinterpret_cast<const T *>(in_ptr_x) + x_off); + res += data * data; + } + } + } + + // Divide by scale + res *= scale; + + // Square root + res = std::sqrt(res); + + // Store result + *(reinterpret_cast<T *>(out.ptr()) + x_off) = res; + } + }, + out); +} +} // namespace + +template <typename T> +void poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window) +{ + const int window_start_x = window.x().start(); + const int window_end_x = window.x().end(); + constexpr int window_step_x = 16 / sizeof(T); + Window window_out = window; + + // Needed to handle loop left-over + window_out.set(Window::DimX, Window::Dimension(0, 1, 1)); + + switch (pool_info.pool_type) + { + case PoolingType::MAX: + max_poolingMxNxD_fp_neon_ndhwc<T>(src, dst0, pool_info, window_out, window_start_x, window_end_x, + window_step_x); + break; + case PoolingType::AVG: + avg_poolingMxNxD_fp_neon_ndhwc<T>(src, dst0, pool_info, window_out, window_start_x, window_end_x, + window_step_x); + break; + case PoolingType::L2: + l2_poolingMxNxD_fp_neon_ndhwc<T>(src, dst0, pool_info, window_out, window_start_x, window_end_x, + window_step_x); + break; + default: + ARM_COMPUTE_ERROR("Pool operation not supported"); + } +} + +template <typename T> +void poolingMxNxD_q8_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window) +{ + constexpr int window_step_x = 16; + Window window_out = window; + + // Needed to handle loop left-over + window_out.set(Window::DimX, Window::Dimension(0, 1, 1)); + + switch (pool_info.pool_type) + { + case PoolingType::MAX: + max_poolingMxNxD_q8_neon_ndhwc<T>(src, dst0, pool_info, window_out, window_step_x); + break; + case PoolingType::AVG: + avg_poolingMxNxD_q8_neon_ndhwc<T>(src, dst0, pool_info, window_out, window_step_x); + break; + default: + ARM_COMPUTE_ERROR("Pool operation not supported"); + } +} +} // namespace cpu +} // namespace arm_compute +#endif //define SRC_CORE_POOLING_3D_LAYER_IMPL_H diff --git a/src/cpu/kernels/pool3d/neon/qasymm8.cpp b/src/cpu/kernels/pool3d/neon/qasymm8.cpp new file mode 100644 index 0000000000..650a815e76 --- /dev/null +++ b/src/cpu/kernels/pool3d/neon/qasymm8.cpp @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/pool3d/neon/impl.h" +namespace arm_compute +{ +namespace cpu +{ +void neon_q8_pool3d(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window) +{ + return poolingMxNxD_q8_neon_ndhwc<uint8_t>(src, dst0, pool_info, window); +} +} // namespace cpu +} // namespace arm_compute
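When source and destination share the same quantization info, the average-pooling paths in this diff skip requantization and scale the integer accumulator directly, adding 0.5f before the truncating cast so the result rounds to nearest rather than towards zero. A scalar illustration of that trick (plain C++, not the NEON path; the uint8_t output type matches the unsigned QASYMM8 case):

    #include <cstdint>

    // accumulated_sum is the sum over the pooling window, scale is 1 / (number of elements).
    static uint8_t average_round_to_nearest(int32_t accumulated_sum, float scale)
    {
        // A plain truncating cast of (sum * scale) would round towards zero; adding 0.5f
        // first turns the truncation into round-to-nearest for non-negative sums.
        return static_cast<uint8_t>(0.5f + static_cast<float>(accumulated_sum) * scale);
    }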
\ No newline at end of file diff --git a/src/cpu/kernels/pool3d/neon/qasymm8_signed.cpp b/src/cpu/kernels/pool3d/neon/qasymm8_signed.cpp new file mode 100644 index 0000000000..374b2435ea --- /dev/null +++ b/src/cpu/kernels/pool3d/neon/qasymm8_signed.cpp @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/pool3d/neon/impl.h" +namespace arm_compute +{ +namespace cpu +{ +void neon_q8_signed_pool3d(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window) +{ + return poolingMxNxD_q8_neon_ndhwc<int8_t>(src, dst0, pool_info, window); +} +} // namespace cpu +} // namespace arm_compute
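The NDHWC 3D kernels in impl.h and quantized.h all begin each output point by clamping the pooling window against the real input extent, so padded taps are never read from memory. A hedged scalar sketch of that clamping for a single axis (names are illustrative; the kernels repeat the same computation for width, height and depth):

    #include <algorithm>

    struct PoolRange
    {
        int start; // first in-bounds tap of the pooling window
        int end;   // one past the last in-bounds tap
    };

    static PoolRange clamp_pool_window(int out_idx, int stride, int pad_before, int input_dim, int pool_size)
    {
        const int in_idx = out_idx * stride - pad_before;                        // theoretical window start in the padded input
        const int start  = std::max(0, -in_idx);                                 // skip taps that land in the leading padding
        const int end_t  = std::min(input_dim + pad_before - in_idx, pool_size);
        const int end    = std::min(end_t, input_dim - in_idx);                  // also drop taps past the real input
        return {start, end};
    }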
\ No newline at end of file diff --git a/src/cpu/kernels/pool3d/neon/quantized.h b/src/cpu/kernels/pool3d/neon/quantized.h new file mode 100644 index 0000000000..8819907901 --- /dev/null +++ b/src/cpu/kernels/pool3d/neon/quantized.h @@ -0,0 +1,399 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CORE_NEON_KERNELS_POOL3D_QUANTIZED_H +#define SRC_CORE_NEON_KERNELS_POOL3D_QUANTIZED_H + +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" + +#include "src/core/helpers/PoolingHelpers.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/wrapper.h" + +namespace arm_compute +{ +namespace cpu +{ +template <typename T> +void avg_poolingMxNxD_q8_neon_ndhwc( + const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window_out, const int window_step_x) + +{ + using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type; + using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type; + using q16_t = typename wrapper::traits::promote_t<T>; + using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type; + using q32_t = typename wrapper::traits::promote_t<q16_t>; + using q32x4_t = typename wrapper::traits::neon_vector<q32_t, 4>::type; + + int pool_stride_x = static_cast<int>(pool_info.stride.width); + int pool_stride_y = static_cast<int>(pool_info.stride.height); + int pool_stride_z = static_cast<int>(pool_info.stride.depth); + + const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width; + const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height; + const int pool_size_z = pool_info.is_global_pooling ? src->info()->tensor_shape()[3] : pool_info.pool_size.depth; + + const int pool_pad_top = static_cast<int>(pool_info.padding.top); + const int pool_pad_bottom = static_cast<int>(pool_info.padding.bottom); + const int pool_pad_left = static_cast<int>(pool_info.padding.left); + const int pool_pad_right = static_cast<int>(pool_info.padding.right); + const int pool_pad_front = static_cast<int>(pool_info.padding.front); + const int pool_pad_back = static_cast<int>(pool_info.padding.back); + + const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right); + const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 
0 : pool_pad_bottom); + const int upper_bound_d = src->info()->dimension(3) + (pool_info.exclude_padding ? 0 : pool_pad_back); + + const int input_dim_c = src->info()->dimension(0); + const int input_dim_w = src->info()->dimension(1); + const int input_dim_h = src->info()->dimension(2); + const int input_dim_d = src->info()->dimension(3); + + const int y_stride = static_cast<int>(src->info()->strides_in_bytes().y()); + const int z_stride = static_cast<int>(src->info()->strides_in_bytes().z()); + const int w_stride = static_cast<int>(src->info()->strides_in_bytes()[3]); + const int n_stride = static_cast<int>(src->info()->strides_in_bytes()[4]); + + const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); + + const int window_end_x = input_dim_c; + const int window_start_x = 0; + + Iterator out(dst0, window_out); + + const float32x4_t half_scale_v = vdupq_n_f32(0.5f); + const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform(); + const UniformQuantizationInfo dst_qinfo = dst0->info()->quantization_info().uniform(); + + const float quant_rescale = dst_qinfo.scale / src_qinfo.scale; + // "new_offset" doesn't have to consider the "half_scale_v" in its computation + // With a requantization performed in a single step there won't be uncertainties introduced + const int32_t new_offset = + dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / quant_rescale); + + execute_window_loop( + window_out, + [&](const Coordinates &id) + { + // Computing the theoretical input starting/ending points + const int in_idx_width = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left; + const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top; + const int in_idx_depth = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front; + + const int pool_start_x = std::max(0, -in_idx_width); + const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x); + const int pool_start_y = std::max(0, -in_idx_height); + const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y); + + const int pool_start_z = std::max(0, -in_idx_depth); + const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z); + + // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z + const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width); + const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height); + const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth); + + // Calculate scale + const float scale = + calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z, + upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left, pool_pad_top, + pool_pad_front, pool_stride_x, pool_stride_y, pool_stride_z); + + const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride; + + int x_off = window_start_x; + + for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C + { + q32x4_t vres1 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{}); + q32x4_t vres2 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{}); + q32x4_t vres3 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{}); + q32x4_t vres4 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{}); + + // Perform pooling + for (int z = pool_start_z; z < pool_end_z; ++z) + { + const 
uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+                    for (int y = pool_start_y; y < pool_end_y; ++y)
+                    {
+                        const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+                        for (int x = pool_start_x; x < pool_end_x; ++x)
+                        {
+                            const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+                            const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+
+                            const q16x8_t data_q16 = wrapper::vmovl(wrapper::vgetlow(data));
+                            const q16x8_t data2_q16 = wrapper::vmovl(wrapper::vgethigh(data));
+                            vres1 = wrapper::vadd(vres1, wrapper::vmovl(wrapper::vgetlow(data_q16)));
+                            vres2 = wrapper::vadd(vres2, wrapper::vmovl(wrapper::vgethigh(data_q16)));
+                            vres3 = wrapper::vadd(vres3, wrapper::vmovl(wrapper::vgetlow(data2_q16)));
+                            vres4 = wrapper::vadd(vres4, wrapper::vmovl(wrapper::vgethigh(data2_q16)));
+                        }
+                    }
+                }
+
+                if (src_qinfo != dst_qinfo)
+                {
+                    const float32x4x4_t vres = {{
+                        vcvtq_f32_q32(vres1),
+                        vcvtq_f32_q32(vres2),
+                        vcvtq_f32_q32(vres3),
+                        vcvtq_f32_q32(vres4),
+                    }};
+                    const auto requantized_dst =
+                        vrequantize_pooling_with_scale<q8x16_t>(vres, quant_rescale, scale, new_offset);
+                    // Store result
+                    wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, wrapper::vgetlow(requantized_dst));
+                    wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, wrapper::vgethigh(requantized_dst));
+                }
+                else
+                {
+                    const float32x4_t scale_v = vdupq_n_f32(scale);
+                    // Average by multiplying the sum by scale (1 / pooling region size) and add 0.5f to round to nearest instead of rounding towards zero
+                    vres1 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v));
+                    vres2 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v));
+                    vres3 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v));
+                    vres4 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v));
+
+                    const q8x8_t res1 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2)));
+                    const q8x8_t res2 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4)));
+                    // Store result
+                    wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, res1);
+                    wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, res2);
+                }
+            }
+
+            // Left-overs loop
+            for (; x_off < window_end_x; ++x_off)
+            {
+                q32_t res = static_cast<q32_t>(0.f);
+
+                // Perform pooling
+                for (int z = pool_start_z; z < pool_end_z; ++z)
+                {
+                    const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+                    for (int y = pool_start_y; y < pool_end_y; ++y)
+                    {
+                        const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+                        for (int x = pool_start_x; x < pool_end_x; ++x)
+                        {
+                            const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+                            const T data = *(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+                            res += data;
+                        }
+                    }
+                }
+
+                if (src_qinfo != dst_qinfo)
+                {
+                    const float res_f = static_cast<float>(res);
+                    const float new_scale = quant_rescale / scale;
+                    const auto requantized_dst = quantize<T>(res_f, UniformQuantizationInfo(new_scale, new_offset));
+
+                    // Store result
+                    *(reinterpret_cast<T *>(out.ptr()) + x_off) = requantized_dst;
+                }
+                else
+                {
+                    // Average by multiplying the sum by scale (1 / pooling region size) and add 0.5f to round to nearest instead of rounding towards zero
+                    res = static_cast<T>(0.5f + static_cast<float>(res) * scale);
+
+                    // Store result
+                    *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
+                }
+            }
+        },
+        out);
+}
+
+template <typename T>
+void max_poolingMxNxD_q8_neon_ndhwc(
+    const ITensor *src, ITensor *dst0, Pooling3dLayerInfo
&pool_info, const Window &window_out, const int window_step_x) + +{ + using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type; + using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type; + + const int window_half_step_x = window_step_x / 2; + + int pool_stride_x = static_cast<int>(pool_info.stride.width); + int pool_stride_y = static_cast<int>(pool_info.stride.height); + int pool_stride_z = static_cast<int>(pool_info.stride.depth); + + const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width; + const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height; + const int pool_size_z = pool_info.is_global_pooling ? src->info()->tensor_shape()[3] : pool_info.pool_size.depth; + + const int pool_pad_top = static_cast<int>(pool_info.padding.top); + const int pool_pad_left = static_cast<int>(pool_info.padding.left); + const int pool_pad_front = static_cast<int>(pool_info.padding.front); + + const int input_dim_c = src->info()->dimension(0); + const int input_dim_w = src->info()->dimension(1); + const int input_dim_h = src->info()->dimension(2); + const int input_dim_d = src->info()->dimension(3); + + const int y_stride = static_cast<int>(src->info()->strides_in_bytes().y()); + const int z_stride = static_cast<int>(src->info()->strides_in_bytes().z()); + const int w_stride = static_cast<int>(src->info()->strides_in_bytes()[3]); + const int n_stride = static_cast<int>(src->info()->strides_in_bytes()[4]); + + const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); + + const int window_end_x = input_dim_c; + const int window_start_x = 0; + + Iterator out(dst0, window_out); + + const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform(); + const UniformQuantizationInfo dst_qinfo = dst0->info()->quantization_info().uniform(); + + const float requant_scale = dst_qinfo.scale / src_qinfo.scale; + const int32_t requant_offset = + dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale); + const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset); + + execute_window_loop( + window_out, + [&](const Coordinates &id) + { + // Computing the theoretical input starting/ending points + const int in_idx_width = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left; + const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top; + const int in_idx_depth = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front; + + const int pool_start_x = std::max(0, -in_idx_width); + const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x); + const int pool_start_y = std::max(0, -in_idx_height); + const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y); + + const int pool_start_z = std::max(0, -in_idx_depth); + const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z); + + // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z + const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width); + const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height); + const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth); + + const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride; + + int x_off = window_start_x; + + for (; x_off <= (window_end_x - window_step_x); x_off += 
window_step_x) // C + { + q8x16_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_128_tag{}); + + // Perform pooling + for (int z = pool_start_z; z < pool_end_z; ++z) + { + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) + { + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off); + + vres = wrapper::vmax(vres, data); + } + } + } + + // Store result + wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, + (src_qinfo != dst_qinfo) + ? vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(vres), + wrapper::vgethigh(vres), requant_qinfo) + : vres); + } + + // Leftovers using half the window step + for (; x_off <= (window_end_x - window_half_step_x); x_off += window_half_step_x) + { + q8x8_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_64_tag{}); + + // Perform pooling + for (int z = pool_start_z; z < pool_end_z; ++z) + { + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) + { + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(in_ptr_x) + x_off); + + vres = wrapper::vmax(vres, data); + } + } + } + + // Store result + wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, + (src_qinfo != dst_qinfo) ? vrequantize_pooling<q8x8_t>(vres, requant_qinfo) : vres); + } + + // Left-overs loop + for (; x_off < window_end_x; ++x_off) + { + T res = std::numeric_limits<T>::min(); + + for (int z = pool_start_z; z < pool_end_z; ++z) + { + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) + { + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const T data = *(reinterpret_cast<const T *>(in_ptr_x) + x_off); + + res = std::max(res, data); + } + } + } + + // Store result + if (src_qinfo != dst_qinfo) + { + const float res_f = static_cast<float>(res); + *(reinterpret_cast<T *>(out.ptr()) + x_off) = quantize<T>(res_f, requant_qinfo); + } + else + { + *(reinterpret_cast<T *>(out.ptr()) + x_off) = res; + } + } + }, + out); +} + +} // namespace cpu +} // namespace arm_compute + +#endif // SRC_CORE_NEON_KERNELS_POOL3D_QUANTIZED_H diff --git a/src/cpu/kernels/quantize/generic/neon/fp16.cpp b/src/cpu/kernels/quantize/generic/neon/fp16.cpp new file mode 100644 index 0000000000..37bfb5b2aa --- /dev/null +++ b/src/cpu/kernels/quantize/generic/neon/fp16.cpp @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2024 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) +#include "src/cpu/kernels/quantize/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void fp16_u8_run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window) +{ + run_quantize_qasymm8<float16_t, uint8_t>(src, dst, window); +} +void fp16_i8_run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window) +{ + run_quantize_qasymm8<float16_t, int8_t>(src, dst, window); +} +void fp16_run_quantize_qasymm16(const ITensor *src, ITensor *dst, const Window &window) +{ + run_quantize_qasymm16<float16_t>(src, dst, window); +} +} // namespace cpu +} // namespace arm_compute +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/quantize/generic/neon/fp32.cpp b/src/cpu/kernels/quantize/generic/neon/fp32.cpp new file mode 100644 index 0000000000..0cba332fd6 --- /dev/null +++ b/src/cpu/kernels/quantize/generic/neon/fp32.cpp @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/cpu/kernels/quantize/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void fp32_u8_run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window) +{ + run_quantize_qasymm8<float, uint8_t>(src, dst, window); +} +void fp32_i8_run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window) +{ + run_quantize_qasymm8<float, int8_t>(src, dst, window); +} +void fp32_run_quantize_qasymm16(const ITensor *src, ITensor *dst, const Window &window) +{ + run_quantize_qasymm16<float>(src, dst, window); +} + +void fp32_i8_run_quantize_qsymm8(const ITensor *src, ITensor *dst, const Window &window) +{ + run_quantize_qsymm8<float, int8_t>(src, dst, window); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/quantize/generic/neon/impl.h b/src/cpu/kernels/quantize/generic/neon/impl.h new file mode 100644 index 0000000000..9954a7645e --- /dev/null +++ b/src/cpu/kernels/quantize/generic/neon/impl.h @@ -0,0 +1,330 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ACL_SRC_CPU_KERNELS_QUANTIZE_GENERIC_NEON_IMPL_H +#define ACL_SRC_CPU_KERNELS_QUANTIZE_GENERIC_NEON_IMPL_H + +#include "arm_compute/core/Helpers.h" + +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" + +namespace arm_compute +{ +namespace cpu +{ +constexpr auto window_step = 16; + +template <typename T> +inline float32x4x4_t load_value(const T *input_ptr) +{ + using Tx16_t = typename wrapper::traits::neon_vector<T, 16>::type; + return arm_compute::convert_to_float32x4x4<Tx16_t>(wrapper::vloadq(input_ptr)); +} + +template <> +inline float32x4x4_t load_value(const float *input_ptr) +{ + return {wrapper::vloadq(input_ptr), wrapper::vloadq(input_ptr + 4), wrapper::vloadq(input_ptr + 8), + wrapper::vloadq(input_ptr + 12)}; +} +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +template <> +inline float32x4x4_t load_value(const float16_t *input_ptr) +{ + return {vcvt_f32_f16(wrapper::vload(input_ptr)), vcvt_f32_f16(wrapper::vload(input_ptr + 4)), + vcvt_f32_f16(wrapper::vload(input_ptr + 8)), vcvt_f32_f16(wrapper::vload(input_ptr + 12))}; +} + +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +template <typename element_type> +using vector_type = wrapper::traits::neon_vector_t<element_type, window_step>; + +template <typename quantized_type> +inline vector_type<quantized_type> vquantize_qasymm8(const float32x4x4_t &qv, const UniformQuantizationInfo &qi); + +template <> +inline vector_type<uint8_t> vquantize_qasymm8<uint8_t>(const float32x4x4_t &qv, const UniformQuantizationInfo &qi) +{ + return vquantize(qv, qi); +} + +template <> +inline vector_type<int8_t> vquantize_qasymm8<int8_t>(const float32x4x4_t &qv, const UniformQuantizationInfo &qi) +{ + return vquantize_signed(qv, qi); +} + +template <typename TOut, typename = typename std::enable_if<std::is_signed<TOut>::value, bool>::type> +inline int8x16_t recombine_8_16(int16x8_t lower, int16x8_t upper) +{ + return wrapper::vcombine(wrapper::vqmovn(lower), wrapper::vqmovn(upper)); +} + +template <typename TOut, typename = typename std::enable_if<std::is_unsigned<TOut>::value, bool>::type> +inline uint8x16_t recombine_8_16(int16x8_t lower, int16x8_t upper) +{ + return wrapper::vcombine(wrapper::vqmovun(lower), wrapper::vqmovun(upper)); +} + +template <typename TIn, typename TOut> +void run_quantize_qsymm8(const ITensor *src, ITensor *dst, const Window &window) +{ + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform(); + UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform(); + uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo); + + // Collapse window and reset first dimension to handle tail calculations manually + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(src, win_collapsed); + Iterator output(dst, win_collapsed); + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + auto input_ptr = reinterpret_cast<const TIn *>(input.ptr()); + auto output_ptr = reinterpret_cast<TOut *>(output.ptr()); + int x = window_start_x; + for (; x <= (window_end_x - window_step); x += window_step) + { + wrapper::vstore(&output_ptr[x], vquantize_qasymm8<TOut>(load_value(&input_ptr[x]), uqinfo)); + } + // Compute left-over elements + for (; x < 
window_end_x; ++x) + { + output_ptr[x] = quantize_qsymm8(input_ptr[x], dst->info()->quantization_info()); + } + }, + input, output); +} + +template <typename TIn, typename TOut> +void run_requantize_offset_only_convert(const ITensor *src, ITensor *dst, const Window &window) +{ + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + // Calculate output offset difference. + const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform(); + UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform(); + uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo); + + // Collapse window and reset first dimension to handle tail calculations manually + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + // Duplicate offset in signed vector format + const int8x16_t offset = wrapper::vdup_n(static_cast<int8_t>(uqinfo.offset), wrapper::traits::vector_128_tag{}); + + Iterator input(src, win_collapsed); + Iterator output(dst, win_collapsed); + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + auto input_ptr = reinterpret_cast<const TIn *>(input.ptr()); + auto output_ptr = reinterpret_cast<TOut *>(output.ptr()); + int x = window_start_x; + for (; x <= (window_end_x - window_step); x += window_step) + { + const wrapper::traits::neon_vector_t<TIn, window_step> qv = + wrapper::vloadq(input_ptr + x); // load 128 bit vector of 8 bit datatype + + // Signed addition. + auto res = vaddq_s8(reinterpret_cast<int8x16_t>(qv), offset); + + // Output is dependent on datatype. + wrapper::vstore(&output_ptr[x], + reinterpret_cast<wrapper::traits::neon_vector_t<TOut, window_step>>(res)); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + auto result = uqinfo.offset + static_cast<int32_t>(input_ptr[x]); + output_ptr[x] = static_cast<TOut>(result); + } + }, + input, output); +} + +template <typename TIn, typename TOut> +void run_requantize_offset_only(const ITensor *src, ITensor *dst, const Window &window) +{ + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform(); + UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform(); + uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo); + + // Collapse window and reset first dimension to handle tail calculations manually + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + // Duplicate offset in signed vector format + const int16x8_t offset = wrapper::vdup_n(static_cast<int16_t>(uqinfo.offset), wrapper::traits::vector_128_tag{}); + + const int32_t low_bound = (dst->info()->data_type() == DataType::QASYMM8) ? 0 : -128; + const int32_t upper_bound = (dst->info()->data_type() == DataType::QASYMM8) ? 
255 : 127; + + Iterator input(src, win_collapsed); + Iterator output(dst, win_collapsed); + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + auto input_ptr = reinterpret_cast<const TIn *>(input.ptr()); + TOut *output_ptr = reinterpret_cast<TOut *>(output.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step); x += window_step) + { + const auto qv = wrapper::vloadq(input_ptr + x); // load 128 bit vector of 8 bit datatype + int16x8_t lower = reinterpret_cast<int16x8_t>(wrapper::vmovl(wrapper::vgetlow(qv))); + int16x8_t upper = reinterpret_cast<int16x8_t>(wrapper::vmovl(wrapper::vgethigh(qv))); + + // Signed addition. + lower = wrapper::vqadd(lower, offset); + upper = wrapper::vqadd(upper, offset); + + // Output is dependent on datatype. + auto res = recombine_8_16<TOut>(lower, upper); + wrapper::vstore(&output_ptr[x], res); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + // Add offset and clamp result to within the range of the output datatype. + int32_t result = uqinfo.offset + static_cast<int32_t>(input_ptr[x]); + result = utility::clamp<int32_t>(result, low_bound, upper_bound); + + // Cast result to output datatype. + output_ptr[x] = static_cast<TOut>(result); + } + }, + input, output); +} + +template <typename TIn, typename TOut> +void run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window) +{ + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform(); + UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform(); + if (is_data_type_quantized_asymmetric(src->info()->data_type())) + { + uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo); + } +#ifdef __aarch64__ + constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN; +#else //__aarch64__ + constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO; +#endif //__aarch64__ + + // Collapse window and reset first dimension to handle tail calculations manually + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(src, win_collapsed); + Iterator output(dst, win_collapsed); + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + auto input_ptr = reinterpret_cast<const TIn *>(input.ptr()); + auto output_ptr = reinterpret_cast<TOut *>(output.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step); x += window_step) + { + wrapper::vstore(&output_ptr[x], vquantize_qasymm8<TOut>(load_value(&input_ptr[x]), uqinfo)); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + output_ptr[x] = Qasymm8QuantizationHelper<TOut>::quantize(input_ptr[x], uqinfo, rounding_policy); + } + }, + input, output); +} + +template <typename T> +void run_quantize_qasymm16(const ITensor *src, ITensor *dst, const Window &window) +{ + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform(); + UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform(); + if (is_data_type_quantized_asymmetric(src->info()->data_type())) + { + uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo); + } +#ifdef __aarch64__ + constexpr RoundingPolicy 
rounding_policy = RoundingPolicy::TO_NEAREST_EVEN; +#else //__aarch64__ + constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO; +#endif //__aarch64__ + + // Collapse window and reset first dimension to handle tail calculations manually + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(src, win_collapsed); + Iterator output(dst, win_collapsed); + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + auto input_ptr = reinterpret_cast<const T *>(input.ptr()); + auto output_ptr = reinterpret_cast<uint16_t *>(output.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step); x += window_step) + { + uint16x8x2_t tmp = vquantize_qasymm16(load_value(&input_ptr[x]), uqinfo); + vst1q_u16(&output_ptr[x], tmp.val[0]); + vst1q_u16(&output_ptr[x + 8], tmp.val[1]); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + output_ptr[x] = quantize_qasymm16(input_ptr[x], uqinfo, rounding_policy); + } + }, + input, output); +} +} // namespace cpu +} // namespace arm_compute + +#endif // ACL_SRC_CPU_KERNELS_QUANTIZE_GENERIC_NEON_IMPL_H diff --git a/src/cpu/kernels/quantize/generic/neon/integer.cpp b/src/cpu/kernels/quantize/generic/neon/integer.cpp new file mode 100644 index 0000000000..4e39afaaee --- /dev/null +++ b/src/cpu/kernels/quantize/generic/neon/integer.cpp @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/cpu/kernels/quantize/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void u8_u8_run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window) +{ + run_quantize_qasymm8<uint8_t, uint8_t>(src, dst, window); +} +void u8_i8_run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window) +{ + run_quantize_qasymm8<uint8_t, int8_t>(src, dst, window); +} +void i8_u8_run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window) +{ + run_quantize_qasymm8<int8_t, uint8_t>(src, dst, window); +} +void i8_i8_run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window) +{ + run_quantize_qasymm8<int8_t, int8_t>(src, dst, window); +} + +void u8_run_quantize_qasymm16(const ITensor *src, ITensor *dst, const Window &window) +{ + run_quantize_qasymm16<uint8_t>(src, dst, window); +} +void i8_run_quantize_qasymm16(const ITensor *src, ITensor *dst, const Window &window) +{ + run_quantize_qasymm16<int8_t>(src, dst, window); +} + +void u8_u8_run_requantize_offset_only(const ITensor *src, ITensor *dst, const Window &window) +{ + run_requantize_offset_only<uint8_t, uint8_t>(src, dst, window); +} +void u8_i8_run_requantize_offset_only(const ITensor *src, ITensor *dst, const Window &window) +{ + run_requantize_offset_only<uint8_t, int8_t>(src, dst, window); +} +void i8_u8_run_requantize_offset_only(const ITensor *src, ITensor *dst, const Window &window) +{ + run_requantize_offset_only<int8_t, uint8_t>(src, dst, window); +} +void i8_i8_run_requantize_offset_only(const ITensor *src, ITensor *dst, const Window &window) +{ + run_requantize_offset_only<int8_t, int8_t>(src, dst, window); +} + +void i8_u8_run_requantize_offset_only_convert(const ITensor *src, ITensor *dst, const Window &window) +{ + run_requantize_offset_only_convert<int8_t, uint8_t>(src, dst, window); +} +void u8_i8_run_requantize_offset_only_convert(const ITensor *src, ITensor *dst, const Window &window) +{ + run_requantize_offset_only_convert<uint8_t, int8_t>(src, dst, window); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/quantize/generic/neon/list.h b/src/cpu/kernels/quantize/generic/neon/list.h new file mode 100644 index 0000000000..c4fb1048eb --- /dev/null +++ b/src/cpu/kernels/quantize/generic/neon/list.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ACL_SRC_CPU_KERNELS_QUANTIZE_GENERIC_NEON_LIST_H +#define ACL_SRC_CPU_KERNELS_QUANTIZE_GENERIC_NEON_LIST_H + +#include "arm_compute/core/Helpers.h" + +namespace arm_compute +{ +namespace cpu +{ + +#define DECLARE_QUANTIZE_KERNEL(func_name) void func_name(const ITensor *src, ITensor *dst, const Window &window) + +DECLARE_QUANTIZE_KERNEL(u8_u8_run_quantize_qasymm8); +DECLARE_QUANTIZE_KERNEL(u8_i8_run_quantize_qasymm8); +DECLARE_QUANTIZE_KERNEL(i8_u8_run_quantize_qasymm8); +DECLARE_QUANTIZE_KERNEL(i8_i8_run_quantize_qasymm8); + +DECLARE_QUANTIZE_KERNEL(u8_u8_run_requantize_offset_only); +DECLARE_QUANTIZE_KERNEL(u8_i8_run_requantize_offset_only); +DECLARE_QUANTIZE_KERNEL(i8_u8_run_requantize_offset_only); +DECLARE_QUANTIZE_KERNEL(i8_i8_run_requantize_offset_only); + +DECLARE_QUANTIZE_KERNEL(i8_u8_run_requantize_offset_only_convert); +DECLARE_QUANTIZE_KERNEL(u8_i8_run_requantize_offset_only_convert); + +DECLARE_QUANTIZE_KERNEL(u8_run_quantize_qasymm16); +DECLARE_QUANTIZE_KERNEL(i8_run_quantize_qasymm16); + +DECLARE_QUANTIZE_KERNEL(fp32_u8_run_quantize_qasymm8); +DECLARE_QUANTIZE_KERNEL(fp32_i8_run_quantize_qasymm8); +DECLARE_QUANTIZE_KERNEL(fp32_run_quantize_qasymm16); + +DECLARE_QUANTIZE_KERNEL(fp32_i8_run_quantize_qsymm8); + +DECLARE_QUANTIZE_KERNEL(fp16_u8_run_quantize_qasymm8); +DECLARE_QUANTIZE_KERNEL(fp16_i8_run_quantize_qasymm8); +DECLARE_QUANTIZE_KERNEL(fp16_run_quantize_qasymm16); + +#undef DECLARE_QUANTIZE_KERNEL + +} // namespace cpu +} // namespace arm_compute +#endif // ACL_SRC_CPU_KERNELS_QUANTIZE_GENERIC_NEON_LIST_H diff --git a/src/cpu/kernels/range/generic/neon/fp16.cpp b/src/cpu/kernels/range/generic/neon/fp16.cpp new file mode 100644 index 0000000000..505c18c27d --- /dev/null +++ b/src/cpu/kernels/range/generic/neon/fp16.cpp @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + +#include "arm_compute/core/Helpers.h" + +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/kernels/range/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void fp16_neon_range_function(ITensor *output, float start, float step, const Window &window) +{ + return neon_range_function<float16_t>(output, start, step, window); +} +} // namespace cpu +} // namespace arm_compute +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/range/generic/neon/fp32.cpp b/src/cpu/kernels/range/generic/neon/fp32.cpp new file mode 100644 index 0000000000..e5e472abb5 --- /dev/null +++ b/src/cpu/kernels/range/generic/neon/fp32.cpp @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/Helpers.h" + +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/kernels/range/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void fp32_neon_range_function(ITensor *output, float start, float step, const Window &window) +{ + return neon_range_function<float32_t>(output, start, step, window); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/range/generic/neon/impl.h b/src/cpu/kernels/range/generic/neon/impl.h new file mode 100644 index 0000000000..f8c30d52a0 --- /dev/null +++ b/src/cpu/kernels/range/generic/neon/impl.h @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2021, 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_CPU_KERNELS_RANGE_GENERIC_NEON_IMPL_H +#define ACL_SRC_CPU_KERNELS_RANGE_GENERIC_NEON_IMPL_H + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" + +#include "src/core/common/Registrars.h" +#include "src/core/NEON/wrapper/wrapper.h" + +namespace arm_compute +{ +namespace cpu +{ +template <typename T> +void neon_range_function(ITensor *output, float start, float step, const Window &window) +{ + /** SIMD vector tag type. */ + using ExactTagType = typename wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>::tag_type; + + const auto step_vec = wrapper::vdup_n(static_cast<T>(step), ExactTagType{}); + const auto start_vec = wrapper::vdup_n(static_cast<T>(start), ExactTagType{}); + auto id_vec = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{}); + + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const int window_step_x = 16 / sizeof(T); + + Window win{window}; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + Iterator output_it(output, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + int x = window_start_x; + const auto out_ptr = reinterpret_cast<T *>(output_it.ptr()); + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + for (int count = 0; count < window_step_x; ++count) + { + id_vec = wrapper::vsetlane(static_cast<T>(x + count), id_vec, count); + } + + // start + step * id + const auto res_vec = wrapper::vmla(start_vec, id_vec, step_vec); + wrapper::vstore(out_ptr + x, res_vec); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto res = start + x * step; + *(out_ptr + x) = res; + } + }, + output_it); +} +} // namespace cpu +} // namespace arm_compute +#endif // ACL_SRC_CPU_KERNELS_RANGE_GENERIC_NEON_IMPL_H diff --git a/src/cpu/kernels/range/generic/neon/integer.cpp b/src/cpu/kernels/range/generic/neon/integer.cpp new file mode 100644 index 0000000000..0f3ff89b71 --- /dev/null +++ b/src/cpu/kernels/range/generic/neon/integer.cpp @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "src/cpu/kernels/range/generic/neon/impl.h" + +#include <cstdint> + +namespace arm_compute +{ +namespace cpu +{ +void u8_neon_range_function(ITensor *output, float start, float step, const Window &window) +{ + return neon_range_function<uint8_t>(output, start, step, window); +} + +void u16_neon_range_function(ITensor *output, float start, float step, const Window &window) +{ + return neon_range_function<uint16_t>(output, start, step, window); +} + +void u32_neon_range_function(ITensor *output, float start, float step, const Window &window) +{ + return neon_range_function<uint32_t>(output, start, step, window); +} + +void s8_neon_range_function(ITensor *output, float start, float step, const Window &window) +{ + return neon_range_function<int8_t>(output, start, step, window); +} + +void s16_neon_range_function(ITensor *output, float start, float step, const Window &window) +{ + return neon_range_function<int16_t>(output, start, step, window); +} + +void s32_neon_range_function(ITensor *output, float start, float step, const Window &window) +{ + return neon_range_function<int32_t>(output, start, step, window); +} + +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/range/list.h b/src/cpu/kernels/range/list.h new file mode 100644 index 0000000000..cade91e8dd --- /dev/null +++ b/src/cpu/kernels/range/list.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef SRC_CORE_NEON_KERNELS_RANGE_LIST_H +#define SRC_CORE_NEON_KERNELS_RANGE_LIST_H + +namespace arm_compute +{ +namespace cpu +{ +#define DECLARE_RANGE_KERNEL(func_name) void func_name(ITensor *output, float start, float step, const Window &window) + +DECLARE_RANGE_KERNEL(fp16_neon_range_function); +DECLARE_RANGE_KERNEL(fp32_neon_range_function); +DECLARE_RANGE_KERNEL(s8_neon_range_function); +DECLARE_RANGE_KERNEL(s16_neon_range_function); +DECLARE_RANGE_KERNEL(s32_neon_range_function); +DECLARE_RANGE_KERNEL(u8_neon_range_function); +DECLARE_RANGE_KERNEL(u16_neon_range_function); +DECLARE_RANGE_KERNEL(u32_neon_range_function); + +#undef DECLARE_RANGE_KERNEL + +} // namespace cpu +} // namespace arm_compute +#endif //SRC_CORE_NEON_KERNELS_RANGE_LIST_H diff --git a/src/cpu/kernels/reduction_layer/generic/neon/fp16.cpp b/src/cpu/kernels/reduction_layer/generic/neon/fp16.cpp new file mode 100644 index 0000000000..143bb5487f --- /dev/null +++ b/src/cpu/kernels/reduction_layer/generic/neon/fp16.cpp @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + +#include "src/cpu/kernels/reduction_layer/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void reduce_RedOpX_reduceX_float16_8(const Window &window, + const ITensor *input, + ITensor *output, + const ReductionOperation op) +{ + return Reducer<RedOpX<float16_t, 8>>::reduceX(window, input, output, RedOpX<float16_t, 8>(), op); +} + +void reduce_RedOpYZW_reduceY_float16_8(const Window &window, + const ITensor *input, + ITensor *output, + const ReductionOperation op) +{ + return Reducer<RedOpYZW<float16_t, 8>>::reduceY(window, input, output, RedOpYZW<float16_t, 8>(), op); +} + +void reduce_RedOpYZW_reduceZ_float16_8(const Window &window, + const ITensor *input, + ITensor *output, + const ReductionOperation op) +{ + return Reducer<RedOpYZW<float16_t, 8>>::reduceZ(window, input, output, RedOpYZW<float16_t, 8>(), op); +} + +void reduce_RedOpYZW_reduceW_float16_8(const Window &window, + const ITensor *input, + ITensor *output, + const ReductionOperation op) +{ + return Reducer<RedOpYZW<float16_t, 8>>::reduceW(window, input, output, RedOpYZW<float16_t, 8>(), op); +} +} // namespace cpu +} // namespace arm_compute +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/reduction_layer/generic/neon/fp32.cpp b/src/cpu/kernels/reduction_layer/generic/neon/fp32.cpp new file mode 100644 index 0000000000..6f5f13e571 --- /dev/null +++ b/src/cpu/kernels/reduction_layer/generic/neon/fp32.cpp @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "src/cpu/kernels/reduction_layer/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void reduce_RedOpYZW_complex_reduceZ_float32_4_2_SUM(const Window &window, + const ITensor *input, + ITensor *output, + const ReductionOperation op) +{ + Reducer<RedOpYZW_complex<float, 4, 2, ReductionOperation::SUM>>::reduceZ( + window, input, output, RedOpYZW_complex<float, 4, 2, ReductionOperation::SUM>(), op); +} + +void reduce_RedOpX_reduceX_float32_4(const Window &window, + const ITensor *input, + ITensor *output, + const ReductionOperation op) +{ + return Reducer<RedOpX<float, 4>>::reduceX(window, input, output, RedOpX<float, 4>(), op); +} + +void reduce_RedOpYZW_reduceY_float32_4(const Window &window, + const ITensor *input, + ITensor *output, + const ReductionOperation op) +{ + return Reducer<RedOpYZW<float, 4>>::reduceY(window, input, output, RedOpYZW<float, 4>(), op); +} + +void reduce_RedOpYZW_reduceZ_float32_4(const Window &window, + const ITensor *input, + ITensor *output, + const ReductionOperation op) +{ + return Reducer<RedOpYZW<float, 4>>::reduceZ(window, input, output, RedOpYZW<float, 4>(), op); +} + +void reduce_RedOpYZW_reduceW_float32_4(const Window &window, + const ITensor *input, + ITensor *output, + const ReductionOperation op) +{ + return Reducer<RedOpYZW<float, 4>>::reduceW(window, input, output, RedOpYZW<float, 4>(), op); +} + +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/reduction_layer/generic/neon/impl.h b/src/cpu/kernels/reduction_layer/generic/neon/impl.h new file mode 100644 index 0000000000..3fa821d3a4 --- /dev/null +++ b/src/cpu/kernels/reduction_layer/generic/neon/impl.h @@ -0,0 +1,1633 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ACL_SRC_CPU_KERNELS_REDUCTION_LAYER_GENERIC_NEON_IMPL_H +#define ACL_SRC_CPU_KERNELS_REDUCTION_LAYER_GENERIC_NEON_IMPL_H + +#include "arm_compute/core/Coordinates.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" + +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "support/SaturateCast.h" + +#include <arm_neon.h> + +namespace arm_compute +{ +// Helper function that calls vqmovun/vqmvn, vcombine and vstore, allows templating of RedOpYZW_quantized +template <typename T> +void combine_and_store(int16x8_t t1, int16x8_t t2, Iterator &output, int offset = 0) +{ + if (std::is_same<T, uint8_t>::value) + { + auto res = wrapper::vcombine(wrapper::vqmovun(t1), wrapper::vqmovun(t2)); + wrapper::vstore(output.ptr() + offset, res); + } + else + { + auto res = wrapper::vcombine(wrapper::vqmovn(t1), wrapper::vqmovn(t2)); + wrapper::vstore(reinterpret_cast<int8_t *>(output.ptr() + offset), res); + } +} + +template <typename T> +uint32x4x4_t calculate_index(uint32_t idx, T a, T b, uint32x4x4_t c, ReductionOperation op, int axis) +{ + uint32x4_t mask{0}; + if (op == ReductionOperation::ARG_IDX_MIN) + { + mask = wrapper::vcgt(b, a); + } + else + { + mask = wrapper::vclt(b, a); + } + + uint32x4_t vec_idx = {idx, idx + 1, idx + 2, idx + 3}; + if (axis != 0) + { + vec_idx = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{}); + } + uint32x4x4_t res = {{wrapper::vbsl(mask, vec_idx, c.val[0]), 0, 0, 0}}; + + return res; +} + +template <typename T> +uint32x4x4_t calculate_index_quantized(uint32_t idx, T a, T b, uint32x4x4_t c, ReductionOperation op, int axis) +{ + uint32x4x4_t mask{{0}}; + uint8x16_t mask_u8{0}; + if (op == ReductionOperation::ARG_IDX_MIN) + { + mask_u8 = wrapper::vcgt(b, a); + } + else + { + mask_u8 = wrapper::vclt(b, a); + } + auto wide_u16_1 = + wrapper::vorr(vshll_n_u8(wrapper::vgetlow(mask_u8), 8), wrapper::vmovl(wrapper::vgetlow(mask_u8))); + auto wide_u16_2 = + wrapper::vorr(vshll_n_u8(wrapper::vgethigh(mask_u8), 8), wrapper::vmovl(wrapper::vgethigh(mask_u8))); + mask.val[0] = + wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_1), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_1))); + mask.val[1] = + wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_1), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_1))); + mask.val[2] = + wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_2), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_2))); + mask.val[3] = + wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_2), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_2))); + + uint32x4x4_t vec_idx = {{{idx + 0, idx + 1, idx + 2, idx + 3}, + {idx + 4, idx + 5, idx + 6, idx + 7}, + {idx + 8, idx + 9, idx + 10, idx + 11}, + {idx + 12, idx + 13, idx + 14, idx + 15}}}; + if (axis != 0) + { + vec_idx.val[0] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{}); + vec_idx.val[1] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{}); + vec_idx.val[2] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{}); + vec_idx.val[3] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{}); + } + uint32x4x4_t res = { + {vbslq_u32(mask.val[0], vec_idx.val[0], c.val[0]), vbslq_u32(mask.val[1], vec_idx.val[1], c.val[1]), + vbslq_u32(mask.val[2], vec_idx.val[2], c.val[2]), vbslq_u32(mask.val[3], vec_idx.val[3], c.val[3])}}; + + return res; +} + +// Helper function to calculate the minimum value of the input vector. All the elements in the output vector contain the min value. 
+template <typename T> +inline typename std::enable_if< + std::is_same<T, float32x4_t>::value || std::is_same<T, int32x4_t>::value, + typename std::conditional<std::is_same<T, float32x4_t>::value, float32x2_t, int32x2_t>::type>::type +calculate_min(T in) +{ + auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in)); + return wrapper::vpmin(pmin, pmin); +} + +// Helper function to calculate the minimum value of the input vector. All the elements in the output vector contain the min value. +template <typename T> +inline typename std::enable_if< + std::is_same<T, uint8x16_t>::value || std::is_same<T, int8x16_t>::value, + typename std::conditional<std::is_same<T, uint8x16_t>::value, uint8x8_t, int8x8_t>::type>::type +calculate_min(T in) +{ + auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in)); + pmin = wrapper::vpmin(pmin, pmin); + pmin = wrapper::vpmin(pmin, pmin); + return wrapper::vpmin(pmin, pmin); +} + +// Helper function to calculate the maximum value of the input vector. All the elements in the output vector contain the max value. +template <typename T> +inline typename std::enable_if< + std::is_same<T, float32x4_t>::value || std::is_same<T, int32x4_t>::value, + typename std::conditional<std::is_same<T, float32x4_t>::value, float32x2_t, int32x2_t>::type>::type +calculate_max(T in) +{ + auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in)); + return wrapper::vpmax(pmax, pmax); +} + +// Helper function to calculate the maximum value of the input vector. All the elements in the output vector contain the max value. +template <typename T> +inline typename std::enable_if< + std::is_same<T, uint8x16_t>::value || std::is_same<T, int8x16_t>::value, + typename std::conditional<std::is_same<T, uint8x16_t>::value, uint8x8_t, int8x8_t>::type>::type +calculate_max(T in) +{ + auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in)); + pmax = wrapper::vpmax(pmax, pmax); + pmax = wrapper::vpmax(pmax, pmax); + return wrapper::vpmax(pmax, pmax); +} + +template <typename T> +uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, T vec_res_value, ReductionOperation op) +{ + uint32x4_t res_idx_mask{0}; + uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF); + + if (op == ReductionOperation::ARG_IDX_MIN) + { + auto pmin = calculate_min(vec_res_value); + auto mask = wrapper::vceq(vec_res_value, wrapper::vcombine(pmin, pmin)); + res_idx_mask = wrapper::vand(vec_res_idx.val[0], mask); + } + else + { + auto pmax = calculate_max(vec_res_value); + auto mask = wrapper::vceq(vec_res_value, wrapper::vcombine(pmax, pmax)); + res_idx_mask = wrapper::vand(vec_res_idx.val[0], mask); + } + + res_idx_mask = wrapper::vadd(res_idx_mask, mask_ones); + auto pmin = wrapper::vpmin(wrapper::vgethigh(res_idx_mask), wrapper::vgetlow(res_idx_mask)); + pmin = wrapper::vpmin(pmin, pmin); + uint32_t res = wrapper::vgetlane(pmin, 0); + + return (res - 0xFFFFFFFF); +} + +template <typename T> +uint32_t calculate_vector_index_quantized(uint32x4x4_t vec_res_idx, T vec_res_value, ReductionOperation op) +{ + uint32x4x4_t res_idx_mask{{0}}; + uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF); + uint8x16_t mask_u8{0}; + if (op == ReductionOperation::ARG_IDX_MIN) + { + auto pmin = calculate_min(vec_res_value); + mask_u8 = wrapper::vceq(vec_res_value, wrapper::vcombine(pmin, pmin)); + } + else + { + auto pmax = calculate_max(vec_res_value); + mask_u8 = wrapper::vceq(vec_res_value, wrapper::vcombine(pmax, pmax)); + } + + // Widen vectors + auto wide_u16_1 = + 
wrapper::vorr(vshll_n_u8(wrapper::vgetlow(mask_u8), 8), wrapper::vmovl(wrapper::vgetlow(mask_u8))); + auto wide_u16_2 = + wrapper::vorr(vshll_n_u8(wrapper::vgethigh(mask_u8), 8), wrapper::vmovl(wrapper::vgethigh(mask_u8))); + auto wide_u32_1 = + wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_1), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_1))); + auto wide_u32_2 = + wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_1), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_1))); + auto wide_u32_3 = + wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_2), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_2))); + auto wide_u32_4 = + wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_2), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_2))); + res_idx_mask.val[0] = wrapper::vand(vec_res_idx.val[0], wide_u32_1); + res_idx_mask.val[1] = wrapper::vand(vec_res_idx.val[1], wide_u32_2); + res_idx_mask.val[2] = wrapper::vand(vec_res_idx.val[2], wide_u32_3); + res_idx_mask.val[3] = wrapper::vand(vec_res_idx.val[3], wide_u32_4); + res_idx_mask.val[0] = wrapper::vadd(res_idx_mask.val[0], mask_ones); + res_idx_mask.val[1] = wrapper::vadd(res_idx_mask.val[1], mask_ones); + res_idx_mask.val[2] = wrapper::vadd(res_idx_mask.val[2], mask_ones); + res_idx_mask.val[3] = wrapper::vadd(res_idx_mask.val[3], mask_ones); + + uint32_t res = 0xFFFFFFFF; + int iter = 0; + do + { + auto pmin = wrapper::vpmin(wrapper::vgethigh(res_idx_mask.val[iter]), wrapper::vgetlow(res_idx_mask.val[iter])); + pmin = wrapper::vpmin(pmin, pmin); + res = std::min(wrapper::vgetlane(pmin, 0), res); + iter++; + } while (iter < 4); + + return (res - 0xFFFFFFFF); +} + +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +template <> +uint32x4x4_t inline calculate_index( + uint32_t idx, float16x8_t a, float16x8_t b, uint32x4x4_t c, ReductionOperation op, int axis) +{ + uint32x4x2_t mask{0}; + uint16x8_t mask_u16{0}; + if (op == ReductionOperation::ARG_IDX_MIN) + { + mask_u16 = wrapper::vcgt(b, a); + } + else + { + mask_u16 = wrapper::vclt(b, a); + } + mask.val[0] = wrapper::vmovl(wrapper::vgetlow(mask_u16)); + mask.val[1] = wrapper::vmovl(wrapper::vgethigh(mask_u16)); + uint32x4x2_t vec_idx = {{{idx + 0, idx + 1, idx + 2, idx + 3}, {idx + 4, idx + 5, idx + 6, idx + 7}}}; + if (axis != 0) + { + vec_idx.val[0] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{}); + vec_idx.val[1] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{}); + } + uint32x4x4_t res = {wrapper::vbsl(mask.val[0], vec_idx.val[0], c.val[0]), + wrapper::vbsl(mask.val[1], vec_idx.val[1], c.val[1]), 0, 0}; + + return res; +} + +// Helper function to calculate the minimum value of the input vector. All the elements in the output vector contain the min value. +inline float16x4_t calculate_min(float16x8_t in) +{ + auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in)); + pmin = wrapper::vpmin(pmin, pmin); + return wrapper::vpmin(pmin, pmin); +} +// Helper function to calculate the maximum value of the input vector. All the elements in the output vector contain the max value. 
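+// The float16 variants fold an 8-lane half-precision vector with three pairwise vpmin/vpmax steps.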
+inline float16x4_t calculate_max(float16x8_t in) +{ + auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in)); + pmax = wrapper::vpmax(pmax, pmax); + return wrapper::vpmax(pmax, pmax); +} + +template <> +inline uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, float16x8_t vec_res_value, ReductionOperation op) +{ + uint32x4x2_t res_idx_mask{0}; + uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF); + uint16x8_t mask_u16; + if (op == ReductionOperation::ARG_IDX_MIN) + { + auto pmin = calculate_min(vec_res_value); + mask_u16 = wrapper::vceq(vec_res_value, wrapper::vcombine(pmin, pmin)); + } + else + { + auto pmax = calculate_max(vec_res_value); + mask_u16 = wrapper::vceq(vec_res_value, wrapper::vcombine(pmax, pmax)); + } + + // Widen vectors + auto wide_u32_1 = + wrapper::vorr(vshll_n_u16(wrapper::vgetlow(mask_u16), 8), wrapper::vmovl(wrapper::vgetlow(mask_u16))); + auto wide_u32_2 = + wrapper::vorr(vshll_n_u16(wrapper::vgethigh(mask_u16), 8), wrapper::vmovl(wrapper::vgethigh(mask_u16))); + res_idx_mask.val[0] = wrapper::vand(vec_res_idx.val[0], wide_u32_1); + res_idx_mask.val[1] = wrapper::vand(vec_res_idx.val[1], wide_u32_2); + res_idx_mask.val[0] = wrapper::vadd(res_idx_mask.val[0], mask_ones); + res_idx_mask.val[1] = wrapper::vadd(res_idx_mask.val[1], mask_ones); + + uint32_t res = 0xFFFFFFFF; + uint32_t iter = 0; + do + { + auto pmin = wrapper::vpmin(wrapper::vgethigh(res_idx_mask.val[iter]), wrapper::vgetlow(res_idx_mask.val[iter])); + pmin = wrapper::vpmin(pmin, pmin); + res = std::min(wrapper::vgetlane(pmin, 0), res); + iter++; + } while (iter < 2); + + return (res - 0xFFFFFFFF); +} +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +template <class F> +class Reducer +{ +public: + static void reduceX(const Window &window, const ITensor *input, ITensor *output, F f, const ReductionOperation op) + { + // Set out window + Window out_window(window); + out_window.set(Window::DimX, Window::Dimension(0, 1, 1)); + + f(window, out_window, input, output, op); + } + static void reduceY(const Window &window, const ITensor *input, ITensor *output, F f, const ReductionOperation op) + { + // Set in window + Window in_window(window); + Window out_window(window); + + in_window.set(Window::DimY, Window::Dimension(0, 1, 1)); + out_window.set(Window::DimY, Window::Dimension(0, output->info()->dimension(1), output->info()->dimension(1))); + + f(in_window, out_window, input, output, 1, op); + } + static void reduceZ(const Window &window, const ITensor *input, ITensor *output, F f, const ReductionOperation op) + { + // Set in window + Window in_window(window); + Window out_window(window); + + in_window.set(Window::DimZ, Window::Dimension(0, 1, 1)); + out_window.set(Window::DimZ, Window::Dimension(0, output->info()->dimension(2), output->info()->dimension(2))); + + f(in_window, out_window, input, output, 2, op); + } + static void reduceW(const Window &window, const ITensor *input, ITensor *output, F f, const ReductionOperation op) + { + // Set in/out window + Window in_window(window); + Window out_window(window); + + in_window.set(3, Window::Dimension(0, 1, 1)); + out_window.set(3, Window::Dimension(0, 1, 1)); + + f(in_window, out_window, input, output, 3, op); + } +}; + +template <typename T, int S> +struct RedOpX +{ + /** SIMD vector tag type. 
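+     * The tag is passed to wrapper helpers such as vdup_n to create vectors of the width matching T and S.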
*/ + using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type; + + inline void operator()( + const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, const ReductionOperation op) + { + const size_t input_dim_0 = in->info()->dimension(0); + const int window_step_x = 16 / sizeof(T); + const auto window_start_x = static_cast<int>(in_window.x().start()); + const auto window_end_x = static_cast<int>(in_window.x().end()); + + Window in_win_no_pad = in_window; + in_win_no_pad.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(in, in_win_no_pad); + Iterator output(out, out_window); + + execute_window_loop( + in_win_no_pad, + [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast<const T *>(input.ptr()); + + auto init_res_value = static_cast<T>(0.f); + switch (op) + { + case ReductionOperation::ARG_IDX_MAX: + case ReductionOperation::ARG_IDX_MIN: + case ReductionOperation::MIN: + case ReductionOperation::MAX: + { + init_res_value = static_cast<T>(*input_ptr); + break; + } + case ReductionOperation::PROD: + { + init_res_value = static_cast<T>(1.f); + break; + } + default: + break; + } + auto vec_res_value = wrapper::vdup_n(init_res_value, ExactTagType{}); + uint32x4x4_t vec_res_idx{{0}}; + + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vec_elements = wrapper::vloadq(input_ptr + x); + switch (op) + { + case ReductionOperation::SUM_SQUARE: + vec_res_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_res_value); + break; + case ReductionOperation::MEAN_SUM: + case ReductionOperation::SUM: + vec_res_value = wrapper::vadd(vec_elements, vec_res_value); + break; + case ReductionOperation::PROD: + vec_res_value = wrapper::vmul(vec_elements, vec_res_value); + break; + case ReductionOperation::ARG_IDX_MIN: + { + auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + vec_res_idx = calculate_index<decltype(vec_res_value)>(x, temp_vec_res_value, vec_res_value, + vec_res_idx, op, 0); + vec_res_value = temp_vec_res_value; + break; + } + case ReductionOperation::ARG_IDX_MAX: + { + auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + vec_res_idx = calculate_index<decltype(vec_res_value)>(x, temp_vec_res_value, vec_res_value, + vec_res_idx, op, 0); + vec_res_value = temp_vec_res_value; + break; + } + case ReductionOperation::MIN: + { + vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + break; + } + case ReductionOperation::MAX: + { + vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); + } + } + + switch (op) + { + case ReductionOperation::SUM: + case ReductionOperation::MEAN_SUM: + case ReductionOperation::SUM_SQUARE: + { +#ifdef ARM_COMPUTE_DEBUG_ENABLED + auto res = static_cast<T>(0.f); + for (int i = 0; i < S; ++i) + { + res += wrapper::vgetlane(vec_res_value, i); + } +#else // ARM_COMPUTE_DEBUG_ENABLED + auto carry_res = + wrapper::vpadd(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value)); + for (int i = 0; i < S / 4; ++i) + { + carry_res = wrapper::vpadd(carry_res, carry_res); + } + auto res = wrapper::vgetlane(carry_res, 0); +#endif // ARM_COMPUTE_DEBUG_ENABLED + if (op == ReductionOperation::SUM_SQUARE) + { + // Compute left-over elements + for (; x < window_end_x; ++x) + { + res += (*(input_ptr + x)) * (*(input_ptr + x)); + } + } + else + { + // Compute left-over elements + for (; x 
< window_end_x; ++x) + { + res += *(input_ptr + x); + } + } + + if (op == ReductionOperation::MEAN_SUM) + { + res /= input_dim_0; + } + + *(reinterpret_cast<T *>(output.ptr())) = res; + break; + } + case ReductionOperation::PROD: + { + auto carry_res = + wrapper::vmul(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value)); + T res = 1; + for (int i = 0; i < S / 2; ++i) + { + res *= wrapper::vgetlane(carry_res, i); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + res *= *(input_ptr + x); + } + + *(reinterpret_cast<T *>(output.ptr())) = res; + break; + } + case ReductionOperation::ARG_IDX_MIN: + { + auto idx = calculate_vector_index<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op); + auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0)); + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + if (*(input_ptr + x) < res) + { + idx = x; + res = *(input_ptr + x); + } + } + *(reinterpret_cast<uint32_t *>(output.ptr())) = idx; + break; + } + case ReductionOperation::ARG_IDX_MAX: + { + auto idx = calculate_vector_index<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op); + auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0)); + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + if (*(input_ptr + x) > res) + { + idx = x; + res = *(input_ptr + x); + } + } + *(reinterpret_cast<uint32_t *>(output.ptr())) = idx; + break; + } + case ReductionOperation::MIN: + { + auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0)); + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + res = *(input_ptr + x) < res ? *(input_ptr + x) : res; + } + *(reinterpret_cast<T *>(output.ptr())) = res; + break; + } + case ReductionOperation::MAX: + { + auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0)); + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + res = *(input_ptr + x) > res ? 
*(input_ptr + x) : res; + } + *(reinterpret_cast<T *>(output.ptr())) = res; + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); + } + }, + input, output); + } +}; + +template <typename T> +struct RedOpX_quantized +{ + inline void operator()( + const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, const ReductionOperation op) + { + using PromotedType = typename wrapper::traits::promote<typename wrapper::traits::promote<T>::type>::type; + + const auto oq_info = out->info()->quantization_info().uniform(); + + const TensorInfo in_info = *(in->info()); + const UniformQuantizationInfo iq_info = in_info.quantization_info().uniform(); + + const int window_step_x = 16 / sizeof(T); + const auto window_start_x = static_cast<int>(in_window.x().start()); + const auto window_end_x = static_cast<int>(in_window.x().end()); + + Window in_win_no_pad = in_window; + in_win_no_pad.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(in, in_win_no_pad); + Iterator output(out, out_window); + + const auto in_offset = static_cast<float>(iq_info.offset); + const float in_scale = iq_info.scale; + + const auto out_offset = static_cast<float>(oq_info.offset); + const float out_scale = oq_info.scale; + + const auto num_elements = static_cast<float>(in_info.dimension(0)); + + const float A = in_scale / (out_scale * num_elements); + const float B = out_offset - (in_scale * in_offset) / (out_scale); + + execute_window_loop( + in_win_no_pad, + [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast<T *>(input.ptr()); + + auto vec_res_value1 = + wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{}); + auto vec_res_value2 = + wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{}); + auto vec_res_value3 = + wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{}); + auto vec_res_value4 = + wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{}); + + auto vec_res_value1_f = vdupq_n_f32(static_cast<float>(1.f)); + auto vec_res_value2_f = vdupq_n_f32(static_cast<float>(1.f)); + auto vec_res_value3_f = vdupq_n_f32(static_cast<float>(1.f)); + auto vec_res_value4_f = vdupq_n_f32(static_cast<float>(1.f)); + + typename wrapper::traits::neon_vector<T, 16>::type vec_res_value = {0}; + + if (op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN || + op == ReductionOperation::MIN || op == ReductionOperation::MAX) + { + vec_res_value = wrapper::vdup_n(*input_ptr, wrapper::traits::vector_128_tag{}); + } + + uint32x4x4_t vec_res_idx{{0}}; + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vec_elements = wrapper::vloadq(input_ptr + x); + switch (op) + { + case ReductionOperation::SUM: + case ReductionOperation::MEAN_SUM: + { + const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements)); + const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements)); + + const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1)); + const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1)); + const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2)); + const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2)); + + vec_res_value1 = wrapper::vadd(temp32x4t_1, vec_res_value1); + vec_res_value2 = wrapper::vadd(temp32x4t_2, vec_res_value2); + vec_res_value3 = wrapper::vadd(temp32x4t_3, 
vec_res_value3); + vec_res_value4 = wrapper::vadd(temp32x4t_4, vec_res_value4); + break; + } + case ReductionOperation::PROD: + { + const auto offset32x4f_4 = vdupq_n_f32(iq_info.offset); + const auto scale32x4f_4 = vdupq_n_f32(iq_info.scale); + + const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements)); + const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements)); + + const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1)); + const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1)); + const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2)); + const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2)); + + auto temp32x4f_1 = wrapper::vcvt<float>(temp32x4t_1); + auto temp32x4f_2 = wrapper::vcvt<float>(temp32x4t_2); + auto temp32x4f_3 = wrapper::vcvt<float>(temp32x4t_3); + auto temp32x4f_4 = wrapper::vcvt<float>(temp32x4t_4); + + //de-quantize vec_elements + temp32x4f_1 = vmulq_f32(vsubq_f32(temp32x4f_1, offset32x4f_4), scale32x4f_4); + temp32x4f_2 = vmulq_f32(vsubq_f32(temp32x4f_2, offset32x4f_4), scale32x4f_4); + temp32x4f_3 = vmulq_f32(vsubq_f32(temp32x4f_3, offset32x4f_4), scale32x4f_4); + temp32x4f_4 = vmulq_f32(vsubq_f32(temp32x4f_4, offset32x4f_4), scale32x4f_4); + + vec_res_value1_f = vmulq_f32(temp32x4f_1, vec_res_value1_f); + vec_res_value2_f = vmulq_f32(temp32x4f_2, vec_res_value2_f); + vec_res_value3_f = vmulq_f32(temp32x4f_3, vec_res_value3_f); + vec_res_value4_f = vmulq_f32(temp32x4f_4, vec_res_value4_f); + break; + } + case ReductionOperation::ARG_IDX_MIN: + { + auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + vec_res_idx = calculate_index_quantized<decltype(vec_res_value)>( + x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0); + vec_res_value = temp_vec_res_value; + break; + } + case ReductionOperation::ARG_IDX_MAX: + { + auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + vec_res_idx = calculate_index_quantized<decltype(vec_res_value)>( + x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0); + vec_res_value = temp_vec_res_value; + break; + } + case ReductionOperation::MIN: + { + vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + break; + } + case ReductionOperation::MAX: + { + vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); + } + } + + switch (op) + { + case ReductionOperation::ARG_IDX_MIN: + { + auto idx = + calculate_vector_index_quantized<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op); + auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0)); + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + if (*(input_ptr + x) < res) + { + idx = x; + res = *(input_ptr + x); + } + } + *(reinterpret_cast<uint32_t *>(output.ptr())) = idx; + break; + } + case ReductionOperation::ARG_IDX_MAX: + { + auto idx = + calculate_vector_index_quantized<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op); + auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0)); + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + if (*(input_ptr + x) > res) + { + idx = x; + res = *(input_ptr + x); + } + } + *(reinterpret_cast<uint32_t *>(output.ptr())) = idx; + break; + } + case ReductionOperation::MIN: + { + auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0)); + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + res = *(input_ptr + x) < res ? 
*(input_ptr + x) : res; + } + *(reinterpret_cast<T *>(output.ptr())) = res; + break; + } + case ReductionOperation::MAX: + { + auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0)); + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + res = *(input_ptr + x) > res ? *(input_ptr + x) : res; + } + *(reinterpret_cast<T *>(output.ptr())) = res; + break; + } + case ReductionOperation::PROD: + { + auto carry_res = wrapper::vmul(vec_res_value1_f, vec_res_value2_f); + carry_res = wrapper::vmul(carry_res, vec_res_value3_f); + carry_res = wrapper::vmul(carry_res, vec_res_value4_f); + + float res = wrapper::vgetlane(carry_res, 0); + res *= wrapper::vgetlane(carry_res, 1); + res *= wrapper::vgetlane(carry_res, 2); + res *= wrapper::vgetlane(carry_res, 3); + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + //de-quantize input + if (std::is_same<T, uint8_t>::value) + { + res *= dequantize_qasymm8(*(input_ptr + x), iq_info); + } + else + { + res *= dequantize_qasymm8_signed(*(input_ptr + x), iq_info); + } + } + + //re-quantize result + if (std::is_same<T, uint8_t>::value) + { + res = quantize_qasymm8(res, iq_info); + } + else + { + res = quantize_qasymm8_signed(res, iq_info); + } + + *reinterpret_cast<T *>(output.ptr()) = static_cast<T>(res); + break; + } + case ReductionOperation::SUM: + case ReductionOperation::MEAN_SUM: + { + auto carry_res = wrapper::vadd(vec_res_value1, vec_res_value2); + carry_res = wrapper::vadd(carry_res, vec_res_value3); + carry_res = wrapper::vadd(carry_res, vec_res_value4); + + auto carry_paddition = + wrapper::vpadd(wrapper::vgethigh(carry_res), wrapper::vgetlow(carry_res)); + carry_paddition = wrapper::vpadd(carry_paddition, carry_paddition); + auto res = static_cast<int32_t>(wrapper::vgetlane(carry_paddition, 0)); + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + res += *(input_ptr + x); + } + + if (op == ReductionOperation::MEAN_SUM) + { + const int32_t resFinal = A * (static_cast<float>(res)) + B; + + *reinterpret_cast<T *>(output.ptr()) = utils::cast::saturate_cast<T>(resFinal); + } + else + { + // Subtract accumulated offsets + res -= (in_info.dimension(0) - 1) * iq_info.offset; + *reinterpret_cast<T *>(output.ptr()) = utils::cast::saturate_cast<T>(res); + } + + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); + } + }, + input, output); + } +}; + +template <typename T, int S> +struct RedOpYZW +{ + /** SIMD vector tag type. */ + using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type; + using neon_vector = typename wrapper::traits::neon_vector<T, S>::type; + + inline void operator()(const Window &in_window, + Window &out_window, + const ITensor *in, + ITensor *out, + int axis, + const ReductionOperation op) + { + const TensorInfo in_info = *(in->info()); + const int window_step_x = 16 / sizeof(T); + const auto window_start_x_tmp = static_cast<int>(in_window.x().start()); + const auto window_end_x_tmp = static_cast<int>(in_window.x().end()); + // As it split over x-axis, need to set the correct spiltted window start and end. 
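+        // The outer execute_window_loop therefore effectively takes a single step along X, while x is walked manually below from 0 to the full X extent of the window.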
+ const auto window_start_x = static_cast<int>(0); + const auto window_end_x = static_cast<int>(in_window.shape().x()); + + Window in_win_no_pad = in_window; + in_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, in_window.shape().x())); + Window out_win_no_pad = out_window; + out_win_no_pad.set(Window::DimX, + Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x())); + + Iterator input(in, in_win_no_pad); + Iterator output(out, out_win_no_pad); + + execute_window_loop( + in_win_no_pad, + [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast<T *>(input.ptr()); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + neon_vector vec_res_value = {0}; + switch (op) + { + case ReductionOperation::ARG_IDX_MAX: + case ReductionOperation::ARG_IDX_MIN: + case ReductionOperation::MIN: + case ReductionOperation::MAX: + { + vec_res_value = wrapper::vloadq(input_ptr + x); + break; + } + case ReductionOperation::PROD: + { + vec_res_value = wrapper::vdup_n(static_cast<T>(1.f), ExactTagType{}); + break; + } + default: + { + vec_res_value = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{}); + break; + } + } + uint32x4x4_t vec_res_idx{{0}}; + + for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) + { + const T *in_ptr = + reinterpret_cast<T *>(input.ptr() + x * sizeof(T) + in_info.strides_in_bytes()[axis] * dim); + const auto vec_elements = wrapper::vloadq(in_ptr); + switch (op) + { + case ReductionOperation::SUM: + case ReductionOperation::MEAN_SUM: + vec_res_value = wrapper::vadd(vec_elements, vec_res_value); + break; + case ReductionOperation::SUM_SQUARE: + vec_res_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_res_value); + break; + case ReductionOperation::PROD: + vec_res_value = wrapper::vmul(vec_elements, vec_res_value); + break; + case ReductionOperation::ARG_IDX_MIN: + { + auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + vec_res_idx = + calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis); + vec_res_value = temp_vec_res_value; + break; + } + case ReductionOperation::ARG_IDX_MAX: + { + auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + vec_res_idx = + calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis); + vec_res_value = temp_vec_res_value; + break; + } + case ReductionOperation::MIN: + { + vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + break; + } + case ReductionOperation::MAX: + { + vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); + } + } + + if (op == ReductionOperation::MEAN_SUM) + { + auto vec_width_inv = + wrapper::vinv(wrapper::vdup_n(static_cast<T>(in_info.dimension(axis)), ExactTagType{})); + vec_res_value = wrapper::vmul(vec_res_value, vec_width_inv); + } + + if (op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX) + { + wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr()) + x, vec_res_idx.val[0]); +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + if (std::is_same<T, float16_t>::value) + { + wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr()) + x + 4, vec_res_idx.val[1]); + } +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + } + else + { + wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x * sizeof(T)), vec_res_value); + } + } + + // Compute left-over elements + 
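+                // Scalar tail: handle the x positions left over from the vector loop, reducing along `axis` one element at a time.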
for (; x < window_end_x; ++x) + { + auto res_value = 0.f; + switch (op) + { + case ReductionOperation::ARG_IDX_MAX: + case ReductionOperation::ARG_IDX_MIN: + case ReductionOperation::MIN: + case ReductionOperation::MAX: + { + res_value = *(input_ptr + x); + break; + } + case ReductionOperation::PROD: + { + res_value = static_cast<T>(1.f); + break; + } + default: + { + res_value = static_cast<T>(0.f); + break; + } + } + + uint32_t res_idx = 0; + for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) + { + const T *in_ptr = + reinterpret_cast<T *>(input.ptr() + x * sizeof(T) + in_info.strides_in_bytes()[axis] * dim); + + switch (op) + { + case ReductionOperation::SUM: + case ReductionOperation::MEAN_SUM: + res_value += *in_ptr; + break; + case ReductionOperation::SUM_SQUARE: + res_value += *in_ptr * *in_ptr; + break; + case ReductionOperation::PROD: + res_value *= *in_ptr; + break; + case ReductionOperation::ARG_IDX_MIN: + { + if (*in_ptr < res_value) + { + res_value = *in_ptr; + res_idx = dim; + } + break; + } + case ReductionOperation::ARG_IDX_MAX: + { + if (*in_ptr > res_value) + { + res_value = *in_ptr; + res_idx = dim; + } + break; + } + case ReductionOperation::MIN: + { + res_value = *in_ptr < res_value ? *in_ptr : res_value; + break; + } + case ReductionOperation::MAX: + { + res_value = *in_ptr > res_value ? *in_ptr : res_value; + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); + } + } + + if (op == ReductionOperation::MEAN_SUM) + { + res_value /= in_info.dimension(axis); + } + + if (op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX) + { + *(reinterpret_cast<uint32_t *>(output.ptr()) + x) = res_idx; + } + else + { + *(reinterpret_cast<T *>(output.ptr() + x * sizeof(T))) = res_value; + } + } + }, + input, output); + } +}; + +template <typename T, int S, int axis, ReductionOperation op> +struct RedOpYZW_complex +{ + /** SIMD vector tag type. */ + using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type; + using neon_vector = typename wrapper::traits::neon_vector<T, S>::type; + + inline void operator()( + const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, int, const ReductionOperation) + { + ARM_COMPUTE_ERROR_ON(axis != 2); + ARM_COMPUTE_ERROR_ON(op != ReductionOperation::SUM); + + const TensorInfo in_info = *(in->info()); + const size_t stride_z = in_info.strides_in_bytes()[axis]; + const int window_step_x = 16 / sizeof(T); + const auto window_start_x_tmp = static_cast<int>(in_window.x().start()); + const auto window_end_x_tmp = static_cast<int>(in_window.x().end()); + // As it split over x-axis, need to set the correct spiltted window start and end. 
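+        // Each logical element is a complex value stored as two interleaved T components, so the pointer offsets below are scaled by 2 * sizeof(T) and two accumulators per step cover the doubled data width.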
+ const auto window_start_x = static_cast<int>(0); + const auto window_end_x = static_cast<int>(in_window.shape().x()); + + Window in_win_no_pad = in_window; + in_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, in_window.shape().x())); + Window out_win_no_pad = out_window; + out_win_no_pad.set(Window::DimX, + Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x())); + + Iterator input(in, in_win_no_pad); + Iterator output(out, out_win_no_pad); + + execute_window_loop( + in_win_no_pad, + [&](const Coordinates &) + { + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + neon_vector vec_res_value_0 = {0}; + neon_vector vec_res_value_1 = {0}; + + vec_res_value_0 = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{}); + vec_res_value_1 = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{}); + + T *out_ptr = reinterpret_cast<T *>(output.ptr() + 2 * x * sizeof(T)); + for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) + { + T *in_ptr_0 = reinterpret_cast<T *>(input.ptr() + 2 * x * sizeof(T) + stride_z * dim); + T *in_ptr_1 = reinterpret_cast<T *>(input.ptr() + 2 * x * sizeof(T) + 16 + stride_z * dim); + + const auto vec_elements_0 = wrapper::vloadq(in_ptr_0); + const auto vec_elements_1 = wrapper::vloadq(in_ptr_1); + + vec_res_value_0 = wrapper::vadd(vec_elements_0, vec_res_value_0); + vec_res_value_1 = wrapper::vadd(vec_elements_1, vec_res_value_1); + } + + wrapper::vstore(out_ptr, vec_res_value_0); + wrapper::vstore(out_ptr + 4, vec_res_value_1); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + auto res_value_0 = 0.f; + auto res_value_1 = 0.f; + + T *out_ptr = reinterpret_cast<T *>(output.ptr() + 2 * x * sizeof(T)); + for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) + { + T *in_ptr = reinterpret_cast<T *>(input.ptr() + 2 * x * sizeof(T) + stride_z * dim); + res_value_0 += *in_ptr; + res_value_1 += *(in_ptr + 1); + } + *out_ptr = res_value_0; + *(out_ptr + 1) = res_value_1; + } + }, + input, output); + } +}; + +template <typename T> +struct RedOpYZW_quantized +{ + inline void operator()(const Window &in_window, + Window &out_window, + const ITensor *in, + ITensor *out, + int axis, + const ReductionOperation op) + { + const TensorInfo in_info = *(in->info()); + const UniformQuantizationInfo iq_info = in_info.quantization_info().uniform(); + using PromotedType = typename wrapper::traits::promote<typename wrapper::traits::promote<T>::type>::type; + + const auto oq_info = out->info()->quantization_info().uniform(); + + const int window_step_x = 16 / sizeof(T); + const auto window_start_x_tmp = static_cast<int>(in_window.x().start()); + const auto window_end_x_tmp = static_cast<int>(in_window.x().end()); + // As it split over x-axis, need to set the correct spiltted window start and end. 
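+        // For MEAN_SUM the integer sum is requantized directly to the output space as out = A * sum + B, with A = in_scale / (out_scale * num_elements) and B = out_offset - (in_scale * in_offset) / out_scale (see vec_A / vec_B below).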
+ const auto window_start_x = static_cast<int>(0); + const auto window_end_x = static_cast<int>(in_window.shape().x()); + + Window in_win_no_pad = in_window; + in_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, in_window.shape().x())); + Window out_win_no_pad = out_window; + out_win_no_pad.set(Window::DimX, + Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x())); + + Iterator input(in, in_win_no_pad); + Iterator output(out, out_win_no_pad); + + using vector_type = + typename wrapper::traits::neon_bitvector<PromotedType, wrapper::traits::BitWidth::W128>::type; + using vector_type_f = typename wrapper::traits::neon_vector<float, 4>::type; + + vector_type vec_res_value1{}; + vector_type vec_res_value2{}; + vector_type vec_res_value3{}; + vector_type vec_res_value4{}; + + vector_type_f vec_res_value1_f{}; + vector_type_f vec_res_value2_f{}; + vector_type_f vec_res_value3_f{}; + vector_type_f vec_res_value4_f{}; + + const float in_offset = static_cast<float>(iq_info.offset); + const float in_scale = iq_info.scale; + + const float out_offset = static_cast<float>(oq_info.offset); + const float out_scale = oq_info.scale; + + const float num_elements = static_cast<float>(in_info.dimension(axis)); + + const float A = in_scale / (out_scale * num_elements); + const float B = out_offset - (in_scale * in_offset) / (out_scale); + + const auto vec_A = wrapper::vdup_n(static_cast<float>(A), wrapper::traits::vector_128_tag{}); + const auto vec_B = wrapper::vdup_n(static_cast<float>(B), wrapper::traits::vector_128_tag{}); + + execute_window_loop( + in_win_no_pad, + [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast<T *>(input.ptr()); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + uint32x4x4_t vec_res_idx{{0}}; + vec_res_value1 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{}); + vec_res_value2 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{}); + vec_res_value3 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{}); + vec_res_value4 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{}); + + vec_res_value1_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{}); + vec_res_value2_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{}); + vec_res_value3_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{}); + vec_res_value4_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{}); + + auto vec_res_value = wrapper::vloadq(input_ptr + x); + + for (unsigned int index_dim = 0; index_dim < in_info.dimension(axis); ++index_dim) + { + const T *in_ptr = input_ptr + x + in_info.strides_in_bytes()[axis] * index_dim; + const auto vec_elements = wrapper::vloadq(in_ptr); + switch (op) + { + case ReductionOperation::SUM: + case ReductionOperation::MEAN_SUM: + { + const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements)); + const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements)); + + const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1)); + const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1)); + const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2)); + const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2)); + + vec_res_value1 = 
wrapper::vadd(temp32x4t_1, vec_res_value1); + vec_res_value2 = wrapper::vadd(temp32x4t_2, vec_res_value2); + vec_res_value3 = wrapper::vadd(temp32x4t_3, vec_res_value3); + vec_res_value4 = wrapper::vadd(temp32x4t_4, vec_res_value4); + break; + } + case ReductionOperation::PROD: + { + const auto offset32x4f_4 = wrapper::vdup_n(static_cast<float>(iq_info.offset), + wrapper::traits::vector_128_tag{}); + const auto scale32x4f_4 = + wrapper::vdup_n(iq_info.scale, wrapper::traits::vector_128_tag{}); + + const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements)); + const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements)); + + const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1)); + const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1)); + const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2)); + const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2)); + + auto temp32x4f_1 = wrapper::vcvt<float>(temp32x4t_1); + auto temp32x4f_2 = wrapper::vcvt<float>(temp32x4t_2); + auto temp32x4f_3 = wrapper::vcvt<float>(temp32x4t_3); + auto temp32x4f_4 = wrapper::vcvt<float>(temp32x4t_4); + + //de-quantize vec_elements + temp32x4f_1 = wrapper::vmul(wrapper::vsub(temp32x4f_1, offset32x4f_4), scale32x4f_4); + temp32x4f_2 = wrapper::vmul(wrapper::vsub(temp32x4f_2, offset32x4f_4), scale32x4f_4); + temp32x4f_3 = wrapper::vmul(wrapper::vsub(temp32x4f_3, offset32x4f_4), scale32x4f_4); + temp32x4f_4 = wrapper::vmul(wrapper::vsub(temp32x4f_4, offset32x4f_4), scale32x4f_4); + + vec_res_value1_f = wrapper::vmul(temp32x4f_1, vec_res_value1_f); + vec_res_value2_f = wrapper::vmul(temp32x4f_2, vec_res_value2_f); + vec_res_value3_f = wrapper::vmul(temp32x4f_3, vec_res_value3_f); + vec_res_value4_f = wrapper::vmul(temp32x4f_4, vec_res_value4_f); + break; + } + case ReductionOperation::ARG_IDX_MIN: + { + auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + vec_res_idx = calculate_index_quantized(index_dim, temp_vec_res_value, vec_res_value, + vec_res_idx, op, axis); + vec_res_value = temp_vec_res_value; + break; + } + case ReductionOperation::ARG_IDX_MAX: + { + auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + vec_res_idx = calculate_index_quantized(index_dim, temp_vec_res_value, vec_res_value, + vec_res_idx, op, axis); + vec_res_value = temp_vec_res_value; + break; + } + case ReductionOperation::MIN: + { + vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + break; + } + case ReductionOperation::MAX: + { + vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); + } + } + + switch (op) + { + case ReductionOperation::ARG_IDX_MIN: + case ReductionOperation::ARG_IDX_MAX: + { + wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x), vec_res_idx.val[0]); + wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x) + 4, vec_res_idx.val[1]); + wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x) + 8, vec_res_idx.val[2]); + wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x) + 12, + vec_res_idx.val[3]); + break; + } + case ReductionOperation::MIN: + case ReductionOperation::MAX: + { + wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x), vec_res_value); + break; + } + case ReductionOperation::SUM: + { + // Subtract offsets + auto offsets = vdupq_n_s32((in_info.dimension(axis) - 1) * iq_info.offset); + + auto vec_res_s_value1 = wrapper::vreinterpret(vec_res_value1); + auto 
vec_res_s_value2 = wrapper::vreinterpret(vec_res_value2); + auto vec_res_s_value3 = wrapper::vreinterpret(vec_res_value3); + auto vec_res_s_value4 = wrapper::vreinterpret(vec_res_value4); + + vec_res_s_value1 = wrapper::vsub(vec_res_s_value1, offsets); + vec_res_s_value2 = wrapper::vsub(vec_res_s_value2, offsets); + vec_res_s_value3 = wrapper::vsub(vec_res_s_value3, offsets); + vec_res_s_value4 = wrapper::vsub(vec_res_s_value4, offsets); + + const auto temp16x8t_1 = + wrapper::vcombine(wrapper::vqmovn(vec_res_s_value1), wrapper::vqmovn(vec_res_s_value2)); + const auto temp16x8t_2 = + wrapper::vcombine(wrapper::vqmovn(vec_res_s_value3), wrapper::vqmovn(vec_res_s_value4)); + + combine_and_store<T>(temp16x8t_1, temp16x8t_2, output, x); + break; + } + case ReductionOperation::MEAN_SUM: + { + vec_res_value1_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value1), vec_A); + vec_res_value2_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value2), vec_A); + vec_res_value3_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value3), vec_A); + vec_res_value4_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value4), vec_A); + +#ifdef __aarch64__ + vec_res_value1 = wrapper::vcvta<PromotedType>(vec_res_value1_f); + vec_res_value2 = wrapper::vcvta<PromotedType>(vec_res_value2_f); + vec_res_value3 = wrapper::vcvta<PromotedType>(vec_res_value3_f); + vec_res_value4 = wrapper::vcvta<PromotedType>(vec_res_value4_f); +#else // defined(__aarch64__) + vec_res_value1 = wrapper::vcvt<PromotedType>(vec_res_value1_f); + vec_res_value2 = wrapper::vcvt<PromotedType>(vec_res_value2_f); + vec_res_value3 = wrapper::vcvt<PromotedType>(vec_res_value3_f); + vec_res_value4 = wrapper::vcvt<PromotedType>(vec_res_value4_f); +#endif // __aarch64__ + + const auto temp16x8t_1 = + wrapper::vcombine(wrapper::vqmovn(vec_res_value1), wrapper::vqmovn(vec_res_value2)); + const auto temp16x8t_2 = + wrapper::vcombine(wrapper::vqmovn(vec_res_value3), wrapper::vqmovn(vec_res_value4)); + auto res = wrapper::vcombine(wrapper::vqmovn(temp16x8t_1), wrapper::vqmovn(temp16x8t_2)); + + wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x), res); + break; + } + case ReductionOperation::PROD: + { + const auto offset32x4f_4 = + wrapper::vdup_n(static_cast<float>(iq_info.offset), wrapper::traits::vector_128_tag{}); + const auto iscale32x4f_4 = vinvq_f32(vdupq_n_f32(iq_info.scale)); + + //re-quantize + vec_res_value1_f = + wrapper::vadd(wrapper::vmul(vec_res_value1_f, iscale32x4f_4), offset32x4f_4); + vec_res_value2_f = + wrapper::vadd(wrapper::vmul(vec_res_value2_f, iscale32x4f_4), offset32x4f_4); + vec_res_value3_f = + wrapper::vadd(wrapper::vmul(vec_res_value3_f, iscale32x4f_4), offset32x4f_4); + vec_res_value4_f = + wrapper::vadd(wrapper::vmul(vec_res_value4_f, iscale32x4f_4), offset32x4f_4); + + vec_res_value1 = wrapper::vcvt<T>(vec_res_value1_f); + vec_res_value2 = wrapper::vcvt<T>(vec_res_value2_f); + vec_res_value3 = wrapper::vcvt<T>(vec_res_value3_f); + vec_res_value4 = wrapper::vcvt<T>(vec_res_value4_f); + + const auto temp16x8t_1 = + wrapper::vcombine(wrapper::vqmovn(vec_res_value1), wrapper::vqmovn(vec_res_value2)); + const auto temp16x8t_2 = + wrapper::vcombine(wrapper::vqmovn(vec_res_value3), wrapper::vqmovn(vec_res_value4)); + auto res = wrapper::vcombine(wrapper::vqmovn(temp16x8t_1), wrapper::vqmovn(temp16x8t_2)); + + wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x), res); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); + } + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + 
{ + float res_value = 0.f; + int32_t res_value_q = 0; + + switch (op) + { + case ReductionOperation::ARG_IDX_MAX: + case ReductionOperation::ARG_IDX_MIN: + case ReductionOperation::MIN: + case ReductionOperation::MAX: + { + res_value = *(input_ptr + x); + break; + } + case ReductionOperation::PROD: + { + res_value = static_cast<T>(1.0f); + break; + } + default: + { + res_value = static_cast<T>(0.0f); + break; + } + } + uint32_t res_idx = 0; + + for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) + { + const T *in_ptr = + reinterpret_cast<T *>(input.ptr() + x + in_info.strides_in_bytes()[axis] * dim); + switch (op) + { + case ReductionOperation::SUM: + { + res_value += *in_ptr; + break; + } + case ReductionOperation::MEAN_SUM: + { + res_value_q += *in_ptr; + break; + } + case ReductionOperation::SUM_SQUARE: + { + res_value += *in_ptr * *in_ptr; + break; + } + case ReductionOperation::PROD: + { + //de-quantize input + if (std::is_same<T, uint8_t>::value) + { + res_value *= dequantize_qasymm8(*in_ptr, iq_info); + } + else + { + res_value *= dequantize_qasymm8_signed(*in_ptr, iq_info); + } + break; + } + case ReductionOperation::ARG_IDX_MIN: + { + if (*in_ptr < res_value) + { + res_value = *in_ptr; + res_idx = dim; + } + break; + } + case ReductionOperation::ARG_IDX_MAX: + { + if (*in_ptr > res_value) + { + res_value = *in_ptr; + res_idx = dim; + } + break; + } + case ReductionOperation::MIN: + { + res_value = *in_ptr < res_value ? *in_ptr : res_value; + break; + } + case ReductionOperation::MAX: + { + res_value = *in_ptr > res_value ? *in_ptr : res_value; + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); + } + } + + switch (op) + { + case ReductionOperation::MEAN_SUM: + { + // Apply previously calculated coefficients (with rounding on aarch64) +#ifdef __aarch64__ + const int32_t res = + arm_compute::support::cpp11::round(A * (static_cast<float>(res_value_q)) + B); +#else // defined(__aarch64__) + const int32_t res = A * (static_cast<float>(res_value_q)) + B; +#endif // __aarch64__ + *reinterpret_cast<T *>(output.ptr() + x) = utils::cast::saturate_cast<T>(res); + break; + } + case ReductionOperation::SUM: + { + // Subtract accumulated offsets + res_value -= (in_info.dimension(axis) - 1) * iq_info.offset; + *reinterpret_cast<T *>(output.ptr() + x) = utils::cast::saturate_cast<T>(res_value); + break; + } + case ReductionOperation::PROD: + { + //re-quantize result + T res = 0; + if (std::is_same<T, uint8_t>::value) + { + res = quantize_qasymm8(res_value, iq_info); + } + else + { + res = quantize_qasymm8_signed(res_value, iq_info); + } + *(reinterpret_cast<T *>(output.ptr() + x)) = res; + break; + } + case ReductionOperation::ARG_IDX_MIN: + case ReductionOperation::ARG_IDX_MAX: + { + *(reinterpret_cast<uint32_t *>(output.ptr() + x * 4)) = res_idx; + break; + } + default: + *(reinterpret_cast<T *>(output.ptr() + x)) = res_value; + } + } + }, + input, output); + } +}; + +} // namespace arm_compute +#endif // ACL_SRC_CPU_KERNELS_REDUCTION_LAYER_GENERIC_NEON_IMPL_H diff --git a/src/cpu/kernels/reduction_layer/generic/neon/integer.cpp b/src/cpu/kernels/reduction_layer/generic/neon/integer.cpp new file mode 100644 index 0000000000..ad66b456ac --- /dev/null +++ b/src/cpu/kernels/reduction_layer/generic/neon/integer.cpp @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2024 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/cpu/kernels/reduction_layer/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void reduce_RedOpX_reduceX_S32_4(const Window &window, + const ITensor *input, + ITensor *output, + const ReductionOperation op) +{ + return Reducer<RedOpX<int32_t, 4>>::reduceX(window, input, output, RedOpX<int32_t, 4>(), op); +} + +void reduce_RedOpYZW_reduceY_S32_4(const Window &window, + const ITensor *input, + ITensor *output, + const ReductionOperation op) +{ + return Reducer<RedOpYZW<int32_t, 4>>::reduceY(window, input, output, RedOpYZW<int32_t, 4>(), op); +} +void reduce_RedOpYZW_reduceZ_S32_4(const Window &window, + const ITensor *input, + ITensor *output, + const ReductionOperation op) +{ + return Reducer<RedOpYZW<int32_t, 4>>::reduceZ(window, input, output, RedOpYZW<int32_t, 4>(), op); +} + +void reduce_RedOpYZW_reduceW_S32_4(const Window &window, + const ITensor *input, + ITensor *output, + const ReductionOperation op) +{ + return Reducer<RedOpYZW<int32_t, 4>>::reduceW(window, input, output, RedOpYZW<int32_t, 4>(), op); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/reduction_layer/generic/neon/list.h b/src/cpu/kernels/reduction_layer/generic/neon/list.h new file mode 100644 index 0000000000..947c28a130 --- /dev/null +++ b/src/cpu/kernels/reduction_layer/generic/neon/list.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_CPU_KERNELS_REDUCTION_LAYER_GENERIC_NEON_LIST_H +#define ACL_SRC_CPU_KERNELS_REDUCTION_LAYER_GENERIC_NEON_LIST_H + +#include "arm_compute/core/Helpers.h" + +namespace arm_compute +{ +namespace cpu +{ + +#define DECLARE_REDUCTION_KERNEL(func_name) \ + void func_name(const Window &window, const ITensor *in, ITensor *out, const ReductionOperation op) + +DECLARE_REDUCTION_KERNEL(reduce_RedOpYZW_complex_reduceZ_float32_4_2_SUM); +DECLARE_REDUCTION_KERNEL(reduce_RedOpX_reduceX_float32_4); +DECLARE_REDUCTION_KERNEL(reduce_RedOpYZW_reduceY_float32_4); +DECLARE_REDUCTION_KERNEL(reduce_RedOpYZW_reduceZ_float32_4); +DECLARE_REDUCTION_KERNEL(reduce_RedOpYZW_reduceW_float32_4); + +DECLARE_REDUCTION_KERNEL(reduce_RedOpX_reduceX_float16_8); +DECLARE_REDUCTION_KERNEL(reduce_RedOpYZW_reduceY_float16_8); +DECLARE_REDUCTION_KERNEL(reduce_RedOpYZW_reduceZ_float16_8); +DECLARE_REDUCTION_KERNEL(reduce_RedOpYZW_reduceW_float16_8); + +DECLARE_REDUCTION_KERNEL(reduce_RedOpX_reduceX_S32_4); +DECLARE_REDUCTION_KERNEL(reduce_RedOpYZW_reduceY_S32_4); +DECLARE_REDUCTION_KERNEL(reduce_RedOpYZW_reduceZ_S32_4); +DECLARE_REDUCTION_KERNEL(reduce_RedOpYZW_reduceW_S32_4); + +DECLARE_REDUCTION_KERNEL(reduce_RedOpX_reduceX_qasymm8); +DECLARE_REDUCTION_KERNEL(reduce_RedOpYZW_reduceY_qasymm8); +DECLARE_REDUCTION_KERNEL(reduce_RedOpYZW_reduceZ_qasymm8); +DECLARE_REDUCTION_KERNEL(reduce_RedOpYZW_reduceW_qasymm8); + +DECLARE_REDUCTION_KERNEL(reduce_RedOpX_reduceX_qasymm8_signed); +DECLARE_REDUCTION_KERNEL(reduce_RedOpYZW_reduceY_qasymm8_signed); +DECLARE_REDUCTION_KERNEL(reduce_RedOpYZW_reduceZ_qasymm8_signed); +DECLARE_REDUCTION_KERNEL(reduce_RedOpYZW_reduceW_qasymm8_signed); + +#undef DECLARE_REDUCTION_KERNEL +} // namespace cpu +} // namespace arm_compute +#endif // ACL_SRC_CPU_KERNELS_REDUCTION_LAYER_GENERIC_NEON_LIST_H diff --git a/src/cpu/kernels/reduction_layer/generic/neon/qasymm8.cpp b/src/cpu/kernels/reduction_layer/generic/neon/qasymm8.cpp new file mode 100644 index 0000000000..bc711c6855 --- /dev/null +++ b/src/cpu/kernels/reduction_layer/generic/neon/qasymm8.cpp @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "src/cpu/kernels/reduction_layer/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void reduce_RedOpX_reduceX_qasymm8(const Window &window, + const ITensor *input, + ITensor *output, + const ReductionOperation op) +{ + return Reducer<RedOpX_quantized<uint8_t>>::reduceX(window, input, output, RedOpX_quantized<uint8_t>(), op); +} + +void reduce_RedOpYZW_reduceY_qasymm8(const Window &window, + const ITensor *input, + ITensor *output, + const ReductionOperation op) +{ + return Reducer<RedOpYZW_quantized<uint8_t>>::reduceY(window, input, output, RedOpYZW_quantized<uint8_t>(), op); +} + +void reduce_RedOpYZW_reduceZ_qasymm8(const Window &window, + const ITensor *input, + ITensor *output, + const ReductionOperation op) +{ + return Reducer<RedOpYZW_quantized<uint8_t>>::reduceZ(window, input, output, RedOpYZW_quantized<uint8_t>(), op); +} + +void reduce_RedOpYZW_reduceW_qasymm8(const Window &window, + const ITensor *input, + ITensor *output, + const ReductionOperation op) +{ + return Reducer<RedOpYZW_quantized<uint8_t>>::reduceW(window, input, output, RedOpYZW_quantized<uint8_t>(), op); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/reduction_layer/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/reduction_layer/generic/neon/qasymm8_signed.cpp new file mode 100644 index 0000000000..10ac3d6715 --- /dev/null +++ b/src/cpu/kernels/reduction_layer/generic/neon/qasymm8_signed.cpp @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "src/cpu/kernels/reduction_layer/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void reduce_RedOpX_reduceX_qasymm8_signed(const Window &window, + const ITensor *input, + ITensor *output, + const ReductionOperation op) +{ + return Reducer<RedOpX_quantized<int8_t>>::reduceX(window, input, output, RedOpX_quantized<int8_t>(), op); +} + +void reduce_RedOpYZW_reduceY_qasymm8_signed(const Window &window, + const ITensor *input, + ITensor *output, + const ReductionOperation op) +{ + return Reducer<RedOpYZW_quantized<int8_t>>::reduceY(window, input, output, RedOpYZW_quantized<int8_t>(), op); +} + +void reduce_RedOpYZW_reduceZ_qasymm8_signed(const Window &window, + const ITensor *input, + ITensor *output, + const ReductionOperation op) +{ + return Reducer<RedOpYZW_quantized<int8_t>>::reduceZ(window, input, output, RedOpYZW_quantized<int8_t>(), op); +} + +void reduce_RedOpYZW_reduceW_qasymm8_signed(const Window &window, + const ITensor *input, + ITensor *output, + const ReductionOperation op) +{ + return Reducer<RedOpYZW_quantized<int8_t>>::reduceW(window, input, output, RedOpYZW_quantized<int8_t>(), op); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/roialign/generic/neon/fp16.cpp b/src/cpu/kernels/roialign/generic/neon/fp16.cpp new file mode 100644 index 0000000000..cf99830562 --- /dev/null +++ b/src/cpu/kernels/roialign/generic/neon/fp16.cpp @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) +#include "src/cpu/CpuTypes.h" +#include "src/cpu/kernels/roialign/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void neon_fp16_roialign(const ITensor *input, + ITensor *output, + const ITensor *rois, + ROIPoolingLayerInfo pool_info, + const Window &window, + const ThreadInfo &info) +{ + return roi_align<float16_t, float16_t>(input, output, rois, pool_info, window, info); +} +} // namespace cpu +} // namespace arm_compute +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/roialign/generic/neon/fp32.cpp b/src/cpu/kernels/roialign/generic/neon/fp32.cpp new file mode 100644 index 0000000000..c1dba99b5e --- /dev/null +++ b/src/cpu/kernels/roialign/generic/neon/fp32.cpp @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2022 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/roialign/generic/neon/impl.h" +namespace arm_compute +{ +namespace cpu +{ +void neon_fp32_roialign(const ITensor *input, + ITensor *output, + const ITensor *rois, + ROIPoolingLayerInfo pool_info, + const Window &window, + const ThreadInfo &info) +{ + return roi_align<float, float>(input, output, rois, pool_info, window, info); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/roialign/generic/neon/impl.h b/src/cpu/kernels/roialign/generic/neon/impl.h new file mode 100644 index 0000000000..db2f67705d --- /dev/null +++ b/src/cpu/kernels/roialign/generic/neon/impl.h @@ -0,0 +1,380 @@ +/* + * Copyright (c) 2019-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
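The roi_align_1x1 helper defined just below averages bilinearly interpolated samples over each pooling bin: for a sample point (x, y) it reads the four surrounding pixels and weights them by w1 = hy*hx, w2 = hy*lx, w3 = ly*hx and w4 = ly*lx, where lx = x - floor(x), ly = y - floor(y), hx = 1 - lx and hy = 1 - ly. A minimal stand-alone sketch of that interpolation step follows; the flat image layout and the example numbers are illustrative only.

#include <cstdio>

// Bilinear interpolation of one sample point (x, y) from a row-major
// [height x width] float image, using the same w1..w4 weighting as roi_align_1x1.
float bilinear_sample(const float *img, int width, float x, float y)
{
    const int x_low  = static_cast<int>(x);
    const int y_low  = static_cast<int>(y);
    const int x_high = x_low + 1;
    const int y_high = y_low + 1;

    const float lx = x - x_low, ly = y - y_low;
    const float hx = 1.f - lx, hy = 1.f - ly;

    const float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;

    return w1 * img[y_low * width + x_low] + w2 * img[y_low * width + x_high] +
           w3 * img[y_high * width + x_low] + w4 * img[y_high * width + x_high];
}

int main()
{
    // 2x2 image; sampling at (0.25, 0.75) weights the four pixels by
    // 0.1875, 0.0625, 0.5625 and 0.1875 respectively.
    const float img[] = {0.f, 10.f, 20.f, 30.f};
    std::printf("%f\n", bilinear_sample(img, 2, 0.25f, 0.75f)); // prints 17.500000
}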
+ */ +#ifndef SRC_CORE_SVE_KERNELS_BOUNDINGBOXTRANFORM_IMPL_H +#define SRC_CORE_SVE_KERNELS_BOUNDINGBOXTRANFORM_IMPL_H +#include "arm_compute/core/CPP/CPPTypes.h" +#include "arm_compute/core/Helpers.h" + +namespace arm_compute +{ +class ITensor; +class Window; +namespace cpu +{ +/** Average pooling over an aligned window */ +template <typename input_data_type> +inline input_data_type roi_align_1x1(const ITensor *input, + unsigned int roi_batch, + float region_start_x, + float bin_size_x, + int grid_size_x, + float region_end_x, + float region_start_y, + float bin_size_y, + int grid_size_y, + float region_end_y, + int pz) +{ + if ((region_end_x <= region_start_x) || (region_end_y <= region_start_y)) + { + return input_data_type(0); + } + else + { + const DataLayout data_layout = input->info()->data_layout(); + float avg = 0; + // Iterate through the aligned pooling region + for (int iy = 0; iy < grid_size_y; ++iy) + { + for (int ix = 0; ix < grid_size_x; ++ix) + { + // Align the window in the middle of every bin + float y = region_start_y + (iy + 0.5) * bin_size_y / float(grid_size_y); + float x = region_start_x + (ix + 0.5) * bin_size_x / float(grid_size_x); + + // Interpolation in the [0,0] [0,1] [1,0] [1,1] square + const int y_low = y; + const int x_low = x; + const int y_high = y_low + 1; + const int x_high = x_low + 1; + + const float ly = y - y_low; + const float lx = x - x_low; + const float hy = 1. - ly; + const float hx = 1. - lx; + + const float w1 = hy * hx; + const float w2 = hy * lx; + const float w3 = ly * hx; + const float w4 = ly * lx; + if (data_layout == DataLayout::NCHW) + { + const auto data1 = *reinterpret_cast<const input_data_type *>( + input->ptr_to_element(Coordinates(x_low, y_low, pz, roi_batch))); + const auto data2 = *reinterpret_cast<const input_data_type *>( + input->ptr_to_element(Coordinates(x_high, y_low, pz, roi_batch))); + const auto data3 = *reinterpret_cast<const input_data_type *>( + input->ptr_to_element(Coordinates(x_low, y_high, pz, roi_batch))); + const auto data4 = *reinterpret_cast<const input_data_type *>( + input->ptr_to_element(Coordinates(x_high, y_high, pz, roi_batch))); + avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4; + } + else + { + const auto data1 = *reinterpret_cast<const input_data_type *>( + input->ptr_to_element(Coordinates(pz, x_low, y_low, roi_batch))); + const auto data2 = *reinterpret_cast<const input_data_type *>( + input->ptr_to_element(Coordinates(pz, x_high, y_low, roi_batch))); + const auto data3 = *reinterpret_cast<const input_data_type *>( + input->ptr_to_element(Coordinates(pz, x_low, y_high, roi_batch))); + const auto data4 = *reinterpret_cast<const input_data_type *>( + input->ptr_to_element(Coordinates(pz, x_high, y_high, roi_batch))); + avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4; + } + } + } + + avg /= grid_size_x * grid_size_y; + return input_data_type(avg); + } +} + +/** Average pooling over an aligned window */ +template <typename input_data_type> +inline input_data_type roi_align_1x1_qasymm8(const ITensor *input, + unsigned int roi_batch, + float region_start_x, + float bin_size_x, + int grid_size_x, + float region_end_x, + float region_start_y, + float bin_size_y, + int grid_size_y, + float region_end_y, + int pz, + const QuantizationInfo &out_qinfo) +{ + if ((region_end_x <= region_start_x) || (region_end_y <= region_start_y)) + { + return input_data_type(out_qinfo.uniform().offset); + } + else + { + float avg = 0; + const UniformQuantizationInfo input_qinfo = 
input->info()->quantization_info().uniform(); + const bool is_qasymm_signed = is_data_type_quantized_asymmetric_signed(input->info()->data_type()); + const DataLayout data_layout = input->info()->data_layout(); + + // Iterate through the aligned pooling region + for (int iy = 0; iy < grid_size_y; ++iy) + { + for (int ix = 0; ix < grid_size_x; ++ix) + { + // Align the window in the middle of every bin + float y = region_start_y + (iy + 0.5) * bin_size_y / float(grid_size_y); + float x = region_start_x + (ix + 0.5) * bin_size_x / float(grid_size_x); + + // Interpolation in the [0,0] [0,1] [1,0] [1,1] square + const int y_low = y; + const int x_low = x; + const int y_high = y_low + 1; + const int x_high = x_low + 1; + + const float ly = y - y_low; + const float lx = x - x_low; + const float hy = 1. - ly; + const float hx = 1. - lx; + + const float w1 = hy * hx; + const float w2 = hy * lx; + const float w3 = ly * hx; + const float w4 = ly * lx; + + if (data_layout == DataLayout::NCHW) + { + if (is_qasymm_signed) + { + float data1 = + dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element( + Coordinates(x_low, y_low, pz, roi_batch))), + input_qinfo); + float data2 = + dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element( + Coordinates(x_high, y_low, pz, roi_batch))), + input_qinfo); + float data3 = + dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element( + Coordinates(x_low, y_high, pz, roi_batch))), + input_qinfo); + float data4 = + dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element( + Coordinates(x_high, y_high, pz, roi_batch))), + input_qinfo); + avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4; + } + else + { + float data1 = + dequantize_qasymm8(*reinterpret_cast<const input_data_type *>( + input->ptr_to_element(Coordinates(x_low, y_low, pz, roi_batch))), + input_qinfo); + float data2 = + dequantize_qasymm8(*reinterpret_cast<const input_data_type *>( + input->ptr_to_element(Coordinates(x_high, y_low, pz, roi_batch))), + input_qinfo); + float data3 = + dequantize_qasymm8(*reinterpret_cast<const input_data_type *>( + input->ptr_to_element(Coordinates(x_low, y_high, pz, roi_batch))), + input_qinfo); + float data4 = + dequantize_qasymm8(*reinterpret_cast<const input_data_type *>( + input->ptr_to_element(Coordinates(x_high, y_high, pz, roi_batch))), + input_qinfo); + avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4; + } + } + else + { + if (is_qasymm_signed) + { + const auto data1 = + dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element( + Coordinates(pz, x_low, y_low, roi_batch))), + input_qinfo); + const auto data2 = + dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element( + Coordinates(pz, x_high, y_low, roi_batch))), + input_qinfo); + const auto data3 = + dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element( + Coordinates(pz, x_low, y_high, roi_batch))), + input_qinfo); + const auto data4 = + dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element( + Coordinates(pz, x_high, y_high, roi_batch))), + input_qinfo); + avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4; + } + else + { + const auto data1 = + dequantize_qasymm8(*reinterpret_cast<const input_data_type *>( + input->ptr_to_element(Coordinates(pz, x_low, y_low, roi_batch))), + input_qinfo); + const auto data2 = + 
dequantize_qasymm8(*reinterpret_cast<const input_data_type *>( + input->ptr_to_element(Coordinates(pz, x_high, y_low, roi_batch))), + input_qinfo); + const auto data3 = + dequantize_qasymm8(*reinterpret_cast<const input_data_type *>( + input->ptr_to_element(Coordinates(pz, x_low, y_high, roi_batch))), + input_qinfo); + const auto data4 = + dequantize_qasymm8(*reinterpret_cast<const input_data_type *>( + input->ptr_to_element(Coordinates(pz, x_high, y_high, roi_batch))), + input_qinfo); + avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4; + } + } + } + } + + avg /= grid_size_x * grid_size_y; + + input_data_type res = 0; + if (is_qasymm_signed) + { + res = quantize_qasymm8_signed(avg, out_qinfo); + } + else + { + res = quantize_qasymm8(avg, out_qinfo); + } + return res; + } +} +inline float compute_region_coordinate(int p, float bin_size, float roi_anchor, float max_value) +{ + const float region_start = p * bin_size + roi_anchor; + return utility::clamp(region_start, 0.0f, max_value); +} + +template <typename input_data_type, typename roi_data_type> +void roi_align(const ITensor *input, + ITensor *output, + const ITensor *rois, + ROIPoolingLayerInfo pool_info, + const Window &window, + const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + + const DataLayout data_layout = input->info()->data_layout(); + const size_t values_per_roi = rois->info()->dimension(0); + + const int roi_list_start = window.x().start(); + const int roi_list_end = window.x().end(); + + const unsigned int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const unsigned int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const unsigned int idx_depth = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + + const int input_width = input->info()->dimension(idx_width); + const int input_height = input->info()->dimension(idx_height); + const int input_chanels = input->info()->dimension(idx_depth); + const int pooled_w = pool_info.pooled_width(); + const int pooled_h = pool_info.pooled_height(); + + const DataType data_type = input->info()->data_type(); + const bool is_qasymm = is_data_type_quantized_asymmetric(data_type); + + const auto *rois_ptr = reinterpret_cast<const roi_data_type *>(rois->buffer()); + const QuantizationInfo &rois_qinfo = rois->info()->quantization_info(); + for (int roi_indx = roi_list_start; roi_indx < roi_list_end; ++roi_indx) + { + const unsigned int roi_batch = rois_ptr[values_per_roi * roi_indx]; + + roi_data_type qx1 = rois_ptr[values_per_roi * roi_indx + 1]; + roi_data_type qy1 = rois_ptr[values_per_roi * roi_indx + 2]; + roi_data_type qx2 = rois_ptr[values_per_roi * roi_indx + 3]; + roi_data_type qy2 = rois_ptr[values_per_roi * roi_indx + 4]; + float x1(qx1); + float x2(qx2); + float y1(qy1); + float y2(qy2); + if (is_qasymm) + { + x1 = dequantize_qasymm16(qx1, rois_qinfo); + x2 = dequantize_qasymm16(qx2, rois_qinfo); + y1 = dequantize_qasymm16(qy1, rois_qinfo); + y2 = dequantize_qasymm16(qy2, rois_qinfo); + } + const float roi_anchor_x = x1 * pool_info.spatial_scale(); + const float roi_anchor_y = y1 * pool_info.spatial_scale(); + const float roi_dims_x = std::max((x2 - x1) * pool_info.spatial_scale(), 1.0f); + const float roi_dims_y = std::max((y2 - y1) * pool_info.spatial_scale(), 1.0f); + float bin_size_x = roi_dims_x / pool_info.pooled_width(); + float bin_size_y = roi_dims_y / pool_info.pooled_height(); + + // Iterate through all feature maps + for (int ch = 0; ch < 
input_chanels; ++ch) + { + // Iterate through all output pixels + for (int py = 0; py < pooled_h; ++py) + { + for (int px = 0; px < pooled_w; ++px) + { + const float region_start_x = compute_region_coordinate(px, bin_size_x, roi_anchor_x, input_width); + const float region_start_y = compute_region_coordinate(py, bin_size_y, roi_anchor_y, input_height); + const float region_end_x = compute_region_coordinate(px + 1, bin_size_x, roi_anchor_x, input_width); + const float region_end_y = + compute_region_coordinate(py + 1, bin_size_y, roi_anchor_y, input_height); + const int roi_bin_grid_x = + (pool_info.sampling_ratio() > 0) ? pool_info.sampling_ratio() : int(ceil(bin_size_x)); + const int roi_bin_grid_y = + (pool_info.sampling_ratio() > 0) ? pool_info.sampling_ratio() : int(ceil(bin_size_y)); + input_data_type out_val(0); + if (is_qasymm) + { + out_val = roi_align_1x1_qasymm8<input_data_type>( + input, roi_batch, region_start_x, bin_size_x, roi_bin_grid_x, region_end_x, region_start_y, + bin_size_y, roi_bin_grid_y, region_end_y, ch, output->info()->quantization_info()); + } + else + { + out_val = roi_align_1x1<input_data_type>(input, roi_batch, region_start_x, bin_size_x, + roi_bin_grid_x, region_end_x, region_start_y, + bin_size_y, roi_bin_grid_y, region_end_y, ch); + } + + if (data_layout == DataLayout::NCHW) + { + auto out_ptr = reinterpret_cast<input_data_type *>( + output->ptr_to_element(Coordinates(px, py, ch, roi_indx))); + *out_ptr = out_val; + } + else + { + auto out_ptr = reinterpret_cast<input_data_type *>( + output->ptr_to_element(Coordinates(ch, px, py, roi_indx))); + *out_ptr = out_val; + } + } + } + } + } +} +} // namespace cpu +} // namespace arm_compute +#endif //define SRC_CORE_SVE_KERNELS_BOUNDINGBOXTRANFORM_IMPL_H diff --git a/src/cpu/kernels/roialign/generic/neon/qasymm8.cpp b/src/cpu/kernels/roialign/generic/neon/qasymm8.cpp new file mode 100644 index 0000000000..11c5770f53 --- /dev/null +++ b/src/cpu/kernels/roialign/generic/neon/qasymm8.cpp @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
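The roi_align driver above turns every ROI into pooled_w x pooled_h bins: compute_region_coordinate() clamps p * bin_size + roi_anchor into the valid range, and the per-bin sampling grid is either the explicit sampling_ratio or ceil(bin_size) when the ratio is 0. For the quantized variants the same loop dequantizes each sample to float, averages, and requantizes with the output tensor's QuantizationInfo. A small worked sketch of the bin arithmetic, with made-up ROI numbers:

#include <algorithm>
#include <cmath>
#include <cstdio>

// Mirror of compute_region_coordinate(): clamp p * bin_size + roi_anchor into [0, max_value].
static float region_coordinate(int p, float bin_size, float roi_anchor, float max_value)
{
    return std::max(0.0f, std::min(p * bin_size + roi_anchor, max_value));
}

int main()
{
    // Hypothetical ROI: anchor x = 4.0 after spatial scaling, width 21.0, pooled to 7 bins.
    const float roi_anchor_x   = 4.0f;
    const float roi_dims_x     = 21.0f;
    const int   pooled_w       = 7;
    const float input_width    = 64.0f;
    const int   sampling_ratio = 0; // 0 -> derive the grid from the bin size

    const float bin_size_x = roi_dims_x / pooled_w; // 3.0
    const int   grid_x = (sampling_ratio > 0) ? sampling_ratio : static_cast<int>(std::ceil(bin_size_x)); // 3

    for (int px = 0; px < pooled_w; ++px)
    {
        const float start = region_coordinate(px, bin_size_x, roi_anchor_x, input_width);
        const float end   = region_coordinate(px + 1, bin_size_x, roi_anchor_x, input_width);
        // Each output pixel averages grid_x * grid_y bilinear samples taken inside [start, end).
        std::printf("bin %d: [%.1f, %.1f), %d samples per row\n", px, start, end, grid_x);
    }
}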
+ */ +#include "src/cpu/kernels/roialign/generic/neon/impl.h" +namespace arm_compute +{ +namespace cpu +{ +void neon_qu8_roialign(const ITensor *input, + ITensor *output, + const ITensor *rois, + ROIPoolingLayerInfo pool_info, + const Window &window, + const ThreadInfo &info) +{ + return roi_align<uint8_t, uint16_t>(input, output, rois, pool_info, window, info); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/roialign/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/roialign/generic/neon/qasymm8_signed.cpp new file mode 100644 index 0000000000..7f93cc87b3 --- /dev/null +++ b/src/cpu/kernels/roialign/generic/neon/qasymm8_signed.cpp @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/roialign/generic/neon/impl.h" +namespace arm_compute +{ +namespace cpu +{ +void neon_qs8_roialign(const ITensor *input, + ITensor *output, + const ITensor *rois, + ROIPoolingLayerInfo pool_info, + const Window &window, + const ThreadInfo &info) +{ + return roi_align<int8_t, uint16_t>(input, output, rois, pool_info, window, info); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/roialign/list.h b/src/cpu/kernels/roialign/list.h new file mode 100644 index 0000000000..fdb3c0050d --- /dev/null +++ b/src/cpu/kernels/roialign/list.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CORE_NEON_KERNELS_ROIALIGN_LIST_H +#define SRC_CORE_NEON_KERNELS_ROIALIGN_LIST_H +namespace arm_compute +{ +namespace cpu +{ +#define DECLARE_ROIALIGN_KERNEL(func_name) \ + void func_name(const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, \ + const Window &window, const ThreadInfo &info) +DECLARE_ROIALIGN_KERNEL(neon_fp32_roialign); +DECLARE_ROIALIGN_KERNEL(neon_fp16_roialign); +DECLARE_ROIALIGN_KERNEL(neon_qu8_roialign); +DECLARE_ROIALIGN_KERNEL(neon_qs8_roialign); +#undef DECLARE_ROIALIGN_KERNEL +} // namespace cpu +} // namespace arm_compute +#endif /* SRC_CORE_NEON_KERNELS_ROIALIGN_LIST_H */ diff --git a/src/cpu/kernels/scale/neon/fp16.cpp b/src/cpu/kernels/scale/neon/fp16.cpp new file mode 100644 index 0000000000..c8a7b7038e --- /dev/null +++ b/src/cpu/kernels/scale/neon/fp16.cpp @@ -0,0 +1,276 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
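The list.h header above declares every ROI Align kernel through one macro so all entry points keep an identical signature. Expanded by hand (this is simply the macro from that header applied to one name, matching the definition in fp32.cpp), DECLARE_ROIALIGN_KERNEL(neon_fp32_roialign); becomes:

void neon_fp32_roialign(const ITensor *input,
                        ITensor *output,
                        const ITensor *rois,
                        ROIPoolingLayerInfo pool_info,
                        const Window &window,
                        const ThreadInfo &info);

The per-type .cpp files then provide the matching definitions, so the prototypes for the fp32, fp16 and quantized variants cannot drift apart.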
+ */ + +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensorPack.h" +#include "arm_compute/core/Window.h" + +#include "src/core/helpers/ScaleHelpers.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/utils/ScaleUtils.h" +#include "src/cpu/kernels/scale/neon/list.h" +#include "support/Rounding.h" + +#include <arm_neon.h> +#include <cmath> +#include <cstddef> + +namespace arm_compute +{ +namespace +{ +void fp16_neon_scale_nearest(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + float sampling_offset, + bool align_corners, + const Window &window) +{ + const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; + const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; + const size_t in_stride_wc = in_stride_w * in_stride_c; + const size_t in_dim_h = src->info()->dimension(2); + + // Compute the ratio between source height and destination height + const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners); + const auto window_start_x = static_cast<int32_t>(window.x().start()); + const auto window_end_x = static_cast<int32_t>(window.x().end()); + const int window_step_x = 8; + + Window win(window); + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + Iterator out(dst, win); + + const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); + const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; + + execute_window_loop( + win, + [&](const Coordinates &id) + { + const int32_t offset = + *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; + const auto in_hi = static_cast<int>( + align_corners ? 
utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) + : std::floor((id.z() + sampling_offset) * hr)); + const int offset_row = in_hi * in_stride_wc; + int32_t x = window_start_x; + const float16_t *in_ptr = reinterpret_cast<const float16_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]); + + for (; x <= window_end_x - window_step_x; x += window_step_x) + { + wrapper::vstore(reinterpret_cast<float16_t *>(out.ptr()) + x, + wrapper::vloadq(in_ptr + offset + offset_row + x)); + } + for (; x < window_end_x; ++x) + { + *(reinterpret_cast<float16_t *>(out.ptr()) + x) = *(in_ptr + offset + offset_row + x); + } + }, + out); +} + +void fp16_neon_scale_bilinear(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) +{ + // Compute the ratio between source height and destination height + const auto hr = + scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); + + Iterator out(dst, window); + const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; + const int in_dim_w = src->info()->dimension(1); + const int in_dim_h = src->info()->dimension(2); + const int in_stride_wc = in_stride_c * (in_dim_w + src->info()->padding().top + src->info()->padding().bottom); + + // Don't increment in Y and Z direction for the input tensor + // A pointer to the start of this plane is needed as base for the precomputed offsets + Window win_in(window); + win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + win_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + Iterator in(src, win_in); + + if (border_mode == BorderMode::CONSTANT) + { + using ConstType = typename std::conditional<std::is_same<float16_t, float16_t>::value, half, float16_t>::type; + + const float16_t const_border_value = static_cast<float16_t>(constant_border_value.get<ConstType>()); + execute_window_loop( + window, + [&](const Coordinates &id) + { + const auto offset = + *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z()))); + const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); + const float16_t *in_ptr = + reinterpret_cast<const float16_t *>(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc; + + const auto a00 = + (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value; + const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) + ? *(in_ptr + in_stride_c) + : const_border_value; + const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) + ? *(in_ptr + in_stride_wc) + : const_border_value; + const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) + ? 
*(in_ptr + in_stride_c + in_stride_wc) + : const_border_value; + + *reinterpret_cast<float16_t *>(out.ptr()) = + static_cast<float16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); + }, + in, out); + } + else if (border_mode == BorderMode::REPLICATE) + { + execute_window_loop( + window, + [&](const Coordinates &id) + { + const auto offset = + *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z()))); + const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); + + auto clamped_w = utility::clamp<int>(offset, 0, in_dim_w - 1); + auto clamped_w1 = utility::clamp<int>(offset + 1, 0, in_dim_w - 1); + auto clamped_h = utility::clamp<int>(in_hi, 0, in_dim_h - 1); + auto clamped_h1 = utility::clamp<int>(in_hi + 1, 0, in_dim_h - 1); + + const auto a00 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w * in_stride_c + + clamped_h * in_stride_wc); + const auto a01 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w1 * in_stride_c + + clamped_h * in_stride_wc); + const auto a10 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w * in_stride_c + + clamped_h1 * in_stride_wc); + const auto a11 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w1 * in_stride_c + + clamped_h1 * in_stride_wc); + + *reinterpret_cast<float16_t *>(out.ptr()) = + static_cast<float16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); + }, + in, out); + } + else + { + ARM_COMPUTE_ERROR("Not implemented"); + } +} +} // namespace +namespace cpu +{ +#ifdef ENABLE_NCHW_KERNELS +void fp16_bilinear_neon_scale_nchw(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + InterpolationPolicy policy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) +{ + ARM_COMPUTE_UNUSED(policy); + arm_compute::cpu::scale_bilinear_nchw<float16_t>(src, dst, dx, dy, offsets, border_mode, constant_border_value, + sampling_offset, align_corners, window); +} + +void fp16_nearest_neon_scale_nchw(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + InterpolationPolicy policy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) +{ + ARM_COMPUTE_UNUSED(policy); + ARM_COMPUTE_UNUSED(border_mode); + arm_compute::cpu::scale_nearest_nchw<float16_t>(src, dst, dx, dy, offsets, constant_border_value, sampling_offset, + align_corners, window); +} +#endif // ENABLE_NCHW_KERNELS +void fp16_neon_scale(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + InterpolationPolicy policy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) +{ + if (policy == InterpolationPolicy::BILINEAR) + { + fp16_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, + align_corners, window); + } + else if (policy == InterpolationPolicy::NEAREST_NEIGHBOR) + { + fp16_neon_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); + } +} + +void fp16_common_neon_scale(const ITensor *src, + ITensor *dst, 
+ const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + InterpolationPolicy policy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) +{ + arm_compute::cpu::common_neon_scale<float16_t>(src, dst, offsets, dx, dy, policy, border_mode, + constant_border_value, sampling_offset, align_corners, window); +} + +} // namespace cpu +} // namespace arm_compute + +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/scale/neon/integer.cpp b/src/cpu/kernels/scale/neon/integer.cpp new file mode 100644 index 0000000000..bbf92e0412 --- /dev/null +++ b/src/cpu/kernels/scale/neon/integer.cpp @@ -0,0 +1,783 @@ +/* + * Copyright (c) 2021-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
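Both the fp16 kernels above and the integer kernels that follow compute the vertical source index the same way: hr comes from scale_utils::calculate_resize_ratio(), and the nearest-neighbour path picks in_row = floor((out_row + sampling_offset) * hr), switching to round-half-away-from-zero when align_corners is set. Below is a hedged, self-contained sketch of that mapping; the ratio formula is the common resize convention and is an assumption rather than a copy of ScaleUtils.h, and the sampling offset of 0.5 is just an example value.

#include <cmath>
#include <cstdio>

int main()
{
    const int   in_h = 8, out_h = 4;
    const bool  align_corners   = false;
    const float sampling_offset = 0.5f; // example: half-pixel centres
    // Assumed ratio convention: (in-1)/(out-1) with aligned corners, in/out otherwise.
    const float hr = align_corners ? float(in_h - 1) / float(out_h - 1) : float(in_h) / float(out_h);

    for (int out_row = 0; out_row < out_h; ++out_row)
    {
        const int in_row = static_cast<int>(std::floor((out_row + sampling_offset) * hr));
        std::printf("out row %d <- in row %d\n", out_row, in_row); // 0<-1, 1<-3, 2<-5, 3<-7
    }
}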
+ */ +#include "arm_compute/core/Helpers.h" + +#include "src/core/helpers/ScaleHelpers.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/utils/ScaleUtils.h" +#include "support/Rounding.h" + +#include <arm_neon.h> + +namespace arm_compute +{ +namespace +{ +void u8_neon_scale_nearest(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + float sampling_offset, + bool align_corners, + const Window &window) +{ + const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; + const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; + const size_t in_stride_wc = in_stride_w * in_stride_c; + const size_t in_dim_h = src->info()->dimension(2); + + // Compute the ratio between source height and destination height + const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners); + const auto window_start_x = static_cast<int32_t>(window.x().start()); + const auto window_end_x = static_cast<int32_t>(window.x().end()); + const int window_step_x = 16; + + Window win(window); + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + Iterator out(dst, win); + + const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); + const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; + + execute_window_loop( + win, + [&](const Coordinates &id) + { + const int32_t offset = + *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; + const auto in_hi = static_cast<int>( + align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) + : std::floor((id.z() + sampling_offset) * hr)); + const int offset_row = in_hi * in_stride_wc; + int32_t x = window_start_x; + const uint8_t *in_ptr = reinterpret_cast<const uint8_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]); + + for (; x <= window_end_x - window_step_x; x += window_step_x) + { + wrapper::vstore(reinterpret_cast<uint8_t *>(out.ptr()) + x, + wrapper::vloadq(in_ptr + offset + offset_row + x)); + } + for (; x < window_end_x; ++x) + { + *(reinterpret_cast<uint8_t *>(out.ptr()) + x) = *(in_ptr + offset + offset_row + x); + } + }, + out); +} + +void u8_neon_scale_bilinear(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) +{ + // Compute the ratio between source and destination dimensions + const float scale_x = + scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners); + const float scale_y = + scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); + + const int input_width = src->info()->dimension(1); + const int input_height = src->info()->dimension(2); + + if (border_mode == BorderMode::CONSTANT) + { + Iterator out(dst, window); + const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; + const int in_stride_wc = + in_stride_c * (input_width + src->info()->padding().top + src->info()->padding().bottom); + + // Don't increment in Y and Z direction for the input tensor + // A pointer to the start of this plane is needed as base for the precomputed offsets + Window win_in(window); + win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); 
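+ // The precomputed tensors are indexed by output (x, y): "offsets" holds the left input
+ // column for each output element, while "dx"/"dy" carry the horizontal and vertical
+ // fractional weights that are passed to scale_helpers::delta_bilinear() below.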
+ win_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + Iterator in(src, win_in); + + const uint8_t const_border_value = static_cast<uint8_t>(constant_border_value.get<uint8_t>()); + execute_window_loop( + window, + [&](const Coordinates &id) + { + const auto offset = + *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z()))); + const int32_t in_hi = std::floor((id.z() + sampling_offset) * scale_y - sampling_offset); + const uint8_t *in_ptr = + reinterpret_cast<const uint8_t *>(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc; + + const auto a00 = (0 <= offset && offset < input_width && 0 <= in_hi && in_hi < input_height) + ? *in_ptr + : const_border_value; + const auto a01 = (-1 <= offset && offset < input_width - 1 && 0 <= in_hi && in_hi < input_height) + ? *(in_ptr + in_stride_c) + : const_border_value; + const auto a10 = (0 <= offset && offset < input_width && -1 <= in_hi && in_hi < input_height - 1) + ? *(in_ptr + in_stride_wc) + : const_border_value; + const auto a11 = (-1 <= offset && offset < input_width - 1 && -1 <= in_hi && in_hi < input_height - 1) + ? *(in_ptr + in_stride_c + in_stride_wc) + : const_border_value; + + *reinterpret_cast<uint8_t *>(out.ptr()) = + static_cast<uint8_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); + }, + in, out); + } + else if (border_mode == BorderMode::REPLICATE) + { + using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<float, wrapper::traits::BitWidth::W128>; + + const int in_stride_x = src->info()->strides_in_bytes()[1]; + const int in_stride_y = src->info()->strides_in_bytes()[2]; + const int in_stride_b = src->info()->strides_in_bytes()[3]; + const int out_stride_x = dst->info()->strides_in_bytes()[1]; + const int out_stride_y = dst->info()->strides_in_bytes()[2]; + const int out_stride_b = dst->info()->strides_in_bytes()[3]; + + const int out_dim_ch = dst->info()->dimension(0); + constexpr int step_cout = 16; + + Window window_execution = window; + window_execution.set(Window::DimX, Window::Dimension(0, 1, 1)); + Window win_in_out(window); + win_in_out.set(Window::DimY, Window::Dimension(0, 0, 0)); + win_in_out.set(Window::DimZ, Window::Dimension(0, 0, 0)); + Iterator in(src, win_in_out); + Iterator out(dst, win_in_out); + + const int xo_start = window_execution[1].start(); + const int xo_end = window_execution[1].end(); + const int xo_step = window_execution[1].step(); + const int yo_start = window_execution[2].start(); + const int yo_end = window_execution[2].end(); + const int yo_step = window_execution[2].step(); + const int bo_start = window_execution[3].start(); + const int bo_end = window_execution[3].end(); + const int bo_step = window_execution[3].step(); + + const float fp_coord_offset_y = sampling_offset * (scale_y - 1); + const float fp_coord_offset_x = sampling_offset * (scale_x - 1); + + for (int bo = bo_start; bo < bo_end; bo += bo_step) + { + const uint8_t *in_ptr = in.ptr() + bo * in_stride_b; + uint8_t *out_ptr = out.ptr() + bo * out_stride_b; + + for (int yo = yo_start; yo < yo_end; yo += yo_step) + { + // Floating-point coordinate + const float yi_f = yo * scale_y + fp_coord_offset_y; + // Integer coordinate + const int yi = static_cast<int>(std::floor(yi_f)); + // Weight for the y coordinate + const float a1 = (yi_f - 
static_cast<float>(yi)); + const float b1 = (1.f - a1); + + const int yi0 = utility::clamp<int>(yi, 0, input_height - 1); + const int yi1 = utility::clamp<int>(yi + 1, 0, input_height - 1); + + const uint8_t *in_ptr_yi0 = in_ptr + yi0 * in_stride_y; + const uint8_t *in_ptr_yi1 = in_ptr + yi1 * in_stride_y; + + uint8_t *out_ptr_yo = out_ptr + yo * out_stride_y; + for (int xo = xo_start; xo < xo_end; xo += xo_step) + { + // Floating-point coordinate + const float xi_f = xo * scale_x + fp_coord_offset_x; + // Integer coordinate + const int xi = static_cast<int>(std::floor(xi_f)); + // Weight for the x coordinate + const float a = (xi_f - static_cast<float>(xi)); + const float b = (1.f - a); + + const float s00_s = b * b1; + const float s01_s = a * b1; + const float s10_s = b * a1; + const float s11_s = a * a1; + + const auto s00 = wrapper::vdup_n(s00_s, ExactTagType{}); + const auto s01 = wrapper::vdup_n(s01_s, ExactTagType{}); + const auto s10 = wrapper::vdup_n(s10_s, ExactTagType{}); + const auto s11 = wrapper::vdup_n(s11_s, ExactTagType{}); + + const int xi0 = utility::clamp<int>(xi, 0, input_width - 1); + const int xi1 = utility::clamp<int>(xi + 1, 0, input_width - 1); + + const auto in_ptr_xi0_yi0 = in_ptr_yi0 + xi0 * in_stride_x; + const auto in_ptr_xi1_yi0 = in_ptr_yi0 + xi1 * in_stride_x; + const auto in_ptr_xi0_yi1 = in_ptr_yi1 + xi0 * in_stride_x; + const auto in_ptr_xi1_yi1 = in_ptr_yi1 + xi1 * in_stride_x; + + uint8_t *out_ptr_xo_yo = out_ptr_yo + xo * out_stride_x; + + int cout = 0; + for (; cout <= (out_dim_ch - step_cout); cout += step_cout) + { + const auto in00 = wrapper::vloadq(in_ptr_xi0_yi0 + cout * sizeof(uint8_t)); + const auto in01 = wrapper::vloadq(in_ptr_xi1_yi0 + cout * sizeof(uint8_t)); + const auto in10 = wrapper::vloadq(in_ptr_xi0_yi1 + cout * sizeof(uint8_t)); + const auto in11 = wrapper::vloadq(in_ptr_xi1_yi1 + cout * sizeof(uint8_t)); + + const uint16x8_t in00_low = wrapper::vmovl(wrapper::vgetlow(in00)); + const uint16x8_t in00_high = wrapper::vmovl(wrapper::vgethigh(in00)); + + const auto in00_0 = wrapper::vcvt<float>(wrapper::vmovl(wrapper::vgetlow(in00_low))); + const auto in00_1 = wrapper::vcvt<float>(wrapper::vmovl(wrapper::vgethigh(in00_low))); + const auto in00_2 = wrapper::vcvt<float>(wrapper::vmovl(wrapper::vgetlow(in00_high))); + const auto in00_3 = wrapper::vcvt<float>(wrapper::vmovl(wrapper::vgethigh(in00_high))); + + const uint16x8_t in01_low = wrapper::vmovl(wrapper::vgetlow(in01)); + const uint16x8_t in01_high = wrapper::vmovl(wrapper::vgethigh(in01)); + + const auto in01_0 = wrapper::vcvt<float>(wrapper::vmovl(wrapper::vgetlow(in01_low))); + const auto in01_1 = wrapper::vcvt<float>(wrapper::vmovl(wrapper::vgethigh(in01_low))); + const auto in01_2 = wrapper::vcvt<float>(wrapper::vmovl(wrapper::vgetlow(in01_high))); + const auto in01_3 = wrapper::vcvt<float>(wrapper::vmovl(wrapper::vgethigh(in01_high))); + + const uint16x8_t in10_low = wrapper::vmovl(wrapper::vgetlow(in10)); + const uint16x8_t in10_high = wrapper::vmovl(wrapper::vgethigh(in10)); + + const auto in10_0 = wrapper::vcvt<float>(wrapper::vmovl(wrapper::vgetlow(in10_low))); + const auto in10_1 = wrapper::vcvt<float>(wrapper::vmovl(wrapper::vgethigh(in10_low))); + const auto in10_2 = wrapper::vcvt<float>(wrapper::vmovl(wrapper::vgetlow(in10_high))); + const auto in10_3 = wrapper::vcvt<float>(wrapper::vmovl(wrapper::vgethigh(in10_high))); + + const uint16x8_t in11_low = wrapper::vmovl(wrapper::vgetlow(in11)); + const uint16x8_t in11_high = wrapper::vmovl(wrapper::vgethigh(in11)); + + 
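+ // Each 16-byte load has been widened u8 -> u16 -> u32 and is converted to four
+ // float32x4 vectors so the bilinear weights s00..s11 can be applied with vmul/vmla;
+ // the sums are then rounded (vcvta on AArch64), narrowed back to u8 and stored.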
const auto in11_0 = wrapper::vcvt<float>(wrapper::vmovl(wrapper::vgetlow(in11_low))); + const auto in11_1 = wrapper::vcvt<float>(wrapper::vmovl(wrapper::vgethigh(in11_low))); + const auto in11_2 = wrapper::vcvt<float>(wrapper::vmovl(wrapper::vgetlow(in11_high))); + const auto in11_3 = wrapper::vcvt<float>(wrapper::vmovl(wrapper::vgethigh(in11_high))); + + auto out_0 = wrapper::vmul(in00_0, s00); + out_0 = wrapper::vmla(out_0, in01_0, s01); + out_0 = wrapper::vmla(out_0, in10_0, s10); + out_0 = wrapper::vmla(out_0, in11_0, s11); + + auto out_1 = wrapper::vmul(in00_1, s00); + out_1 = wrapper::vmla(out_1, in01_1, s01); + out_1 = wrapper::vmla(out_1, in10_1, s10); + out_1 = wrapper::vmla(out_1, in11_1, s11); + + auto out_2 = wrapper::vmul(in00_2, s00); + out_2 = wrapper::vmla(out_2, in01_2, s01); + out_2 = wrapper::vmla(out_2, in10_2, s10); + out_2 = wrapper::vmla(out_2, in11_2, s11); + + auto out_3 = wrapper::vmul(in00_3, s00); + out_3 = wrapper::vmla(out_3, in01_3, s01); + out_3 = wrapper::vmla(out_3, in10_3, s10); + out_3 = wrapper::vmla(out_3, in11_3, s11); + +#if defined(__aarch64__) && !defined(BARE_METAL) + const auto out_0_int = wrapper::vcvta<uint32_t>(out_0); + const auto out_1_int = wrapper::vcvta<uint32_t>(out_1); + const auto out_2_int = wrapper::vcvta<uint32_t>(out_2); + const auto out_3_int = wrapper::vcvta<uint32_t>(out_3); +#else // defined(__aarch64__) && !defined(BARE_METAL) + const auto out_0_int = wrapper::vcvt<uint32_t>(out_0); + const auto out_1_int = wrapper::vcvt<uint32_t>(out_1); + const auto out_2_int = wrapper::vcvt<uint32_t>(out_2); + const auto out_3_int = wrapper::vcvt<uint32_t>(out_3); +#endif // defined(__aarch64__) && !defined(BARE_METAL) + const auto low_part = + wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_0_int), wrapper::vqmovn(out_1_int))); + const auto high_part = + wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_2_int), wrapper::vqmovn(out_3_int))); + const auto out = wrapper::vcombine(low_part, high_part); + + wrapper::vstore(out_ptr_xo_yo + cout * sizeof(uint8_t), out); + } + + for (; cout < out_dim_ch; ++cout) + { + const uint8_t in00 = *(in_ptr_xi0_yi0 + cout * sizeof(uint8_t)); + const uint8_t in01 = *(in_ptr_xi1_yi0 + cout * sizeof(uint8_t)); + const uint8_t in10 = *(in_ptr_xi0_yi1 + cout * sizeof(uint8_t)); + const uint8_t in11 = *(in_ptr_xi1_yi1 + cout * sizeof(uint8_t)); + + float out0 = in00 * s00_s; + out0 += in01 * s01_s; + out0 += in10 * s10_s; + out0 += in11 * s11_s; + + // Rounding modes of vector and scalar loops should match +#if defined(__aarch64__) && !defined(BARE_METAL) + *(out_ptr_xo_yo + cout * sizeof(uint8_t)) = static_cast<uint8_t>(std::round(out0)); +#else // defined(__aarch64__) && !defined(BARE_METAL) + *(out_ptr_xo_yo + cout * sizeof(uint8_t)) = static_cast<uint8_t>(out0); +#endif // defined(__aarch64__) && !defined(BARE_METAL) + } + } + } + } + } + else + { + ARM_COMPUTE_ERROR("Not implemented"); + } +} + +void s8_neon_scale_bilinear(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) +{ + ARM_COMPUTE_UNUSED(dx, dy, offsets, constant_border_value); + if (border_mode == BorderMode::REPLICATE) + { + using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<float, wrapper::traits::BitWidth::W128>; + + // Compute the ratio between source and destination dimensions + const float scale_x = + 
scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners); + const float scale_y = + scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); + + const int in_stride_x = src->info()->strides_in_bytes()[1]; + const int in_stride_y = src->info()->strides_in_bytes()[2]; + const int in_stride_b = src->info()->strides_in_bytes()[3]; + const int out_stride_x = dst->info()->strides_in_bytes()[1]; + const int out_stride_y = dst->info()->strides_in_bytes()[2]; + const int out_stride_b = dst->info()->strides_in_bytes()[3]; + const int input_width = src->info()->dimension(1); + const int input_height = src->info()->dimension(2); + const int out_dim_ch = dst->info()->dimension(0); + constexpr int step_cout = 16; + + Window window_execution = window; + window_execution.set(Window::DimX, Window::Dimension(0, 1, 1)); + Window win_in_out(window); + win_in_out.set(Window::DimY, Window::Dimension(0, 0, 0)); + win_in_out.set(Window::DimZ, Window::Dimension(0, 0, 0)); + Iterator in(src, win_in_out); + Iterator out(dst, win_in_out); + + const int xo_start = window_execution[1].start(); + const int xo_end = window_execution[1].end(); + const int xo_step = window_execution[1].step(); + const int yo_start = window_execution[2].start(); + const int yo_end = window_execution[2].end(); + const int yo_step = window_execution[2].step(); + const int bo_start = window_execution[3].start(); + const int bo_end = window_execution[3].end(); + const int bo_step = window_execution[3].step(); + + const float fp_coord_offset_y = sampling_offset * (scale_y - 1); + const float fp_coord_offset_x = sampling_offset * (scale_x - 1); + + for (int bo = bo_start; bo < bo_end; bo += bo_step) + { + const int8_t *in_ptr = reinterpret_cast<int8_t *>(in.ptr() + bo * in_stride_b); + int8_t *out_ptr = reinterpret_cast<int8_t *>(out.ptr() + bo * out_stride_b); + + for (int yo = yo_start; yo < yo_end; yo += yo_step) + { + // Floating-point coordinate + const float yi_f = yo * scale_y + fp_coord_offset_y; + // Integer coordinate + const int yi = static_cast<int>(std::floor(yi_f)); + // Weight for the y coordinate + const float a1 = (yi_f - static_cast<float>(yi)); + const float b1 = (1.f - a1); + + const int yi0 = utility::clamp<int>(yi, 0, input_height - 1); + const int yi1 = utility::clamp<int>(yi + 1, 0, input_height - 1); + + const int8_t *in_ptr_yi0 = in_ptr + yi0 * in_stride_y; + const int8_t *in_ptr_yi1 = in_ptr + yi1 * in_stride_y; + + int8_t *out_ptr_yo = out_ptr + yo * out_stride_y; + for (int xo = xo_start; xo < xo_end; xo += xo_step) + { + // Floating-point coordinate + const float xi_f = xo * scale_x + fp_coord_offset_x; + // Integer coordinate + const int xi = static_cast<int>(std::floor(xi_f)); + // Weight for the x coordinate + const float a = (xi_f - static_cast<float>(xi)); + const float b = (1.f - a); + + const float s00_s = b * b1; + const float s01_s = a * b1; + const float s10_s = b * a1; + const float s11_s = a * a1; + + const auto s00 = wrapper::vdup_n(s00_s, ExactTagType{}); + const auto s01 = wrapper::vdup_n(s01_s, ExactTagType{}); + const auto s10 = wrapper::vdup_n(s10_s, ExactTagType{}); + const auto s11 = wrapper::vdup_n(s11_s, ExactTagType{}); + + const int xi0 = utility::clamp<int>(xi, 0, input_width - 1); + const int xi1 = utility::clamp<int>(xi + 1, 0, input_width - 1); + + const auto in_ptr_xi0_yi0 = in_ptr_yi0 + xi0 * in_stride_x; + const auto in_ptr_xi1_yi0 = in_ptr_yi0 + xi1 * in_stride_x; + const auto 
in_ptr_xi0_yi1 = in_ptr_yi1 + xi0 * in_stride_x; + const auto in_ptr_xi1_yi1 = in_ptr_yi1 + xi1 * in_stride_x; + + int8_t *out_ptr_xo_yo = out_ptr_yo + xo * out_stride_x; + + int cout = 0; + for (; cout <= (out_dim_ch - step_cout); cout += step_cout) + { + const auto in00 = wrapper::vloadq(in_ptr_xi0_yi0 + cout * sizeof(int8_t)); + const auto in01 = wrapper::vloadq(in_ptr_xi1_yi0 + cout * sizeof(int8_t)); + const auto in10 = wrapper::vloadq(in_ptr_xi0_yi1 + cout * sizeof(int8_t)); + const auto in11 = wrapper::vloadq(in_ptr_xi1_yi1 + cout * sizeof(int8_t)); + + const int16x8_t in00_low = wrapper::vmovl(wrapper::vgetlow(in00)); + const int16x8_t in00_high = wrapper::vmovl(wrapper::vgethigh(in00)); + + const auto in00_0 = wrapper::vcvt<float>(wrapper::vmovl(wrapper::vgetlow(in00_low))); + const auto in00_1 = wrapper::vcvt<float>(wrapper::vmovl(wrapper::vgethigh(in00_low))); + const auto in00_2 = wrapper::vcvt<float>(wrapper::vmovl(wrapper::vgetlow(in00_high))); + const auto in00_3 = wrapper::vcvt<float>(wrapper::vmovl(wrapper::vgethigh(in00_high))); + + const int16x8_t in01_low = wrapper::vmovl(wrapper::vgetlow(in01)); + const int16x8_t in01_high = wrapper::vmovl(wrapper::vgethigh(in01)); + + const auto in01_0 = wrapper::vcvt<float>(wrapper::vmovl(wrapper::vgetlow(in01_low))); + const auto in01_1 = wrapper::vcvt<float>(wrapper::vmovl(wrapper::vgethigh(in01_low))); + const auto in01_2 = wrapper::vcvt<float>(wrapper::vmovl(wrapper::vgetlow(in01_high))); + const auto in01_3 = wrapper::vcvt<float>(wrapper::vmovl(wrapper::vgethigh(in01_high))); + + const int16x8_t in10_low = wrapper::vmovl(wrapper::vgetlow(in10)); + const int16x8_t in10_high = wrapper::vmovl(wrapper::vgethigh(in10)); + + const auto in10_0 = wrapper::vcvt<float>(wrapper::vmovl(wrapper::vgetlow(in10_low))); + const auto in10_1 = wrapper::vcvt<float>(wrapper::vmovl(wrapper::vgethigh(in10_low))); + const auto in10_2 = wrapper::vcvt<float>(wrapper::vmovl(wrapper::vgetlow(in10_high))); + const auto in10_3 = wrapper::vcvt<float>(wrapper::vmovl(wrapper::vgethigh(in10_high))); + + const int16x8_t in11_low = wrapper::vmovl(wrapper::vgetlow(in11)); + const int16x8_t in11_high = wrapper::vmovl(wrapper::vgethigh(in11)); + + const auto in11_0 = wrapper::vcvt<float>(wrapper::vmovl(wrapper::vgetlow(in11_low))); + const auto in11_1 = wrapper::vcvt<float>(wrapper::vmovl(wrapper::vgethigh(in11_low))); + const auto in11_2 = wrapper::vcvt<float>(wrapper::vmovl(wrapper::vgetlow(in11_high))); + const auto in11_3 = wrapper::vcvt<float>(wrapper::vmovl(wrapper::vgethigh(in11_high))); + + auto out_0 = wrapper::vmul(in00_0, s00); + out_0 = wrapper::vmla(out_0, in01_0, s01); + out_0 = wrapper::vmla(out_0, in10_0, s10); + out_0 = wrapper::vmla(out_0, in11_0, s11); + + auto out_1 = wrapper::vmul(in00_1, s00); + out_1 = wrapper::vmla(out_1, in01_1, s01); + out_1 = wrapper::vmla(out_1, in10_1, s10); + out_1 = wrapper::vmla(out_1, in11_1, s11); + + auto out_2 = wrapper::vmul(in00_2, s00); + out_2 = wrapper::vmla(out_2, in01_2, s01); + out_2 = wrapper::vmla(out_2, in10_2, s10); + out_2 = wrapper::vmla(out_2, in11_2, s11); + + auto out_3 = wrapper::vmul(in00_3, s00); + out_3 = wrapper::vmla(out_3, in01_3, s01); + out_3 = wrapper::vmla(out_3, in10_3, s10); + out_3 = wrapper::vmla(out_3, in11_3, s11); + +#if defined(__aarch64__) && !defined(BARE_METAL) + const auto out_0_int = wrapper::vcvta<int32_t>(out_0); + const auto out_1_int = wrapper::vcvta<int32_t>(out_1); + const auto out_2_int = wrapper::vcvta<int32_t>(out_2); + const auto out_3_int = 
wrapper::vcvta<int32_t>(out_3); +#else // defined(__aarch64__) && !defined(BARE_METAL) + const auto out_0_int = wrapper::vcvt<int32_t>(out_0); + const auto out_1_int = wrapper::vcvt<int32_t>(out_1); + const auto out_2_int = wrapper::vcvt<int32_t>(out_2); + const auto out_3_int = wrapper::vcvt<int32_t>(out_3); +#endif // defined(__aarch64__) && !defined(BARE_METAL) + const auto low_part = + wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_0_int), wrapper::vqmovn(out_1_int))); + const auto high_part = + wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_2_int), wrapper::vqmovn(out_3_int))); + const auto out = wrapper::vcombine(low_part, high_part); + + wrapper::vstore(out_ptr_xo_yo + cout * sizeof(int8_t), out); + } + + for (; cout < out_dim_ch; ++cout) + { + const int8_t in00 = *(in_ptr_xi0_yi0 + cout * sizeof(int8_t)); + const int8_t in01 = *(in_ptr_xi1_yi0 + cout * sizeof(int8_t)); + const int8_t in10 = *(in_ptr_xi0_yi1 + cout * sizeof(int8_t)); + const int8_t in11 = *(in_ptr_xi1_yi1 + cout * sizeof(int8_t)); + + float out0 = in00 * s00_s; + out0 += in01 * s01_s; + out0 += in10 * s10_s; + out0 += in11 * s11_s; + + // Rounding modes of vector and scalar loops should match +#if defined(__aarch64__) && !defined(BARE_METAL) + *(out_ptr_xo_yo + cout * sizeof(int8_t)) = static_cast<int8_t>(std::round(out0)); +#else // defined(__aarch64__) && !defined(BARE_METAL) + *(out_ptr_xo_yo + cout * sizeof(int8_t)) = static_cast<int8_t>(out0); +#endif // defined(__aarch64__) && !defined(BARE_METAL) + } + } + } + } + } + else + { + ARM_COMPUTE_ERROR("Not implemented"); + } +} + +void s16_neon_scale_nearest(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + float sampling_offset, + bool align_corners, + const Window &window) +{ + const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; + const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; + const size_t in_stride_wc = in_stride_w * in_stride_c; + const size_t in_dim_h = src->info()->dimension(2); + + // Compute the ratio between source height and destination height + const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners); + const auto window_start_x = static_cast<int32_t>(window.x().start()); + const auto window_end_x = static_cast<int32_t>(window.x().end()); + const int window_step_x = 8; + + Window win(window); + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + Iterator out(dst, win); + + const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); + const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; + + execute_window_loop( + win, + [&](const Coordinates &id) + { + const int32_t offset = + *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; + const auto in_hi = static_cast<int>( + align_corners ? 
utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) + : std::floor((id.z() + sampling_offset) * hr)); + const int offset_row = in_hi * in_stride_wc; + int32_t x = window_start_x; + const int16_t *in_ptr = reinterpret_cast<const int16_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]); + + for (; x <= window_end_x - window_step_x; x += window_step_x) + { + wrapper::vstore(reinterpret_cast<int16_t *>(out.ptr()) + x, + wrapper::vloadq(in_ptr + offset + offset_row + x)); + } + for (; x < window_end_x; ++x) + { + *(reinterpret_cast<int16_t *>(out.ptr()) + x) = *(in_ptr + offset + offset_row + x); + } + }, + out); +} + +void s16_neon_scale_bilinear(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) +{ + // Compute the ratio between source height and destination height + const auto hr = + scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); + + Iterator out(dst, window); + const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; + const int in_dim_w = src->info()->dimension(1); + const int in_dim_h = src->info()->dimension(2); + const int in_stride_wc = in_stride_c * (in_dim_w + src->info()->padding().top + src->info()->padding().bottom); + + // Don't increment in Y and Z direction for the input tensor + // A pointer to the start of this plane is needed as base for the precomputed offsets + Window win_in(window); + win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + win_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + Iterator in(src, win_in); + + if (border_mode == BorderMode::CONSTANT) + { + const int16_t const_border_value = static_cast<int16_t>(constant_border_value.get<int16_t>()); + execute_window_loop( + window, + [&](const Coordinates &id) + { + const auto offset = + *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z()))); + const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); + const int16_t *in_ptr = + reinterpret_cast<const int16_t *>(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc; + + const auto a00 = + (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value; + const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) + ? *(in_ptr + in_stride_c) + : const_border_value; + const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) + ? *(in_ptr + in_stride_wc) + : const_border_value; + const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) + ? 
*(in_ptr + in_stride_c + in_stride_wc) + : const_border_value; + + *reinterpret_cast<int16_t *>(out.ptr()) = + static_cast<int16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); + }, + in, out); + } + else if (border_mode == BorderMode::REPLICATE) + { + execute_window_loop( + window, + [&](const Coordinates &id) + { + const auto offset = + *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z()))); + const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); + + const auto clamped_w = utility::clamp<int>(offset, 0, in_dim_w - 1); + const auto clamped_w1 = utility::clamp<int>(offset + 1, 0, in_dim_w - 1); + const auto clamped_h = utility::clamp<int>(in_hi, 0, in_dim_h - 1); + const auto clamped_h1 = utility::clamp<int>(in_hi + 1, 0, in_dim_h - 1); + + const auto a00 = + *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc); + const auto a01 = *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w1 * in_stride_c + + clamped_h * in_stride_wc); + const auto a10 = *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w * in_stride_c + + clamped_h1 * in_stride_wc); + const auto a11 = *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w1 * in_stride_c + + clamped_h1 * in_stride_wc); + + *reinterpret_cast<int16_t *>(out.ptr()) = + static_cast<int16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); + }, + in, out); + } + else + { + ARM_COMPUTE_ERROR("Not implemented"); + } +} +} // namespace +namespace cpu +{ +void s8_neon_scale(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + InterpolationPolicy policy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) +{ + if (policy == InterpolationPolicy::BILINEAR) + { + s8_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, + align_corners, window); + } + else + { + ARM_COMPUTE_ERROR("Not implemented"); + } +} + +void u8_neon_scale(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + InterpolationPolicy policy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) +{ + if (policy == InterpolationPolicy::BILINEAR) + { + u8_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, + align_corners, window); + } + else if (policy == InterpolationPolicy::NEAREST_NEIGHBOR) + { + u8_neon_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); + } +} + +void s16_neon_scale(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + InterpolationPolicy policy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) +{ + if (policy == InterpolationPolicy::BILINEAR) + { + s16_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, + align_corners, window); + } + else if (policy == InterpolationPolicy::NEAREST_NEIGHBOR) + { + s16_neon_scale_nearest(src, dst, offsets, 
sampling_offset, align_corners, window); + } +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/scale/neon/list.h b/src/cpu/kernels/scale/neon/list.h new file mode 100644 index 0000000000..153dc67c3d --- /dev/null +++ b/src/cpu/kernels/scale/neon/list.h @@ -0,0 +1,617 @@ +/* + * Copyright (c) 2021-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_CPU_KERNELS_SCALE_NEON_LIST_H +#define ACL_SRC_CPU_KERNELS_SCALE_NEON_LIST_H + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Window.h" + +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/utils/ScaleUtils.h" +#include "support/Rounding.h" + +namespace arm_compute +{ +namespace cpu +{ +#define DECLARE_SCALE_KERNEL(func_name) \ + void func_name(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, \ + InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, \ + float sampling_offset, bool align_corners, const Window &window) + +DECLARE_SCALE_KERNEL(s16_neon_scale); +DECLARE_SCALE_KERNEL(u8_neon_scale); +DECLARE_SCALE_KERNEL(s8_neon_scale); +DECLARE_SCALE_KERNEL(qasymm8_neon_scale); +DECLARE_SCALE_KERNEL(qasymm8_signed_neon_scale); +DECLARE_SCALE_KERNEL(fp16_common_neon_scale); +DECLARE_SCALE_KERNEL(fp16_bilinear_neon_scale_nchw); +DECLARE_SCALE_KERNEL(fp16_nearest_neon_scale_nchw); + +#undef DECLARE_SCALE_KERNEL + +#ifdef ENABLE_NCHW_KERNELS +template <typename T> +void scale_nearest_nchw(const ITensor *src, + ITensor *dst, + const ITensor *dx, + const ITensor *dy, + const ITensor *offsets, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) +{ + ARM_COMPUTE_UNUSED(dx, dy); + ARM_COMPUTE_UNUSED(constant_border_value); + const size_t in_stride_x = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; + + // Compute the ratio between source height and destination height + const auto hr = + scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners); + + // Don't increment in X and Y direction for the input tensor + // A pointer to the start of this plane is needed as base for the precomputed offsets + Window win_in(window); + win_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + + // Set offsets window + Window win_off; + 
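// The offsets tensor stores one precomputed input x-index (int32) per output (x, y)
// position, so the offsets window below tracks the output window in X and Y while the
// remaining dimensions are collapsed; the vertical source index is derived per row
// from the resize ratio instead of being precomputed.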
win_off.set(Window::DimX, window[Window::DimX]); + win_off.set(Window::DimY, window[Window::DimY]); + for (size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d) + { + win_off.set(d, Window::Dimension(0, 0, 0)); + } + + // Create iterators + Iterator src_i(src, win_in); + Iterator dst_i(dst, window); + Iterator offsets_i(offsets, win_off); + execute_window_loop( + window, + [&](const Coordinates &id) + { + const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets_i.ptr()); + const auto in_yi = static_cast<int32_t>( + align_corners ? utils::rounding::round_half_away_from_zero((id.y() + sampling_offset) * hr) + : std::floor((id.y() + sampling_offset) * hr)); + const int32_t offset_row = in_yi * in_stride_x; + *reinterpret_cast<T *>(dst_i.ptr()) = + *(reinterpret_cast<const T *>(src_i.ptr()) + offsets_ptr[0] + offset_row); + }, + src_i, offsets_i, dst_i); +} + +template <typename T> +void scale_bilinear_nchw(const ITensor *src, + ITensor *dst, + const ITensor *dx, + const ITensor *dy, + const ITensor *offsets, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) +{ + // Compute the ratio between source height and destination height + const auto hr = + scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners); + Window win_off; + win_off.set(Window::DimX, window.x()); + win_off.set(Window::DimY, window.y()); + + // Don't increment in X and Y direction for the input tensor + // A pointer to the start of this plane is needed as base for the precomputed offsets + Window win_in(window); + win_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + + for (size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d) + { + win_off.set(d, Window::Dimension(0, 0, 0)); + } + + Iterator src_i(src, win_in); + Iterator dst_i(dst, window); + Iterator offsets_i(offsets, win_off); + Iterator dx_i(dx, win_off); + Iterator dy_i(dy, win_off); + + const int32_t in_dim_w = src->info()->dimension(0); + const int32_t in_dim_h = src->info()->dimension(1); + const int32_t in_stride_w = in_dim_w + src->info()->padding().left + src->info()->padding().right; + + if (border_mode == BorderMode::CONSTANT) + { +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + using ConstType = typename std::conditional<std::is_same<T, float16_t>::value, half, T>::type; +#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + using ConstType = T; +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + const T const_border_value = static_cast<T>(constant_border_value.get<ConstType>()); + execute_window_loop( + window, + [&](const Coordinates &id) + { + const int32_t index_h = std::floor((id.y() + sampling_offset) * hr - sampling_offset); + const auto index_w = *(reinterpret_cast<const int32_t *>(offsets_i.ptr())); + const auto dx_val = *(reinterpret_cast<const float *>(dx_i.ptr())); + const auto dy_val = *(reinterpret_cast<const float *>(dy_i.ptr())); + const auto pixel_row_ptr = reinterpret_cast<const T *>(src_i.ptr()); + + const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h) + ? (*(pixel_row_ptr + index_w + index_h * in_stride_w)) + : const_border_value; + const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h) + ? 
(*(pixel_row_ptr + index_w + 1 + index_h * in_stride_w)) + : const_border_value; + const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h && index_h < in_dim_h - 1) + ? (*(pixel_row_ptr + index_w + index_h * in_stride_w + in_stride_w)) + : const_border_value; + const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h && index_h < in_dim_h - 1) + ? (*(pixel_row_ptr + index_w + 1 + index_h * in_stride_w + in_stride_w)) + : const_border_value; + + *reinterpret_cast<T *>(dst_i.ptr()) = + static_cast<T>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); + }, + src_i, offsets_i, dx_i, dy_i, dst_i); + } + else if (border_mode == BorderMode::REPLICATE) + { + execute_window_loop( + window, + [&](const Coordinates &id) + { + const int index_h = std::floor((id.y() + sampling_offset) * hr - sampling_offset); + const auto index_w = *(reinterpret_cast<const int32_t *>(offsets_i.ptr())); + const auto dx_val = *(reinterpret_cast<const float *>(dx_i.ptr())); + const auto dy_val = *(reinterpret_cast<const float *>(dy_i.ptr())); + const auto pixel_row_ptr = reinterpret_cast<const T *>(src_i.ptr()); + + auto clamped_x = utility::clamp<int>(index_w, 0, in_dim_w - 1); + auto clamped_x1 = utility::clamp<int>(index_w + 1, 0, in_dim_w - 1); + auto clamped_y = utility::clamp<int>(index_h, 0, in_dim_h - 1); + auto clamped_y1 = utility::clamp<int>(index_h + 1, 0, in_dim_h - 1); + + const auto a00 = *(pixel_row_ptr + clamped_x + clamped_y * in_stride_w); + const auto a01 = *(pixel_row_ptr + clamped_x1 + clamped_y * in_stride_w); + const auto a10 = *(pixel_row_ptr + clamped_x + clamped_y1 * in_stride_w); + const auto a11 = *(pixel_row_ptr + clamped_x1 + clamped_y1 * in_stride_w); + + *reinterpret_cast<T *>(dst_i.ptr()) = + static_cast<T>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); + }, + src_i, offsets_i, dx_i, dy_i, dst_i); + } + else + { + ARM_COMPUTE_ERROR("Not implemented"); + } +} +#endif // ENABLE_NCHW_KERNELS + +template <typename T> +void nearest_neon_scale(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + float sampling_offset, + bool align_corners, + const Window &window) +{ + ARM_COMPUTE_UNUSED(offsets); + + // Compute the ratio between source and destination dimensions + const float scale_x = + scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners); + const float scale_y = + scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); + + const int in_stride_y = src->info()->strides_in_bytes()[1]; + const int in_stride_z = src->info()->strides_in_bytes()[2]; + const int in_stride_w = src->info()->strides_in_bytes()[3]; + const int out_stride_y = dst->info()->strides_in_bytes()[1]; + const int out_stride_z = dst->info()->strides_in_bytes()[2]; + const int out_stride_w = dst->info()->strides_in_bytes()[3]; + const int out_dim_ch = dst->info()->dimension(0); + const int step_cout = 16 / sizeof(T); + + Window window_execution = window; + window_execution.set(Window::DimX, Window::Dimension(0, 1, 1)); + Window win_in_out(window); + win_in_out.set(Window::DimY, Window::Dimension(0, 0, 0)); + win_in_out.set(Window::DimZ, Window::Dimension(0, 0, 0)); + Iterator in(src, win_in_out); + Iterator out(dst, win_in_out); + + const int xo_start = window_execution.y().start(); + const int xo_end = window_execution.y().end(); + const int xo_step = window_execution.y().step(); + const int yo_start = window_execution.z().start(); + const int 
yo_end = window_execution.z().end(); + const int yo_step = window_execution.z().step(); + const int bo_start = window_execution[3].start(); + const int bo_end = window_execution[3].end(); + const int bo_step = window_execution[3].step(); + + for (int bo = bo_start; bo < bo_end; bo += bo_step) + { + const uint8_t *in_ptr_base = in.ptr() + bo * in_stride_w; + uint8_t *out_ptr_base = out.ptr() + bo * out_stride_w; + + for (int yo = yo_start; yo < yo_end; yo += yo_step) + { + // Floating-point coordinate + float yi_f = ((yo + sampling_offset) * scale_y); + int yi = 0; + if (align_corners) + { + yi = utils::rounding::round_half_away_from_zero(yi_f); + } + else + { + yi = static_cast<int>(std::floor(yi_f)); + } + + for (int xo = xo_start; xo < xo_end; xo += xo_step) + { + // Floating-point coordinate + float xi_f = ((xo + sampling_offset) * scale_x); + int xi = 0; + if (align_corners) + { + xi = utils::rounding::round_half_away_from_zero(xi_f); + } + else + { + xi = static_cast<int>(std::floor(xi_f)); + } + + const uint8_t *in_ptr = in_ptr_base + xi * in_stride_y + yi * in_stride_z; + uint8_t *out_ptr = out_ptr_base + xo * out_stride_y + yo * out_stride_z; + + int cout = 0; + for (; cout <= (out_dim_ch - step_cout); cout += step_cout) + { + auto out0 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T))); + wrapper::vstore(reinterpret_cast<T *>(out_ptr + cout * sizeof(T)), out0); + } + + for (; cout < out_dim_ch; ++cout) + { + auto out0 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T))); + *(reinterpret_cast<T *>(out_ptr + cout * sizeof(T))) = out0; + } + } + } + } +} + +template <typename T> +void bilinear_neon_scale(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) +{ + ARM_COMPUTE_UNUSED(offsets); + ARM_COMPUTE_UNUSED(dx); + ARM_COMPUTE_UNUSED(dy); + using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>; + + // Compute the ratio between source and destination dimensions + const float scale_x = + scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners); + const float scale_y = + scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); + + const int in_stride_y = src->info()->strides_in_bytes()[1]; + const int in_stride_z = src->info()->strides_in_bytes()[2]; + const int in_stride_w = src->info()->strides_in_bytes()[3]; + const int out_stride_y = dst->info()->strides_in_bytes()[1]; + const int out_stride_z = dst->info()->strides_in_bytes()[2]; + const int out_stride_w = dst->info()->strides_in_bytes()[3]; + const int in_dim_w = src->info()->dimension(1); + const int in_dim_h = src->info()->dimension(2); + const int out_dim_ch = dst->info()->dimension(0); + const int step_cout = 16 / sizeof(T); + + Window window_execution = window; + window_execution.set(Window::DimX, Window::Dimension(0, 1, 1)); + Window win_in_out(window); + win_in_out.set(Window::DimY, Window::Dimension(0, 0, 0)); + win_in_out.set(Window::DimZ, Window::Dimension(0, 0, 0)); + Iterator in(src, win_in_out); + Iterator out(dst, win_in_out); + + const int xo_start = window_execution.y().start(); + const int xo_end = window_execution.y().end(); + const int xo_step = window_execution.y().step(); + const int yo_start = window_execution.z().start(); + const int yo_end = 
window_execution.z().end(); + const int yo_step = window_execution.z().step(); + const int bo_start = window_execution[3].start(); + const int bo_end = window_execution[3].end(); + const int bo_step = window_execution[3].step(); + + if (border_mode == BorderMode::CONSTANT) + { +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + using ConstType = typename std::conditional<std::is_same<T, float16_t>::value, half, T>::type; +#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + using ConstType = T; +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + const T const_border_value = static_cast<T>(constant_border_value.get<ConstType>()); + + for (int bo = bo_start; bo < bo_end; bo += bo_step) + { + const uint8_t *in_ptr_base = in.ptr() + bo * in_stride_w; + uint8_t *out_ptr_base = out.ptr() + bo * out_stride_w; + + for (int yo = yo_start; yo < yo_end; yo += yo_step) + { + // Floating-point coordinate + const float yi_f = ((yo + sampling_offset) * scale_y - sampling_offset); + // Integer coordinate + const auto yi = static_cast<int>(std::floor(yi_f)); + // Weight for the y coordinate + const auto a1 = (yi_f - static_cast<float>(yi)); + const auto b1 = (1.f - a1); + + for (int xo = xo_start; xo < xo_end; xo += xo_step) + { + // Floating-point coordinate + const float xi_f = ((xo + sampling_offset) * scale_x - sampling_offset); + // Integer coordinate + const auto xi = static_cast<int>(std::floor(xi_f)); + // Weight for the x coordinate + const auto a = (xi_f - static_cast<float>(xi)); + const auto b = (1.f - a); + + const auto s00_s = static_cast<T>(b * b1); + const auto s01_s = static_cast<T>(a * b1); + const auto s10_s = static_cast<T>(b * a1); + const auto s11_s = static_cast<T>(a * a1); + + const uint8_t *in_ptr = in_ptr_base + xi * in_stride_y + yi * in_stride_z; + uint8_t *out_ptr = out_ptr_base + xo * out_stride_y + yo * out_stride_z; + + int cout = 0; + for (; cout <= (out_dim_ch - step_cout); cout += step_cout) + { + auto in00 = wrapper::vdup_n(static_cast<T>(const_border_value), ExactTagType{}); + auto in01 = wrapper::vdup_n(static_cast<T>(const_border_value), ExactTagType{}); + auto in10 = wrapper::vdup_n(static_cast<T>(const_border_value), ExactTagType{}); + auto in11 = wrapper::vdup_n(static_cast<T>(const_border_value), ExactTagType{}); + if ((yi >= 0) && (yi < in_dim_h)) + { + if ((xi >= 0) && (xi < in_dim_w)) + { + in00 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T))); + } + if (((xi + 1) >= 0) && ((xi + 1) < in_dim_w)) + { + in01 = wrapper::vloadq( + reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_y)); + } + } + if (((yi + 1) >= 0) && ((yi + 1) < in_dim_h)) + { + if ((xi >= 0) && (xi < in_dim_w)) + { + in10 = wrapper::vloadq( + reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_z)); + } + if (((xi + 1) >= 0) && ((xi + 1) < in_dim_w)) + { + in11 = wrapper::vloadq( + reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_y + in_stride_z)); + } + } + + const auto s00 = wrapper::vdup_n(s00_s, ExactTagType{}); + const auto s01 = wrapper::vdup_n(s01_s, ExactTagType{}); + const auto s10 = wrapper::vdup_n(s10_s, ExactTagType{}); + const auto s11 = wrapper::vdup_n(s11_s, ExactTagType{}); + auto out0 = wrapper::vdup_n(static_cast<T>(0), ExactTagType{}); + out0 = wrapper::vmla(out0, in00, s00); + out0 = wrapper::vmla(out0, in01, s01); + out0 = wrapper::vmla(out0, in10, s10); + out0 = wrapper::vmla(out0, in11, s11); + wrapper::vstore(reinterpret_cast<T *>(out_ptr + cout * sizeof(T)), out0); + } + + for (; cout < out_dim_ch; ++cout) + 
{ + auto in00 = static_cast<T>(const_border_value); + auto in01 = static_cast<T>(const_border_value); + auto in10 = static_cast<T>(const_border_value); + auto in11 = static_cast<T>(const_border_value); + if ((yi >= 0) && (yi < in_dim_h)) + { + if ((xi >= 0) && (xi < in_dim_w)) + { + in00 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T))); + } + if (((xi + 1) >= 0) && ((xi + 1) < in_dim_w)) + { + in01 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_y)); + } + } + if (((yi + 1) >= 0) && ((yi + 1) < in_dim_h)) + { + if ((xi >= 0) && (xi < in_dim_w)) + { + in10 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_z)); + } + if (((xi + 1) >= 0) && ((xi + 1) < in_dim_w)) + { + in11 = *( + reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_y + in_stride_z)); + } + } + auto out0 = static_cast<T>(0); + out0 += in00 * s00_s; + out0 += in01 * s01_s; + out0 += in10 * s10_s; + out0 += in11 * s11_s; + *(reinterpret_cast<T *>(out_ptr + cout * sizeof(T))) = out0; + } + } + } + } + } + else if (border_mode == BorderMode::REPLICATE) + { + for (int bo = bo_start; bo < bo_end; bo += bo_step) + { + const uint8_t *in_ptr = in.ptr() + bo * in_stride_w; + uint8_t *out_ptr = out.ptr() + bo * out_stride_w; + + for (int yo = yo_start; yo < yo_end; yo += yo_step) + { + // Floating-point coordinate + const float yi_f = ((yo + sampling_offset) * scale_y - sampling_offset); + // Integer coordinate + const auto yi = static_cast<int>(std::floor(yi_f)); + // Weight for the y coordinate + const auto a1 = (yi_f - static_cast<float>(yi)); + const auto b1 = (1.f - a1); + + const int yi0 = utility::clamp<int>(yi, 0, in_dim_h - 1); + const int yi1 = utility::clamp<int>(yi + 1, 0, in_dim_h - 1); + + const int yi0_offset = yi0 * in_stride_z; + const int yi1_offset = yi1 * in_stride_z; + + const int y_offset = yo * out_stride_z; + for (int xo = xo_start; xo < xo_end; xo += xo_step) + { + // Floating-point coordinate + const float xi_f = ((xo + sampling_offset) * scale_x - sampling_offset); + // Integer coordinate + const auto xi = static_cast<int>(std::floor(xi_f)); + // Weight for the x coordinate + const auto a = (xi_f - static_cast<float>(xi)); + const auto b = (1.f - a); + + const auto s00_s = static_cast<T>(b * b1); + const auto s01_s = static_cast<T>(a * b1); + const auto s10_s = static_cast<T>(b * a1); + const auto s11_s = static_cast<T>(a * a1); + + const auto s00 = wrapper::vdup_n(s00_s, ExactTagType{}); + const auto s01 = wrapper::vdup_n(s01_s, ExactTagType{}); + const auto s10 = wrapper::vdup_n(s10_s, ExactTagType{}); + const auto s11 = wrapper::vdup_n(s11_s, ExactTagType{}); + + const int xi0 = utility::clamp<int>(xi, 0, in_dim_w - 1); + const int xi1 = utility::clamp<int>(xi + 1, 0, in_dim_w - 1); + + const int xi0_offset = xi0 * in_stride_y; + const int xi1_offset = xi1 * in_stride_y; + + const int offset = xo * out_stride_y + y_offset; + + int cout = 0; + for (; cout <= (out_dim_ch - step_cout); cout += step_cout) + { + const auto in00 = wrapper::vloadq( + reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi0_offset + yi0_offset)); + const auto in01 = wrapper::vloadq( + reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi1_offset + yi0_offset)); + const auto in10 = wrapper::vloadq( + reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi0_offset + yi1_offset)); + const auto in11 = wrapper::vloadq( + reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi1_offset + yi1_offset)); + + auto out0 = wrapper::vmul(in00, s00); + out0 = 
wrapper::vmla(out0, in01, s01); + out0 = wrapper::vmla(out0, in10, s10); + out0 = wrapper::vmla(out0, in11, s11); + wrapper::vstore(reinterpret_cast<T *>(out_ptr + offset + cout * sizeof(T)), out0); + } + + for (; cout < out_dim_ch; ++cout) + { + const T in00 = + *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi0_offset + yi0_offset)); + const T in01 = + *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi1_offset + yi0_offset)); + const T in10 = + *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi0_offset + yi1_offset)); + const T in11 = + *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi1_offset + yi1_offset)); + + T out0 = in00 * s00_s; + out0 += in01 * s01_s; + out0 += in10 * s10_s; + out0 += in11 * s11_s; + *(reinterpret_cast<T *>(out_ptr + offset + cout * sizeof(T))) = out0; + } + } + } + } + } + else + { + ARM_COMPUTE_ERROR("Not implemented"); + } +} + +template <typename T> +void common_neon_scale(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + InterpolationPolicy policy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) +{ + if (policy == InterpolationPolicy::BILINEAR) + { + bilinear_neon_scale<T>(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, + align_corners, window); + } + else if (policy == InterpolationPolicy::NEAREST_NEIGHBOR) + { + nearest_neon_scale<T>(src, dst, offsets, sampling_offset, align_corners, window); + } +} +} // namespace cpu +} // namespace arm_compute + +#endif // ACL_SRC_CPU_KERNELS_SCALE_NEON_LIST_H diff --git a/src/cpu/kernels/scale/neon/qasymm8.cpp b/src/cpu/kernels/scale/neon/qasymm8.cpp new file mode 100644 index 0000000000..62a821daa5 --- /dev/null +++ b/src/cpu/kernels/scale/neon/qasymm8.cpp @@ -0,0 +1,406 @@ +/* + * Copyright (c) 2021-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/core/helpers/ScaleHelpers.h" +#include "src/cpu/kernels/scale/neon/list.h" + +namespace arm_compute +{ +namespace +{ +void qasymm8_neon_scale_bilinear(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) +{ + // Data layout is NHWC + const int32_t input_width = src->info()->dimension(1); + const int32_t input_height = src->info()->dimension(2); + + const UniformQuantizationInfo iq_info = src->info()->quantization_info().uniform(); + const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); + + // Compute the ratio between source and destination dimensions + const float scale_x = + scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners); + const float scale_y = + scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); + + if (border_mode == BorderMode::CONSTANT) + { + const int32_t in_stride_y = src->info()->strides_in_bytes()[1]; + const int32_t in_stride_z = src->info()->strides_in_bytes()[2]; + + // Compute the ratio between source height and destination height + Window win_off; + win_off.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_off.set(Window::DimY, Window::Dimension(0, 0, 0)); + + // Don't increment in X and Y direction for the input tensor + // A pointer to the start of this plane is needed as base for the precomputed offsets + Window win_in(window); + win_in.set(1, Window::Dimension(0, 0, 0)); + win_in.set(2, Window::Dimension(0, 0, 0)); + + for (size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d) + { + win_off.set(d, Window::Dimension(0, 0, 0)); + } + + Iterator in(src, win_in); + Iterator out(dst, window); + + const uint8_t const_border_value = static_cast<uint8_t>(constant_border_value.get<uint8_t>()); + execute_window_loop( + window, + [&](const Coordinates &id) + { + const int32_t index_h = std::floor((id[2] + sampling_offset) * scale_y - sampling_offset); + const int32_t index_w = + *(reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id[1], id[2])))); + const auto dx_val = *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[1], id[2])))); + const auto dy_val = *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[1], id[2])))); + const auto pixel_row_ptr = reinterpret_cast<const uint8_t *>(in.ptr()); + + const auto a00 = (0 <= index_w && index_w < input_width && 0 <= index_h && index_h < input_height) + ? (*(pixel_row_ptr + index_w * in_stride_y + index_h * in_stride_z)) + : const_border_value; + const auto a01 = (-1 <= index_w && index_w + 1 < input_width && 0 <= index_h && index_h < input_height) + ? (*(pixel_row_ptr + (index_w + 1) * in_stride_y + index_h * in_stride_z)) + : const_border_value; + const auto a10 = (0 <= index_w && index_w < input_width && -1 <= index_h && index_h < input_height - 1) + ? (*(pixel_row_ptr + index_w * in_stride_y + (index_h + 1) * in_stride_z)) + : const_border_value; + const auto a11 = + (-1 <= index_w && index_w < input_width - 1 && -1 <= index_h && index_h < input_height - 1) + ? 
(*(pixel_row_ptr + (index_w + 1) * in_stride_y + (index_h + 1) * in_stride_z)) + : const_border_value; + + const float inp00 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a00, iq_info); + const float inp01 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a01, iq_info); + const float inp10 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a10, iq_info); + const float inp11 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a11, iq_info); + *reinterpret_cast<uint8_t *>(out.ptr()) = Qasymm8QuantizationHelper<uint8_t>::quantize( + scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info); + }, + in, out); + } + else if (border_mode == BorderMode::REPLICATE) + { + using FloatTagType = typename wrapper::traits::neon_bitvector_tag_t<float, wrapper::traits::BitWidth::W128>; + using Int32TagType = typename wrapper::traits::neon_bitvector_tag_t<int32_t, wrapper::traits::BitWidth::W128>; + + const int in_stride_x = src->info()->strides_in_bytes()[1]; + const int in_stride_y = src->info()->strides_in_bytes()[2]; + const int in_stride_b = src->info()->strides_in_bytes()[3]; + const int out_stride_x = dst->info()->strides_in_bytes()[1]; + const int out_stride_y = dst->info()->strides_in_bytes()[2]; + const int out_stride_b = dst->info()->strides_in_bytes()[3]; + const int out_dim_ch = dst->info()->dimension(0); + constexpr int step_cout = 16; + + Window window_execution = window; + window_execution.set(Window::DimX, Window::Dimension(0, 1, 1)); + Window win_in_out(window); + win_in_out.set(Window::DimY, Window::Dimension(0, 0, 0)); + win_in_out.set(Window::DimZ, Window::Dimension(0, 0, 0)); + Iterator in(src, win_in_out); + Iterator out(dst, win_in_out); + + const int xo_start = window_execution[1].start(); + const int xo_end = window_execution[1].end(); + const int xo_step = window_execution[1].step(); + const int yo_start = window_execution[2].start(); + const int yo_end = window_execution[2].end(); + const int yo_step = window_execution[2].step(); + const int bo_start = window_execution[3].start(); + const int bo_end = window_execution[3].end(); + const int bo_step = window_execution[3].step(); + + const UniformQuantizationInfo iq_info = src->info()->quantization_info().uniform(); + const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); + + const float32x4_t vscale_in = wrapper::vdup_n(iq_info.scale, FloatTagType{}); + const int32x4_t voffset_in = wrapper::vdup_n(iq_info.offset, Int32TagType{}); // Offsets will be Int32 + + const float32x4_t invvscale_o = wrapper::vdup_n(1.f / oq_info.scale, FloatTagType{}); + const float32x4_t voffset_o = vdupq_n_f32(oq_info.offset); + + const float fp_coord_offset_y = sampling_offset * (scale_y - 1); + const float fp_coord_offset_x = sampling_offset * (scale_x - 1); + + for (int bo = bo_start; bo < bo_end; bo += bo_step) + { + const uint8_t *in_ptr = in.ptr() + bo * in_stride_b; + uint8_t *out_ptr = out.ptr() + bo * out_stride_b; + + for (int yo = yo_start; yo < yo_end; yo += yo_step) + { + // Floating-point coordinate + const float yi_f = yo * scale_y + fp_coord_offset_y; + // Integer coordinate + const int yi = static_cast<int>(std::floor(yi_f)); + // Weight for the y coordinate + const float a1 = (yi_f - static_cast<float>(yi)); + const float b1 = (1.f - a1); + + const int yi0 = utility::clamp<int>(yi, 0, input_height - 1); + const int yi1 = utility::clamp<int>(yi + 1, 0, input_height - 1); + + const uint8_t *in_ptr_yi0 = in_ptr + yi0 * in_stride_y; + const uint8_t *in_ptr_yi1 = in_ptr + yi1 * in_stride_y; + 
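// With BorderMode::REPLICATE the two contributing input rows are clamped to the valid
// height range once per output row, so only the x coordinates still need clamping in
// the inner loop; both row base pointers (in_ptr_yi0 / in_ptr_yi1) stay fixed while
// iterating over xo and the channel dimension.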
+ uint8_t *out_ptr_yo = out_ptr + yo * out_stride_y; + for (int xo = xo_start; xo < xo_end; xo += xo_step) + { + // Floating-point coordinate + const float xi_f = xo * scale_x + fp_coord_offset_x; + // Integer coordinate + const int xi = static_cast<int>(std::floor(xi_f)); + // Weight for the x coordinate + const float a = (xi_f - static_cast<float>(xi)); + const float b = (1.f - a); + + const float s00_s = b * b1; + const float s01_s = a * b1; + const float s10_s = b * a1; + const float s11_s = a * a1; + + const auto s00 = wrapper::vdup_n(s00_s, FloatTagType{}); + const auto s01 = wrapper::vdup_n(s01_s, FloatTagType{}); + const auto s10 = wrapper::vdup_n(s10_s, FloatTagType{}); + const auto s11 = wrapper::vdup_n(s11_s, FloatTagType{}); + + const int xi0 = utility::clamp<int>(xi, 0, input_width - 1); + const int xi1 = utility::clamp<int>(xi + 1, 0, input_width - 1); + + const auto in_ptr_xi0_yi0 = in_ptr_yi0 + xi0 * in_stride_x; + const auto in_ptr_xi1_yi0 = in_ptr_yi0 + xi1 * in_stride_x; + const auto in_ptr_xi0_yi1 = in_ptr_yi1 + xi0 * in_stride_x; + const auto in_ptr_xi1_yi1 = in_ptr_yi1 + xi1 * in_stride_x; + + uint8_t *out_ptr_xo_yo = out_ptr_yo + xo * out_stride_x; + + int cout = 0; + for (; cout <= (out_dim_ch - step_cout); cout += step_cout) + { + const auto in00 = wrapper::vloadq(in_ptr_xi0_yi0 + cout * sizeof(uint8_t)); + const auto in01 = wrapper::vloadq(in_ptr_xi1_yi0 + cout * sizeof(uint8_t)); + const auto in10 = wrapper::vloadq(in_ptr_xi0_yi1 + cout * sizeof(uint8_t)); + const auto in11 = wrapper::vloadq(in_ptr_xi1_yi1 + cout * sizeof(uint8_t)); + + const uint16x8_t in00_low = wrapper::vmovl(wrapper::vgetlow(in00)); + const uint16x8_t in00_high = wrapper::vmovl(wrapper::vgethigh(in00)); + + const auto in00_0 = wrapper::vmul( + wrapper::vcvt<float>(wrapper::vsub( + wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in00_low))), voffset_in)), + vscale_in); + const auto in00_1 = wrapper::vmul( + wrapper::vcvt<float>(wrapper::vsub( + wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in00_low))), voffset_in)), + vscale_in); + const auto in00_2 = wrapper::vmul( + wrapper::vcvt<float>(wrapper::vsub( + wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in00_high))), voffset_in)), + vscale_in); + const auto in00_3 = wrapper::vmul( + wrapper::vcvt<float>(wrapper::vsub( + wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in00_high))), voffset_in)), + vscale_in); + + const uint16x8_t in01_low = wrapper::vmovl(wrapper::vgetlow(in01)); + const uint16x8_t in01_high = wrapper::vmovl(wrapper::vgethigh(in01)); + + const auto in01_0 = wrapper::vmul( + wrapper::vcvt<float>(wrapper::vsub( + wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in01_low))), voffset_in)), + vscale_in); + const auto in01_1 = wrapper::vmul( + wrapper::vcvt<float>(wrapper::vsub( + wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in01_low))), voffset_in)), + vscale_in); + const auto in01_2 = wrapper::vmul( + wrapper::vcvt<float>(wrapper::vsub( + wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in01_high))), voffset_in)), + vscale_in); + const auto in01_3 = wrapper::vmul( + wrapper::vcvt<float>(wrapper::vsub( + wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in01_high))), voffset_in)), + vscale_in); + + const uint16x8_t in10_low = wrapper::vmovl(wrapper::vgetlow(in10)); + const uint16x8_t in10_high = wrapper::vmovl(wrapper::vgethigh(in10)); + + const auto in10_0 = wrapper::vmul( + wrapper::vcvt<float>(wrapper::vsub( + wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in10_low))), 
voffset_in)), + vscale_in); + const auto in10_1 = wrapper::vmul( + wrapper::vcvt<float>(wrapper::vsub( + wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in10_low))), voffset_in)), + vscale_in); + const auto in10_2 = wrapper::vmul( + wrapper::vcvt<float>(wrapper::vsub( + wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in10_high))), voffset_in)), + vscale_in); + const auto in10_3 = wrapper::vmul( + wrapper::vcvt<float>(wrapper::vsub( + wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in10_high))), voffset_in)), + vscale_in); + + const uint16x8_t in11_low = wrapper::vmovl(wrapper::vgetlow(in11)); + const uint16x8_t in11_high = wrapper::vmovl(wrapper::vgethigh(in11)); + + const auto in11_0 = wrapper::vmul( + wrapper::vcvt<float>(wrapper::vsub( + wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in11_low))), voffset_in)), + vscale_in); + const auto in11_1 = wrapper::vmul( + wrapper::vcvt<float>(wrapper::vsub( + wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in11_low))), voffset_in)), + vscale_in); + const auto in11_2 = wrapper::vmul( + wrapper::vcvt<float>(wrapper::vsub( + wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in11_high))), voffset_in)), + vscale_in); + const auto in11_3 = wrapper::vmul( + wrapper::vcvt<float>(wrapper::vsub( + wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in11_high))), voffset_in)), + vscale_in); + + auto out_0 = wrapper::vmul(in00_0, s00); + out_0 = wrapper::vmla(out_0, in01_0, s01); + out_0 = wrapper::vmla(out_0, in10_0, s10); + out_0 = wrapper::vmla(out_0, in11_0, s11); + + auto out_1 = wrapper::vmul(in00_1, s00); + out_1 = wrapper::vmla(out_1, in01_1, s01); + out_1 = wrapper::vmla(out_1, in10_1, s10); + out_1 = wrapper::vmla(out_1, in11_1, s11); + + auto out_2 = wrapper::vmul(in00_2, s00); + out_2 = wrapper::vmla(out_2, in01_2, s01); + out_2 = wrapper::vmla(out_2, in10_2, s10); + out_2 = wrapper::vmla(out_2, in11_2, s11); + + auto out_3 = wrapper::vmul(in00_3, s00); + out_3 = wrapper::vmla(out_3, in01_3, s01); + out_3 = wrapper::vmla(out_3, in10_3, s10); + out_3 = wrapper::vmla(out_3, in11_3, s11); + +#if defined(__aarch64__) && !defined(BARE_METAL) + const auto out_0_int = wrapper::vcvta<uint32_t>(wrapper::vmla(voffset_o, out_0, invvscale_o)); + const auto out_1_int = wrapper::vcvta<uint32_t>(wrapper::vmla(voffset_o, out_1, invvscale_o)); + const auto out_2_int = wrapper::vcvta<uint32_t>(wrapper::vmla(voffset_o, out_2, invvscale_o)); + const auto out_3_int = wrapper::vcvta<uint32_t>(wrapper::vmla(voffset_o, out_3, invvscale_o)); +#else // defined(__aarch64__) && !defined(BARE_METAL) + const auto out_0_int = wrapper::vcvt<uint32_t>(wrapper::vmla(voffset_o, out_0, invvscale_o)); + const auto out_1_int = wrapper::vcvt<uint32_t>(wrapper::vmla(voffset_o, out_1, invvscale_o)); + const auto out_2_int = wrapper::vcvt<uint32_t>(wrapper::vmla(voffset_o, out_2, invvscale_o)); + const auto out_3_int = wrapper::vcvt<uint32_t>(wrapper::vmla(voffset_o, out_3, invvscale_o)); +#endif // defined(__aarch64__) && !defined(BARE_METAL) + const auto low_part = + wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_0_int), wrapper::vqmovn(out_1_int))); + const auto high_part = + wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_2_int), wrapper::vqmovn(out_3_int))); + const auto out = wrapper::vcombine(low_part, high_part); + + wrapper::vstore(out_ptr_xo_yo + cout * sizeof(uint8_t), out); + } + + for (; cout < out_dim_ch; ++cout) + { + const uint8_t in00 = *(in_ptr_xi0_yi0 + cout * sizeof(uint8_t)); + const uint8_t in01 = 
*(in_ptr_xi1_yi0 + cout * sizeof(uint8_t)); + const uint8_t in10 = *(in_ptr_xi0_yi1 + cout * sizeof(uint8_t)); + const uint8_t in11 = *(in_ptr_xi1_yi1 + cout * sizeof(uint8_t)); + + const float in00_f = (static_cast<int32_t>(in00) - iq_info.offset) * iq_info.scale; + const float in01_f = (static_cast<int32_t>(in01) - iq_info.offset) * iq_info.scale; + const float in10_f = (static_cast<int32_t>(in10) - iq_info.offset) * iq_info.scale; + const float in11_f = (static_cast<int32_t>(in11) - iq_info.offset) * iq_info.scale; + + float out = in00_f * s00_s; + out += in01_f * s01_s; + out += in10_f * s10_s; + out += in11_f * s11_s; + + // Rounding modes of vector and scalar loops should match +#if defined(__aarch64__) && !defined(BARE_METAL) + *(out_ptr_xo_yo + cout * sizeof(uint8_t)) = quantize_qasymm8(out, oq_info); +#else // defined(__aarch64__) && !defined(BARE_METAL) + *(out_ptr_xo_yo + cout * sizeof(uint8_t)) = + quantize_qasymm8(out, oq_info, RoundingPolicy::TO_ZERO); +#endif // defined(__aarch64__) && !defined(BARE_METAL) + } + } + } + } + } + else + { + ARM_COMPUTE_ERROR("Not implemented"); + } +} +} // namespace +namespace cpu +{ +void qasymm8_neon_scale(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + InterpolationPolicy policy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) +{ + if (policy == InterpolationPolicy::BILINEAR) + { + if (src->info()->quantization_info() == dst->info()->quantization_info()) + { + u8_neon_scale(src, dst, offsets, dx, dy, policy, border_mode, constant_border_value, sampling_offset, + align_corners, window); + } + else + { + qasymm8_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, + align_corners, window); + } + } + else if (policy == InterpolationPolicy::NEAREST_NEIGHBOR) + { + nearest_neon_scale<uint8_t>(src, dst, offsets, sampling_offset, align_corners, window); + } +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/scale/neon/qasymm8_signed.cpp b/src/cpu/kernels/scale/neon/qasymm8_signed.cpp new file mode 100644 index 0000000000..5a885178a7 --- /dev/null +++ b/src/cpu/kernels/scale/neon/qasymm8_signed.cpp @@ -0,0 +1,394 @@ +/* + * Copyright (c) 2021-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/core/helpers/ScaleHelpers.h" +#include "src/cpu/kernels/scale/neon/list.h" + +namespace arm_compute +{ +namespace +{ +void qasymm8_signed_neon_scale_bilinear(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) +{ + // Data layout is NHWC + const UniformQuantizationInfo iq_info = src->info()->quantization_info().uniform(); + const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); + + const int32_t input_width = src->info()->dimension(1); + const int32_t input_height = src->info()->dimension(2); + + // Compute the ratio between source and destination dimensions + const float scale_x = + scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners); + const float scale_y = + scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); + + if (border_mode == BorderMode::CONSTANT) + { + const int32_t in_stride_y = src->info()->strides_in_bytes()[1]; + const int32_t in_stride_z = src->info()->strides_in_bytes()[2]; + + Window win_off; + win_off.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_off.set(Window::DimY, Window::Dimension(0, 0, 0)); + + // Don't increment in X and Y direction for the input tensor + // A pointer to the start of this plane is needed as base for the precomputed offsets + Window win_in(window); + win_in.set(1, Window::Dimension(0, 0, 0)); + win_in.set(2, Window::Dimension(0, 0, 0)); + + for (size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d) + { + win_off.set(d, Window::Dimension(0, 0, 0)); + } + + Iterator in(src, win_in); + Iterator out(dst, window); + + const int8_t const_border_value = static_cast<int8_t>(constant_border_value.get<int8_t>()); + execute_window_loop( + window, + [&](const Coordinates &id) + { + const int32_t index_h = std::floor((id[2] + sampling_offset) * scale_y - sampling_offset); + const int32_t index_w = + *(reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id[1], id[2])))); + const auto dx_val = *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[1], id[2])))); + const auto dy_val = *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[1], id[2])))); + const auto pixel_row_ptr = reinterpret_cast<const int8_t *>(in.ptr()); + + const auto a00 = (0 <= index_w && index_w < input_width && 0 <= index_h && index_h < input_height) + ? (*(pixel_row_ptr + index_w * in_stride_y + index_h * in_stride_z)) + : const_border_value; + const auto a01 = (-1 <= index_w && index_w + 1 < input_width && 0 <= index_h && index_h < input_height) + ? (*(pixel_row_ptr + (index_w + 1) * in_stride_y + index_h * in_stride_z)) + : const_border_value; + const auto a10 = (0 <= index_w && index_w < input_width && -1 <= index_h && index_h < input_height - 1) + ? (*(pixel_row_ptr + index_w * in_stride_y + (index_h + 1) * in_stride_z)) + : const_border_value; + const auto a11 = + (-1 <= index_w && index_w < input_width - 1 && -1 <= index_h && index_h < input_height - 1) + ? 
(*(pixel_row_ptr + (index_w + 1) * in_stride_y + (index_h + 1) * in_stride_z)) + : const_border_value; + + const float inp00 = Qasymm8QuantizationHelper<int8_t>::dequantize(a00, iq_info); + const float inp01 = Qasymm8QuantizationHelper<int8_t>::dequantize(a01, iq_info); + const float inp10 = Qasymm8QuantizationHelper<int8_t>::dequantize(a10, iq_info); + const float inp11 = Qasymm8QuantizationHelper<int8_t>::dequantize(a11, iq_info); + *reinterpret_cast<int8_t *>(out.ptr()) = Qasymm8QuantizationHelper<int8_t>::quantize( + scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info); + }, + in, out); + } + else if (border_mode == BorderMode::REPLICATE) + { + using FloatTagType = typename wrapper::traits::neon_bitvector_tag_t<float, wrapper::traits::BitWidth::W128>; + using Int32TagType = typename wrapper::traits::neon_bitvector_tag_t<int32_t, wrapper::traits::BitWidth::W128>; + + const int in_stride_x = src->info()->strides_in_bytes()[1]; + const int in_stride_y = src->info()->strides_in_bytes()[2]; + const int in_stride_b = src->info()->strides_in_bytes()[3]; + const int out_stride_x = dst->info()->strides_in_bytes()[1]; + const int out_stride_y = dst->info()->strides_in_bytes()[2]; + const int out_stride_b = dst->info()->strides_in_bytes()[3]; + const int out_dim_ch = dst->info()->dimension(0); + constexpr int step_cout = 16; + + Window window_execution = window; + window_execution.set(Window::DimX, Window::Dimension(0, 1, 1)); + Window win_in_out(window); + win_in_out.set(Window::DimY, Window::Dimension(0, 0, 0)); + win_in_out.set(Window::DimZ, Window::Dimension(0, 0, 0)); + Iterator in(src, win_in_out); + Iterator out(dst, win_in_out); + + const int xo_start = window_execution[1].start(); + const int xo_end = window_execution[1].end(); + const int xo_step = window_execution[1].step(); + const int yo_start = window_execution[2].start(); + const int yo_end = window_execution[2].end(); + const int yo_step = window_execution[2].step(); + const int bo_start = window_execution[3].start(); + const int bo_end = window_execution[3].end(); + const int bo_step = window_execution[3].step(); + + const float fp_coord_offset_y = sampling_offset * (scale_y - 1); + const float fp_coord_offset_x = sampling_offset * (scale_x - 1); + + const UniformQuantizationInfo iq_info = src->info()->quantization_info().uniform(); + const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); + + const float32x4_t vscale_in = wrapper::vdup_n(iq_info.scale, FloatTagType{}); + const int32x4_t voffset_in = wrapper::vdup_n(iq_info.offset, Int32TagType{}); // Offsets will be Int32 + + const float32x4_t invvscale_o = wrapper::vdup_n(1.f / oq_info.scale, FloatTagType{}); + const float32x4_t voffset_o = vdupq_n_f32(oq_info.offset); + + for (int bo = bo_start; bo < bo_end; bo += bo_step) + { + const int8_t *in_ptr = reinterpret_cast<int8_t *>(in.ptr() + bo * in_stride_b); + int8_t *out_ptr = reinterpret_cast<int8_t *>(out.ptr() + bo * out_stride_b); + + for (int yo = yo_start; yo < yo_end; yo += yo_step) + { + // Floating-point coordinate + const float yi_f = yo * scale_y + fp_coord_offset_y; + // Integer coordinate + const int yi = static_cast<int>(std::floor(yi_f)); + // Weight for the y coordinate + const float a1 = (yi_f - static_cast<float>(yi)); + const float b1 = (1.f - a1); + + const int yi0 = utility::clamp<int>(yi, 0, input_height - 1); + const int yi1 = utility::clamp<int>(yi + 1, 0, input_height - 1); + + const int8_t *in_ptr_yi0 = in_ptr + yi0 * in_stride_y; + const 
int8_t *in_ptr_yi1 = in_ptr + yi1 * in_stride_y; + + int8_t *out_ptr_yo = out_ptr + yo * out_stride_y; + for (int xo = xo_start; xo < xo_end; xo += xo_step) + { + // Floating-point coordinate + const float xi_f = xo * scale_x + fp_coord_offset_x; + // Integer coordinate + const int xi = static_cast<int>(std::floor(xi_f)); + // Weight for the x coordinate + const float a = (xi_f - static_cast<float>(xi)); + const float b = (1.f - a); + + const float s00_s = b * b1; + const float s01_s = a * b1; + const float s10_s = b * a1; + const float s11_s = a * a1; + + const auto s00 = wrapper::vdup_n(s00_s, FloatTagType{}); + const auto s01 = wrapper::vdup_n(s01_s, FloatTagType{}); + const auto s10 = wrapper::vdup_n(s10_s, FloatTagType{}); + const auto s11 = wrapper::vdup_n(s11_s, FloatTagType{}); + + const int xi0 = utility::clamp<int>(xi, 0, input_width - 1); + const int xi1 = utility::clamp<int>(xi + 1, 0, input_width - 1); + + const auto in_ptr_xi0_yi0 = in_ptr_yi0 + xi0 * in_stride_x; + const auto in_ptr_xi1_yi0 = in_ptr_yi0 + xi1 * in_stride_x; + const auto in_ptr_xi0_yi1 = in_ptr_yi1 + xi0 * in_stride_x; + const auto in_ptr_xi1_yi1 = in_ptr_yi1 + xi1 * in_stride_x; + + int8_t *out_ptr_xo_yo = out_ptr_yo + xo * out_stride_x; + + int cout = 0; + for (; cout <= (out_dim_ch - step_cout); cout += step_cout) + { + const auto in00 = wrapper::vloadq(in_ptr_xi0_yi0 + cout * sizeof(int8_t)); + const auto in01 = wrapper::vloadq(in_ptr_xi1_yi0 + cout * sizeof(int8_t)); + const auto in10 = wrapper::vloadq(in_ptr_xi0_yi1 + cout * sizeof(int8_t)); + const auto in11 = wrapper::vloadq(in_ptr_xi1_yi1 + cout * sizeof(int8_t)); + + const int16x8_t in00_low = wrapper::vmovl(wrapper::vgetlow(in00)); + const int16x8_t in00_high = wrapper::vmovl(wrapper::vgethigh(in00)); + + const auto in00_0 = wrapper::vmul( + wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in00_low)), voffset_in)), + vscale_in); + const auto in00_1 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub( + wrapper::vmovl(wrapper::vgethigh(in00_low)), voffset_in)), + vscale_in); + const auto in00_2 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub( + wrapper::vmovl(wrapper::vgetlow(in00_high)), voffset_in)), + vscale_in); + const auto in00_3 = + wrapper::vmul(wrapper::vcvt<float>( + wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in00_high)), voffset_in)), + vscale_in); + + const int16x8_t in01_low = wrapper::vmovl(wrapper::vgetlow(in01)); + const int16x8_t in01_high = wrapper::vmovl(wrapper::vgethigh(in01)); + + const auto in01_0 = wrapper::vmul( + wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in01_low)), voffset_in)), + vscale_in); + const auto in01_1 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub( + wrapper::vmovl(wrapper::vgethigh(in01_low)), voffset_in)), + vscale_in); + const auto in01_2 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub( + wrapper::vmovl(wrapper::vgetlow(in01_high)), voffset_in)), + vscale_in); + const auto in01_3 = + wrapper::vmul(wrapper::vcvt<float>( + wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in01_high)), voffset_in)), + vscale_in); + + const int16x8_t in10_low = wrapper::vmovl(wrapper::vgetlow(in10)); + const int16x8_t in10_high = wrapper::vmovl(wrapper::vgethigh(in10)); + + const auto in10_0 = wrapper::vmul( + wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in10_low)), voffset_in)), + vscale_in); + const auto in10_1 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub( + wrapper::vmovl(wrapper::vgethigh(in10_low)), voffset_in)), + vscale_in); + const auto 
in10_2 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub( + wrapper::vmovl(wrapper::vgetlow(in10_high)), voffset_in)), + vscale_in); + const auto in10_3 = + wrapper::vmul(wrapper::vcvt<float>( + wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in10_high)), voffset_in)), + vscale_in); + + const int16x8_t in11_low = wrapper::vmovl(wrapper::vgetlow(in11)); + const int16x8_t in11_high = wrapper::vmovl(wrapper::vgethigh(in11)); + + const auto in11_0 = wrapper::vmul( + wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in11_low)), voffset_in)), + vscale_in); + const auto in11_1 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub( + wrapper::vmovl(wrapper::vgethigh(in11_low)), voffset_in)), + vscale_in); + const auto in11_2 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub( + wrapper::vmovl(wrapper::vgetlow(in11_high)), voffset_in)), + vscale_in); + const auto in11_3 = + wrapper::vmul(wrapper::vcvt<float>( + wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in11_high)), voffset_in)), + vscale_in); + + auto out_0 = wrapper::vmul(in00_0, s00); + out_0 = wrapper::vmla(out_0, in01_0, s01); + out_0 = wrapper::vmla(out_0, in10_0, s10); + out_0 = wrapper::vmla(out_0, in11_0, s11); + + auto out_1 = wrapper::vmul(in00_1, s00); + out_1 = wrapper::vmla(out_1, in01_1, s01); + out_1 = wrapper::vmla(out_1, in10_1, s10); + out_1 = wrapper::vmla(out_1, in11_1, s11); + + auto out_2 = wrapper::vmul(in00_2, s00); + out_2 = wrapper::vmla(out_2, in01_2, s01); + out_2 = wrapper::vmla(out_2, in10_2, s10); + out_2 = wrapper::vmla(out_2, in11_2, s11); + + auto out_3 = wrapper::vmul(in00_3, s00); + out_3 = wrapper::vmla(out_3, in01_3, s01); + out_3 = wrapper::vmla(out_3, in10_3, s10); + out_3 = wrapper::vmla(out_3, in11_3, s11); + +#if defined(__aarch64__) && !defined(BARE_METAL) + const auto out_0_int = wrapper::vcvta<int32_t>(wrapper::vmla(voffset_o, out_0, invvscale_o)); + const auto out_1_int = wrapper::vcvta<int32_t>(wrapper::vmla(voffset_o, out_1, invvscale_o)); + const auto out_2_int = wrapper::vcvta<int32_t>(wrapper::vmla(voffset_o, out_2, invvscale_o)); + const auto out_3_int = wrapper::vcvta<int32_t>(wrapper::vmla(voffset_o, out_3, invvscale_o)); +#else // defined(__aarch64__) && !defined(BARE_METAL) + const auto out_0_int = wrapper::vcvt<int32_t>(wrapper::vmla(voffset_o, out_0, invvscale_o)); + const auto out_1_int = wrapper::vcvt<int32_t>(wrapper::vmla(voffset_o, out_1, invvscale_o)); + const auto out_2_int = wrapper::vcvt<int32_t>(wrapper::vmla(voffset_o, out_2, invvscale_o)); + const auto out_3_int = wrapper::vcvt<int32_t>(wrapper::vmla(voffset_o, out_3, invvscale_o)); +#endif // defined(__aarch64__) && !defined(BARE_METAL) + const auto low_part = + wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_0_int), wrapper::vqmovn(out_1_int))); + const auto high_part = + wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_2_int), wrapper::vqmovn(out_3_int))); + const auto out = wrapper::vcombine(low_part, high_part); + + wrapper::vstore(out_ptr_xo_yo + cout * sizeof(int8_t), out); + } + + for (; cout < out_dim_ch; ++cout) + { + const int8_t in00 = *(in_ptr_xi0_yi0 + cout * sizeof(int8_t)); + const int8_t in01 = *(in_ptr_xi1_yi0 + cout * sizeof(int8_t)); + const int8_t in10 = *(in_ptr_xi0_yi1 + cout * sizeof(int8_t)); + const int8_t in11 = *(in_ptr_xi1_yi1 + cout * sizeof(int8_t)); + + const float in00_f = (static_cast<int32_t>(in00) - iq_info.offset) * iq_info.scale; + const float in01_f = (static_cast<int32_t>(in01) - iq_info.offset) * iq_info.scale; + const float in10_f = 
(static_cast<int32_t>(in10) - iq_info.offset) * iq_info.scale; + const float in11_f = (static_cast<int32_t>(in11) - iq_info.offset) * iq_info.scale; + + float out = in00_f * s00_s; + out += in01_f * s01_s; + out += in10_f * s10_s; + out += in11_f * s11_s; + + // Rounding modes of vector and scalar loops should match +#if defined(__aarch64__) && !defined(BARE_METAL) + *(out_ptr_xo_yo + cout * sizeof(int8_t)) = quantize_qasymm8_signed(out, oq_info); +#else // defined(__aarch64__) && !defined(BARE_METAL) + *(out_ptr_xo_yo + cout * sizeof(int8_t)) = + quantize_qasymm8_signed(out, oq_info, RoundingPolicy::TO_ZERO); +#endif // defined(__aarch64__) && !defined(BARE_METAL) + } + } + } + } + } + else + { + ARM_COMPUTE_ERROR("Not implemented"); + } +} +} // namespace +namespace cpu +{ +void qasymm8_signed_neon_scale(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + InterpolationPolicy policy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) +{ + if (policy == InterpolationPolicy::BILINEAR) + { + if (src->info()->quantization_info() == dst->info()->quantization_info() && + border_mode == BorderMode::REPLICATE) + { + s8_neon_scale(src, dst, offsets, dx, dy, policy, border_mode, constant_border_value, sampling_offset, + align_corners, window); + } + else + { + qasymm8_signed_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, + sampling_offset, align_corners, window); + } + } + else if (policy == InterpolationPolicy::NEAREST_NEIGHBOR) + { + nearest_neon_scale<int8_t>(src, dst, offsets, sampling_offset, align_corners, window); + } +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/scale/sve/fp16.cpp b/src/cpu/kernels/scale/sve/fp16.cpp new file mode 100644 index 0000000000..cb28f4cb1c --- /dev/null +++ b/src/cpu/kernels/scale/sve/fp16.cpp @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2021-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
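Both bilinear paths above (the constant-border variant and the replicate variant) reduce, for every output pixel, to the same per-pixel computation: dequantise the four int8 neighbours, blend them with the weights s00..s11 derived from the fractional x/y offsets, and requantise the result. Below is a minimal standalone sketch of that computation; QuantInfo, dequantize and quantize are hypothetical stand-ins for ACL's UniformQuantizationInfo and quantisation helpers, and plain round-to-nearest is used where the real kernel matches the rounding mode of its vector loop (round-to-nearest on AArch64, round-towards-zero otherwise).

#include <algorithm>
#include <cmath>
#include <cstdint>

struct QuantInfo
{
    float   scale;
    int32_t offset;
};

static float dequantize(int8_t q, QuantInfo qi)
{
    return (static_cast<int32_t>(q) - qi.offset) * qi.scale;
}

static int8_t quantize(float v, QuantInfo qi)
{
    const int32_t q = static_cast<int32_t>(std::lround(v / qi.scale)) + qi.offset;
    return static_cast<int8_t>(std::max(-128, std::min(127, q)));
}

// dx_val/dy_val are the fractional offsets of the sampling point inside the 2x2 neighbourhood
int8_t bilinear_qasymm8_signed(int8_t a00, int8_t a01, int8_t a10, int8_t a11,
                               float dx_val, float dy_val, QuantInfo in_q, QuantInfo out_q)
{
    const float w00 = (1.f - dx_val) * (1.f - dy_val); // same pairing as s00..s11 above
    const float w01 = dx_val * (1.f - dy_val);
    const float w10 = (1.f - dx_val) * dy_val;
    const float w11 = dx_val * dy_val;
    const float acc = dequantize(a00, in_q) * w00 + dequantize(a01, in_q) * w01 +
                      dequantize(a10, in_q) * w10 + dequantize(a11, in_q) * w11;
    return quantize(acc, out_q);
}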
+ */ + +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensorPack.h" +#include "arm_compute/core/Window.h" + +#include "src/core/helpers/ScaleHelpers.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/utils/ScaleUtils.h" +#include "support/Rounding.h" + +#include <arm_sve.h> +#include <cmath> +#include <cstddef> + +namespace arm_compute +{ +namespace +{ +void fp16_sve_scale_nearest(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + float sampling_offset, + bool align_corners, + const Window &window) +{ + const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; + const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; + const size_t in_stride_wc = in_stride_w * in_stride_c; + const size_t in_dim_h = src->info()->dimension(2); + + // Compute the ratio between source height and destination height + const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners); + const auto window_start_x = static_cast<int32_t>(window.x().start()); + const auto window_end_x = static_cast<int32_t>(window.x().end()); + + Window win(window); + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + Iterator out(dst, win); + + const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); + const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; + + execute_window_loop( + win, + [&](const Coordinates &id) + { + const int32_t offset = + *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; + const auto in_hi = static_cast<int>( + align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) + : std::floor((id.z() + sampling_offset) * hr)); + const int offset_row = in_hi * in_stride_wc; + const auto in_ptr = reinterpret_cast<const float16_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]); + const auto out_ptr = reinterpret_cast<float16_t *>(out.ptr()); + + // Compute S elements per iteration + int x = window_start_x; + svbool_t pg = svwhilelt_b16(x, window_end_x); + do + { + // Store results + svst1_f16(pg, out_ptr + x, svld1_f16(pg, in_ptr + offset + offset_row + x)); + + x += svcntw(); + pg = svwhilelt_b16(x, window_end_x); + } while (svptest_any(svptrue_b16(), pg)); + }, + out); +} +} // namespace +namespace cpu +{ +void fp16_sve_scale(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + InterpolationPolicy policy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) +{ + ARM_COMPUTE_UNUSED(dx, dy, border_mode, constant_border_value); + if (policy == InterpolationPolicy::NEAREST_NEIGHBOR) + { + fp16_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); + } + else + { + ARM_COMPUTE_ERROR("Not implemented"); + } +} +} // namespace cpu +} // namespace arm_compute +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/scale/sve/fp32.cpp b/src/cpu/kernels/scale/sve/fp32.cpp new file mode 100644 index 0000000000..cbb345edbb --- /dev/null +++ b/src/cpu/kernels/scale/sve/fp32.cpp @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2021-2022 Arm Limited. 
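The SVE nearest-neighbour kernels in this and the following files all share the same predicated inner loop: svwhilelt builds a governing predicate, a single load/store pair copies the selected input row, and the final partial vector is handled by the predicate itself rather than by a scalar tail. A minimal standalone sketch of that loop shape for fp32 data follows; the function name is hypothetical and the code assumes an SVE-enabled toolchain.

#include <arm_sve.h>
#include <cstdint>

void sve_copy_row_f32(const float *src, float *dst, int32_t start, int32_t end)
{
    int32_t  x  = start;
    svbool_t pg = svwhilelt_b32(x, end); // lanes with x + lane < end are active
    do
    {
        // Inactive lanes are neither loaded nor stored, so no scalar tail is needed
        svst1_f32(pg, dst + x, svld1_f32(pg, src + x));
        x += static_cast<int32_t>(svcntw()); // number of 32-bit lanes per vector
        pg = svwhilelt_b32(x, end);
    } while (svptest_any(svptrue_b32(), pg));
}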
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensorPack.h" +#include "arm_compute/core/Window.h" + +#include "src/core/helpers/ScaleHelpers.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/utils/ScaleUtils.h" +#include "support/Rounding.h" + +#include <arm_sve.h> +#include <cmath> +#include <cstddef> + +namespace arm_compute +{ +namespace +{ +void fp32_sve_scale_nearest(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + float sampling_offset, + bool align_corners, + const Window &window) +{ + const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; + const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; + const size_t in_stride_wc = in_stride_w * in_stride_c; + const size_t in_dim_h = src->info()->dimension(2); + + // Compute the ratio between source height and destination height + const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners); + const auto window_start_x = static_cast<int32_t>(window.x().start()); + const auto window_end_x = static_cast<int32_t>(window.x().end()); + + Window win(window); + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + Iterator out(dst, win); + + const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); + const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; + + execute_window_loop( + win, + [&](const Coordinates &id) + { + const int32_t offset = + *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; + const auto in_hi = static_cast<int>( + align_corners ? 
utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) + : std::floor((id.z() + sampling_offset) * hr)); + const int offset_row = in_hi * in_stride_wc; + const auto in_ptr = reinterpret_cast<const float *>(in_ptr_start + in_stride_bytes_hwc * id[3]); + const auto out_ptr = reinterpret_cast<float *>(out.ptr()); + + // Compute S elements per iteration + int x = window_start_x; + svbool_t pg = svwhilelt_b32(x, window_end_x); + do + { + // Store results + svst1_f32(pg, out_ptr + x, svld1_f32(pg, in_ptr + offset + offset_row + x)); + + x += svcntw(); + pg = svwhilelt_b32(x, window_end_x); + } while (svptest_any(svptrue_b32(), pg)); + }, + out); +} +} // namespace +namespace cpu +{ +void fp32_sve_scale(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + InterpolationPolicy policy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) +{ + ARM_COMPUTE_UNUSED(dx, dy, border_mode, constant_border_value); + if (policy == InterpolationPolicy::NEAREST_NEIGHBOR) + { + fp32_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); + } + else + { + ARM_COMPUTE_ERROR("Not implemented"); + } +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/scale/sve/integer.cpp b/src/cpu/kernels/scale/sve/integer.cpp new file mode 100644 index 0000000000..df950b1789 --- /dev/null +++ b/src/cpu/kernels/scale/sve/integer.cpp @@ -0,0 +1,195 @@ +/* + * Copyright (c) 2021-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
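In these nearest-neighbour kernels the offsets tensor supplies the horizontal source index, while the vertical index in_hi is recomputed per row from the height ratio hr. A scalar sketch of that mapping follows; resize_ratio() is a simplified stand-in for scale_utils::calculate_resize_ratio() under the assumption that it returns (in - 1)/(out - 1) when corners are aligned and in/out otherwise, and the final clamp is added for safety rather than taken from the kernel.

#include <algorithm>
#include <cmath>

// Assumed behaviour of the resize-ratio helper (hedged stand-in, not the ACL function)
float resize_ratio(int in_dim, int out_dim, bool align_corners)
{
    return align_corners ? static_cast<float>(in_dim - 1) / static_cast<float>(out_dim - 1)
                         : static_cast<float>(in_dim) / static_cast<float>(out_dim);
}

// Mirrors the in_hi computation above: round half away from zero when corners are
// aligned, floor otherwise
int map_nearest_row(int yo, float hr, float sampling_offset, bool align_corners, int in_dim_h)
{
    const float yi_f = (yo + sampling_offset) * hr;
    const int   yi   = align_corners ? static_cast<int>(std::lround(yi_f))
                                     : static_cast<int>(std::floor(yi_f));
    return std::max(0, std::min(yi, in_dim_h - 1)); // defensive clamp
}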
+ */ + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensorPack.h" +#include "arm_compute/core/Window.h" + +#include "src/core/helpers/ScaleHelpers.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/utils/ScaleUtils.h" +#include "support/Rounding.h" + +#include <arm_sve.h> +#include <cmath> +#include <cstddef> + +namespace arm_compute +{ +namespace +{ +void u8_sve_scale_nearest(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + float sampling_offset, + bool align_corners, + const Window &window) +{ + const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; + const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; + const size_t in_stride_wc = in_stride_w * in_stride_c; + const size_t in_dim_h = src->info()->dimension(2); + + // Compute the ratio between source height and destination height + const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners); + const auto window_start_x = static_cast<int32_t>(window.x().start()); + const auto window_end_x = static_cast<int32_t>(window.x().end()); + + Window win(window); + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + Iterator out(dst, win); + + const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); + const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; + + execute_window_loop( + win, + [&](const Coordinates &id) + { + const int32_t offset = + *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; + const auto in_hi = static_cast<int>( + align_corners ? 
utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) + : std::floor((id.z() + sampling_offset) * hr)); + const int offset_row = in_hi * in_stride_wc; + const auto in_ptr = reinterpret_cast<const uint8_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]); + const auto out_ptr = reinterpret_cast<uint8_t *>(out.ptr()); + + // Compute S elements per iteration + int x = window_start_x; + svbool_t pg = svwhilelt_b8(x, window_end_x); + do + { + // Store results + svst1_u8(pg, out_ptr + x, svld1_u8(pg, in_ptr + offset + offset_row + x)); + + x += svcntw(); + pg = svwhilelt_b8(x, window_end_x); + } while (svptest_any(svptrue_b8(), pg)); + }, + out); +} + +void s16_sve_scale_nearest(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + float sampling_offset, + bool align_corners, + const Window &window) +{ + const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; + const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; + const size_t in_stride_wc = in_stride_w * in_stride_c; + const size_t in_dim_h = src->info()->dimension(2); + + // Compute the ratio between source height and destination height + const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners); + const auto window_start_x = static_cast<int32_t>(window.x().start()); + const auto window_end_x = static_cast<int32_t>(window.x().end()); + + Window win(window); + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + Iterator out(dst, win); + + const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); + const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; + + execute_window_loop( + win, + [&](const Coordinates &id) + { + const int32_t offset = + *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; + const auto in_hi = static_cast<int>( + align_corners ? 
utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) + : std::floor((id.z() + sampling_offset) * hr)); + const int offset_row = in_hi * in_stride_wc; + const auto in_ptr = reinterpret_cast<const int16_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]); + const auto out_ptr = reinterpret_cast<int16_t *>(out.ptr()); + + // Compute S elements per iteration + int x = window_start_x; + svbool_t pg = svwhilelt_b16(x, window_end_x); + do + { + // Store results + svst1_s16(pg, out_ptr + x, svld1_s16(pg, in_ptr + offset + offset_row + x)); + + x += svcntw(); + pg = svwhilelt_b16(x, window_end_x); + } while (svptest_any(svptrue_b16(), pg)); + }, + out); +} +} // namespace +namespace cpu +{ +void u8_sve_scale(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + InterpolationPolicy policy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) +{ + ARM_COMPUTE_UNUSED(dx, dy, border_mode, constant_border_value); + if (policy == InterpolationPolicy::NEAREST_NEIGHBOR) + { + u8_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); + } + else + { + ARM_COMPUTE_ERROR("Not Implemented"); + } +} + +void s16_sve_scale(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + InterpolationPolicy policy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) +{ + ARM_COMPUTE_UNUSED(dx, dy, border_mode, constant_border_value); + if (policy == InterpolationPolicy::NEAREST_NEIGHBOR) + { + s16_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); + } + else + { + ARM_COMPUTE_ERROR("Not Implemented"); + } +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/scale/sve/list.h b/src/cpu/kernels/scale/sve/list.h new file mode 100644 index 0000000000..aff741a4a7 --- /dev/null +++ b/src/cpu/kernels/scale/sve/list.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef SRC_CORE_SVE_KERNELS_SCALE_LIST_H +#define SRC_CORE_SVE_KERNELS_SCALE_LIST_H + +namespace arm_compute +{ +namespace cpu +{ +#define DECLARE_SCALE_KERNEL(func_name) \ + void func_name(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, \ + InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, \ + float sampling_offset, bool align_corners, const Window &window) + +DECLARE_SCALE_KERNEL(fp16_sve_scale); +DECLARE_SCALE_KERNEL(fp32_sve_scale); +DECLARE_SCALE_KERNEL(s16_sve_scale); +DECLARE_SCALE_KERNEL(u8_sve_scale); +DECLARE_SCALE_KERNEL(qasymm8_sve_scale); +DECLARE_SCALE_KERNEL(qasymm8_signed_sve_scale); + +#undef DECLARE_SCALE_KERNEL +} // namespace cpu +} // namespace arm_compute + +#endif /* SRC_CORE_SVE_KERNELS_SCALE_LIST_H */ diff --git a/src/cpu/kernels/scale/sve/qasymm8.cpp b/src/cpu/kernels/scale/sve/qasymm8.cpp new file mode 100644 index 0000000000..0fc794c6c2 --- /dev/null +++ b/src/cpu/kernels/scale/sve/qasymm8.cpp @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2021-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
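scale/sve/list.h above stamps out one forward declaration per data type from a single macro, so the kernel table can refer to every variant through an identical signature, and then undefines the macro so it does not leak into including files. A reduced sketch of the same pattern, with hypothetical names and plain pointers in place of ITensor and the other ACL types:

#include <cstddef>

#define DECLARE_COPY_KERNEL(func_name) \
    void func_name(const void *src, void *dst, std::size_t bytes)

DECLARE_COPY_KERNEL(fp32_copy_kernel);
DECLARE_COPY_KERNEL(u8_copy_kernel);

#undef DECLARE_COPY_KERNEL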
+ */ + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensorPack.h" +#include "arm_compute/core/Window.h" + +#include "src/core/helpers/ScaleHelpers.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/utils/ScaleUtils.h" +#include "support/Rounding.h" + +#include <arm_sve.h> +#include <cmath> +#include <cstddef> + +namespace arm_compute +{ +namespace +{ +void qasymm8_sve_scale_nearest(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + float sampling_offset, + bool align_corners, + const Window &window) +{ + const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; + const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; + const size_t in_stride_wc = in_stride_w * in_stride_c; + const size_t in_dim_h = src->info()->dimension(2); + + // Compute the ratio between source height and destination height + const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners); + const auto window_start_x = static_cast<int32_t>(window.x().start()); + const auto window_end_x = static_cast<int32_t>(window.x().end()); + + Window win(window); + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + Iterator out(dst, win); + + const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); + const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; + + execute_window_loop( + win, + [&](const Coordinates &id) + { + const int32_t offset = + *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; + const auto in_hi = static_cast<int>( + align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) + : std::floor((id.z() + sampling_offset) * hr)); + const int offset_row = in_hi * in_stride_wc; + const auto in_ptr = reinterpret_cast<const uint8_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]); + const auto out_ptr = reinterpret_cast<uint8_t *>(out.ptr()); + + // Compute S elements per iteration + int x = window_start_x; + svbool_t pg = svwhilelt_b8(x, window_end_x); + do + { + // Store results + svst1_u8(pg, out_ptr + x, svld1_u8(pg, in_ptr + offset + offset_row + x)); + + x += svcntw(); + pg = svwhilelt_b8(x, window_end_x); + } while (svptest_any(svptrue_b8(), pg)); + }, + out); +} +} // namespace +namespace cpu +{ +void qasymm8_sve_scale(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + InterpolationPolicy policy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) +{ + ARM_COMPUTE_UNUSED(dx, dy, border_mode, constant_border_value); + if (policy == InterpolationPolicy::NEAREST_NEIGHBOR) + { + qasymm8_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); + } + else + { + ARM_COMPUTE_ERROR("Not Implemented"); + } +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/scale/sve/qasymm8_signed.cpp b/src/cpu/kernels/scale/sve/qasymm8_signed.cpp new file mode 100644 index 0000000000..68ea01e29e --- /dev/null +++ b/src/cpu/kernels/scale/sve/qasymm8_signed.cpp @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2021-2022 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensorPack.h" +#include "arm_compute/core/Window.h" + +#include "src/core/helpers/ScaleHelpers.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/utils/ScaleUtils.h" +#include "support/Rounding.h" + +#include <arm_sve.h> +#include <cmath> +#include <cstddef> + +namespace arm_compute +{ +namespace +{ +void qasymm8_signed_sve_scale_nearest(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + float sampling_offset, + bool align_corners, + const Window &window) +{ + const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; + const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; + const size_t in_stride_wc = in_stride_w * in_stride_c; + const size_t in_dim_h = src->info()->dimension(2); + + // Compute the ratio between source height and destination height + const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners); + const auto window_start_x = static_cast<int32_t>(window.x().start()); + const auto window_end_x = static_cast<int32_t>(window.x().end()); + + Window win(window); + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + Iterator out(dst, win); + + const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); + const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; + + execute_window_loop( + win, + [&](const Coordinates &id) + { + const int32_t offset = + *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; + const auto in_hi = static_cast<int>( + align_corners ? 
utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) + : std::floor((id.z() + sampling_offset) * hr)); + const int offset_row = in_hi * in_stride_wc; + const auto in_ptr = reinterpret_cast<const int8_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]); + const auto out_ptr = reinterpret_cast<int8_t *>(out.ptr()); + + // Compute S elements per iteration + int x = window_start_x; + svbool_t pg = svwhilelt_b8(x, window_end_x); + do + { + // Store results + svst1_s8(pg, out_ptr + x, svld1_s8(pg, in_ptr + offset + offset_row + x)); + + x += svcntw(); + pg = svwhilelt_b8(x, window_end_x); + } while (svptest_any(svptrue_b8(), pg)); + }, + out); +} +} // namespace +namespace cpu +{ +void qasymm8_signed_sve_scale(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + InterpolationPolicy policy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) +{ + ARM_COMPUTE_UNUSED(dx, dy, border_mode, constant_border_value); + if (policy == InterpolationPolicy::NEAREST_NEIGHBOR) + { + qasymm8_signed_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); + } + else + { + ARM_COMPUTE_ERROR("Not Implemented"); + } +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/select/generic/neon/fp16.cpp b/src/cpu/kernels/select/generic/neon/fp16.cpp new file mode 100644 index 0000000000..38a58099bd --- /dev/null +++ b/src/cpu/kernels/select/generic/neon/fp16.cpp @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + +#include "arm_compute/core/Helpers.h" + +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/kernels/select/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void neon_f16_select_same_rank( + const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) +{ + return select_op_16<float16_t, uint16x8_t>(c, x, y, output, window); +} +void neon_f16_select_not_same_rank( + const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) +{ + return select_op_not_same_rank<float16_t>(c, x, y, output, window); +} + +} // namespace cpu + +} // namespace arm_compute + +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/select/generic/neon/fp32.cpp b/src/cpu/kernels/select/generic/neon/fp32.cpp new file mode 100644 index 0000000000..50a80cb338 --- /dev/null +++ b/src/cpu/kernels/select/generic/neon/fp32.cpp @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/Helpers.h" + +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/kernels/select/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void neon_f32_select_same_rank( + const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) +{ + return select_op_32<float, uint32x4_t>(c, x, y, output, window); +} +void neon_f32_select_not_same_rank( + const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) +{ + return select_op_not_same_rank<float>(c, x, y, output, window); +} + +} // namespace cpu + +} // namespace arm_compute diff --git a/src/cpu/kernels/select/generic/neon/impl.h b/src/cpu/kernels/select/generic/neon/impl.h new file mode 100644 index 0000000000..7ce640b6ff --- /dev/null +++ b/src/cpu/kernels/select/generic/neon/impl.h @@ -0,0 +1,177 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_CPU_KERNELS_SELECT_GENERIC_NEON_IMPL_H +#define ACL_SRC_CPU_KERNELS_SELECT_GENERIC_NEON_IMPL_H + +#include "arm_compute/core/TensorInfo.h" + +#include "src/core/NEON/NEAsymm.h" +#include "src/cpu/kernels/select/generic/neon/impl.h" + +#include <arm_neon.h> +#include <map> +#include <string> + +namespace arm_compute +{ +namespace cpu +{ +template <typename ScalarType, typename VectorType> +void select_op(const ITensor *cond, + const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window, + const int window_step_x, + const int window_start_x, + const int window_end_x, + const int limit, + VectorType (*condition_conversion)(const uint8_t *)) +{ + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator condition(cond, win); + Iterator input1(in1, win); + Iterator input2(in2, win); + Iterator output(out, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr()); + const auto condition_ptr = reinterpret_cast<const uint8_t *>(condition.ptr()); + const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr()); + + int x = window_start_x; + for (; x <= limit; x += window_step_x) + { + const auto c = (*condition_conversion)(condition_ptr + x); + const auto a = wrapper::vloadq(input1_ptr + x); + const auto b = wrapper::vloadq(input2_ptr + x); + wrapper::vstore(output_ptr + x, wrapper::vbsl(c, a, b)); + } + for (; x < window_end_x; ++x) + { + const auto c = *(condition_ptr + x); + const auto a = *(input1_ptr + x); + const auto b = *(input2_ptr + x); + *(output_ptr + x) = static_cast<bool>(c) ? 
a : b; + } + }, + condition, input1, input2, output); +} + +template <typename ScalarType, typename VectorType> +void select_op_8(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + const auto window_step_x = 16 / sizeof(ScalarType); + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + select_op<ScalarType, VectorType>( + cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, + [](const uint8_t *condition_ptr) -> VectorType + { + static const auto zero = + wrapper::vdup_n(static_cast<uint8_t>(0), arm_compute::wrapper::traits::vector_128_tag()); + return wrapper::vcgt(wrapper::vloadq(condition_ptr), zero); + }); +} + +template <typename ScalarType, typename VectorType> +void select_op_16(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + const auto window_step_x = 16 / sizeof(ScalarType); + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + select_op<ScalarType, VectorType>( + cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, + [](const uint8_t *condition_ptr) -> VectorType + { + static const auto zero = + wrapper::vdup_n(static_cast<uint16_t>(0), arm_compute::wrapper::traits::vector_128_tag()); + return wrapper::vcgt(wrapper::vmovl(wrapper::vload(condition_ptr)), zero); + }); +} + +template <typename ScalarType, typename VectorType> +void select_op_32(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + const auto window_step_x = 16 / sizeof(ScalarType); + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + select_op<ScalarType, VectorType>( + cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, + [](const uint8_t *condition_ptr) -> VectorType + { + static const auto zero = + wrapper::vdup_n(static_cast<uint32_t>(0), arm_compute::wrapper::traits::vector_128_tag()); + return wrapper::vcgt(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vload(condition_ptr)))), zero); + }); +} + +template <typename ScalarType> +void select_op_not_same_rank( + const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + ARM_COMPUTE_UNUSED(window); + + auto output_ptr = reinterpret_cast<ScalarType *>(out->buffer()); + const auto condition_ptr = reinterpret_cast<const uint8_t *>(cond->buffer()); + const auto input1_ptr = reinterpret_cast<const ScalarType *>(in1->buffer()); + const auto input2_ptr = reinterpret_cast<const ScalarType *>(in2->buffer()); + + const int outer_size = cond->info()->total_size() / cond->info()->element_size(); + const int inner_size = (in1->info()->total_size() / in1->info()->element_size()) / outer_size; + int offset = 0; + const int step = 16 / in1->info()->element_size(); + + for (int i = 0; i < outer_size; ++i) + { + int x = offset; + const auto input_ptr = static_cast<bool>(*(condition_ptr + i)) ? 
input1_ptr : input2_ptr; + for (; x <= offset + inner_size - step; x += step) + { + wrapper::vstore(output_ptr + x, wrapper::vloadq(input_ptr + x)); + } + if (x <= offset + inner_size - (step / 2)) + { + wrapper::vstore(output_ptr + x, wrapper::vload(input_ptr + x)); + x += step / 2; + } + for (; x < offset + inner_size; ++x) + { + *(output_ptr + x) = *(input_ptr + x); + } + offset += inner_size; + } +} +} // namespace cpu +} // namespace arm_compute +#endif // ACL_SRC_CPU_KERNELS_SELECT_GENERIC_NEON_IMPL_H diff --git a/src/cpu/kernels/select/generic/neon/integer.cpp b/src/cpu/kernels/select/generic/neon/integer.cpp new file mode 100644 index 0000000000..135087c261 --- /dev/null +++ b/src/cpu/kernels/select/generic/neon/integer.cpp @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
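select_op() above is the generic engine shared by the same-rank select kernels: a per-type conversion callback widens the uint8 condition into a lane mask of the element width, wrapper::vbsl() blends the two inputs lane by lane, and a scalar loop finishes the leftover elements. A standalone NEON sketch of the fp32 case follows; the function name is hypothetical, raw pointers replace the Iterator plumbing, and the condition bytes are widened lane by lane here because the sketch has no padded tensors to over-read into.

#include <arm_neon.h>
#include <cstdint>

void select_f32(const uint8_t *c, const float *x, const float *y, float *out, int n)
{
    int i = 0;
    for (; i + 4 <= n; i += 4)
    {
        // Widen four condition bytes to 32-bit lanes, then turn non-zero into an all-ones mask
        uint32x4_t c32 = vdupq_n_u32(0);
        c32 = vsetq_lane_u32(c[i + 0], c32, 0);
        c32 = vsetq_lane_u32(c[i + 1], c32, 1);
        c32 = vsetq_lane_u32(c[i + 2], c32, 2);
        c32 = vsetq_lane_u32(c[i + 3], c32, 3);
        const uint32x4_t mask = vcgtq_u32(c32, vdupq_n_u32(0));

        // Bitwise select: take x where the mask is set, y elsewhere
        vst1q_f32(out + i, vbslq_f32(mask, vld1q_f32(x + i), vld1q_f32(y + i)));
    }
    for (; i < n; ++i) // scalar tail, as in select_op()
    {
        out[i] = c[i] ? x[i] : y[i];
    }
}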
+ */ +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" + +#include "src/cpu/kernels/select/generic/neon/impl.h" + +#include <arm_neon.h> + +namespace arm_compute +{ +namespace cpu +{ +void neon_s8_select_same_rank( + const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) +{ + return select_op_8<int8_t, uint8x16_t>(c, x, y, output, window); +} +void neon_s16_select_same_rank( + const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) +{ + return select_op_16<int16_t, uint16x8_t>(c, x, y, output, window); +} +void neon_s32_select_same_rank( + const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) +{ + return select_op_32<int32_t, uint32x4_t>(c, x, y, output, window); +} +void neon_s8_select_not_same_rank( + const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) +{ + return select_op_not_same_rank<int8_t>(c, x, y, output, window); +} +void neon_s16_select_not_same_rank( + const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) +{ + return select_op_not_same_rank<int16_t>(c, x, y, output, window); +} +void neon_s32_select_not_same_rank( + const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) +{ + return select_op_not_same_rank<int32_t>(c, x, y, output, window); +} +void neon_u8_select_same_rank( + const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) +{ + return select_op_8<uint8_t, uint8x16_t>(c, x, y, output, window); +} +void neon_u16_select_same_rank( + const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) +{ + return select_op_16<uint16_t, uint16x8_t>(c, x, y, output, window); +} +void neon_u32_select_same_rank( + const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) +{ + return select_op_32<uint32_t, uint32x4_t>(c, x, y, output, window); +} +void neon_u8_select_not_same_rank( + const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) +{ + return select_op_not_same_rank<uint8_t>(c, x, y, output, window); +} +void neon_u16_select_not_same_rank( + const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) +{ + return select_op_not_same_rank<uint16_t>(c, x, y, output, window); +} +void neon_u32_select_not_same_rank( + const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) +{ + return select_op_not_same_rank<uint32_t>(c, x, y, output, window); +} + +} // namespace cpu + +} // namespace arm_compute diff --git a/src/cpu/kernels/select/list.h b/src/cpu/kernels/select/list.h new file mode 100644 index 0000000000..c33a25f6d6 --- /dev/null +++ b/src/cpu/kernels/select/list.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2022 Arm Limited. 
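select_op_not_same_rank() above covers the broadcast case: the condition holds one value per outer slice, and that single value decides whether the whole inner_size-element block comes from x or from y. A minimal scalar sketch of the same behaviour follows; the name is hypothetical and memcpy stands in for the kernel's full-vector, half-vector and scalar copy sequence.

#include <cstddef>
#include <cstdint>
#include <cstring>

void select_broadcast_f32(const uint8_t *c, const float *x, const float *y, float *out,
                          int outer_size, int inner_size)
{
    for (int i = 0; i < outer_size; ++i)
    {
        // One condition value per outer slice selects the whole block
        const float *src = c[i] ? x : y;
        std::memcpy(out + i * inner_size, src + i * inner_size,
                    static_cast<std::size_t>(inner_size) * sizeof(float));
    }
}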
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CORE_NEON_KERNELS_SELECT_LIST_H +#define SRC_CORE_NEON_KERNELS_SELECT_LIST_H + +#include "arm_compute/core/ITensor.h" + +namespace arm_compute +{ +namespace cpu +{ +#define DECLARE_SELECT_KERNEL(func_name) \ + void func_name(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) + +DECLARE_SELECT_KERNEL(neon_s8_select_same_rank); +DECLARE_SELECT_KERNEL(neon_s16_select_same_rank); +DECLARE_SELECT_KERNEL(neon_s32_select_same_rank); +DECLARE_SELECT_KERNEL(neon_u8_select_same_rank); +DECLARE_SELECT_KERNEL(neon_u16_select_same_rank); +DECLARE_SELECT_KERNEL(neon_u32_select_same_rank); +DECLARE_SELECT_KERNEL(neon_f16_select_same_rank); +DECLARE_SELECT_KERNEL(neon_f32_select_same_rank); + +DECLARE_SELECT_KERNEL(neon_s8_select_not_same_rank); +DECLARE_SELECT_KERNEL(neon_s16_select_not_same_rank); +DECLARE_SELECT_KERNEL(neon_s32_select_not_same_rank); +DECLARE_SELECT_KERNEL(neon_u8_select_not_same_rank); +DECLARE_SELECT_KERNEL(neon_u16_select_not_same_rank); +DECLARE_SELECT_KERNEL(neon_u32_select_not_same_rank); +DECLARE_SELECT_KERNEL(neon_f16_select_not_same_rank); +DECLARE_SELECT_KERNEL(neon_f32_select_not_same_rank); + +#undef DECLARE_RANGE_KERNEL + +} // namespace cpu +} // namespace arm_compute +#endif //SRC_CORE_NEON_KERNELS_SELECT_LIST_H
\ No newline at end of file diff --git a/src/cpu/kernels/softmax/generic/neon/fp16.cpp b/src/cpu/kernels/softmax/generic/neon/fp16.cpp new file mode 100644 index 0000000000..425fcf7ac6 --- /dev/null +++ b/src/cpu/kernels/softmax/generic/neon/fp16.cpp @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2021-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +#include "arm_compute/core/Helpers.h" + +#include "src/cpu/CpuTypes.h" +#include "src/cpu/kernels/softmax/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ + +template <bool IS_LOG> +void neon_fp16_softmax(const ITensor *in, + void *const tmp, + ITensor *out, + const float beta, + int axis, + const Window &window, + const float *lut_ptr) +{ + ARM_COMPUTE_UNUSED(lut_ptr); + if (axis == 0) + { + return neon_softmax_x_float<float16_t, IS_LOG>(in, tmp, out, beta, axis, window); + } + else + { + return neon_softmax_non_x_float<float16_t, IS_LOG>(in, tmp, out, beta, axis, window); + } +} + +template void neon_fp16_softmax<true>(const ITensor *in, + void *const tmp, + ITensor *out, + const float beta, + int axis, + const Window &window, + const float *lut_ptr); +template void neon_fp16_softmax<false>(const ITensor *in, + void *const tmp, + ITensor *out, + const float beta, + int axis, + const Window &window, + const float *lut_ptr); + +} // namespace cpu +} // namespace arm_compute +#endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) diff --git a/src/cpu/kernels/softmax/generic/neon/fp32.cpp b/src/cpu/kernels/softmax/generic/neon/fp32.cpp new file mode 100644 index 0000000000..a64946eb74 --- /dev/null +++ b/src/cpu/kernels/softmax/generic/neon/fp32.cpp @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2021-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/Helpers.h" + +#include "src/cpu/kernels/softmax/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ + +template <bool IS_LOG> +void neon_fp32_softmax(const ITensor *in, + void *const tmp, + ITensor *out, + const float beta, + int axis, + const Window &window, + const float *lut_ptr) +{ + ARM_COMPUTE_UNUSED(lut_ptr); + if (axis == 0) + { + return neon_softmax_x_float<float, IS_LOG>(in, tmp, out, beta, axis, window); + } + else + { + return neon_softmax_non_x_float<float, IS_LOG>(in, tmp, out, beta, axis, window); + } +} + +template void neon_fp32_softmax<true>(const ITensor *in, + void *const tmp, + ITensor *out, + const float beta, + int axis, + const Window &window, + const float *lut_ptr); +template void neon_fp32_softmax<false>(const ITensor *in, + void *const tmp, + ITensor *out, + const float beta, + int axis, + const Window &window, + const float *lut_ptr); + +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/neon/impl.cpp b/src/cpu/kernels/softmax/generic/neon/impl.cpp new file mode 100644 index 0000000000..31baf8a9df --- /dev/null +++ b/src/cpu/kernels/softmax/generic/neon/impl.cpp @@ -0,0 +1,596 @@ +/* + * Copyright (c) 2021-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
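The fp16/fp32 dispatchers above and the quantized implementation that follows all perform the same three passes per row along the softmax axis: a max reduction for numerical stability, an exponentiate-and-accumulate pass that writes intermediates into the tmp scratch buffer, and a normalisation pass in which the log variant subtracts log(sum) instead of dividing. A scalar sketch of those passes follows; the function name is hypothetical, and the quantized path's dequantisation, its folding of beta into scale_beta, and its 256.f / sum output scaling are deliberately left out.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

void softmax_row(const float *in, float *out, int n, float beta, bool is_log)
{
    // Pass 1: find the row maximum so that every exponent is <= 0
    const float max_val = *std::max_element(in, in + n);

    // Pass 2: exponentiate and accumulate the sum, keeping intermediates in a scratch buffer
    std::vector<float> tmp(static_cast<std::size_t>(n));
    float sum = 0.f;
    for (int i = 0; i < n; ++i)
    {
        const float z = (in[i] - max_val) * beta;
        tmp[i] = is_log ? z : std::exp(z);
        sum += std::exp(z);
    }

    // Pass 3: normalise (log-softmax subtracts log(sum) instead of dividing)
    const float norm = is_log ? std::log(sum) : 1.f / sum;
    for (int i = 0; i < n; ++i)
    {
        out[i] = is_log ? (tmp[i] - norm) : (tmp[i] * norm);
    }
}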
+ */ +#include "src/cpu/kernels/softmax/generic/neon/impl.h" + +#include "support/SaturateCast.h" + +namespace arm_compute +{ +namespace cpu +{ +template <typename T, bool IS_LOG> +void neon_softmax_x_quantized( + const ITensor *in, void *const tmp, ITensor *out, float beta, int axis, const Window &window) +{ + ARM_COMPUTE_UNUSED(axis); + + static_assert(std::is_same<T, qasymm8_t>::value || std::is_same<T, qasymm8_signed_t>::value, + "quantized type should be either qasymm8_t or qasymm8_signed_t."); + + const int input_width = in->info()->valid_region().shape.x(); + + const float scale_beta = -beta * in->info()->quantization_info().uniform().scale; + const float32x4_t scale_beta_vec = vdupq_n_f32(scale_beta); + + Iterator in_it(in, window); + Iterator out_it(out, window); + + constexpr int vec_size = 16; + +#ifndef __aarch64__ + const int sum_stages = log2(vec_size >> 1); +#endif // __aarch64__ + + using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>; + + execute_window_loop( + window, + [&](const Coordinates &) + { + /* Get pointers */ + const T *in_ptr = reinterpret_cast<const T *>(in_it.ptr()); + T *out_ptr = reinterpret_cast<T *>(out_it.ptr()); + float *tmp_ptr = reinterpret_cast<float *>(tmp); + + T max_val; + + /* Compute Max */ + { + // Init max value + auto vec_max = wrapper::vdup_n(support::cpp11::lowest<T>(), ExactTagType{}); + int x = 0; + + for (; x <= (input_width - vec_size); x += vec_size) + { + const auto current_value = wrapper::vloadq(in_ptr + x); + vec_max = wrapper::vmax(vec_max, current_value); + } + +#ifdef __aarch64__ + max_val = wrapper::vmaxv(vec_max); +#else // __aarch64__ + auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max)); + + for (int i = 0; i < sum_stages; ++i) + { + carry_max = wrapper::vpmax(carry_max, carry_max); + } + + max_val = wrapper::vgetlane(carry_max, 0); +#endif // __aarch64__ + + // Compute left-over elements + for (; x < input_width; ++x) + { + max_val = std::max(*(in_ptr + x), max_val); + } + } // Compute Max + + float sum_transformed{}; + + /* Compute exponentials and sum */ + { + /* Get max value */ + const auto vec_max = wrapper::vdup_n(max_val, wrapper::traits::vector_128_tag{}); + + /* Init sum to zero */ + float32x4x4_t vec_sum = { + vdupq_n_f32(0.f), + vdupq_n_f32(0.f), + vdupq_n_f32(0.f), + vdupq_n_f32(0.f), + }; + + /* Loop over row and compute exponentials and sum */ + int x = 0; + for (; x <= (input_width - vec_size); x += vec_size) + { + auto vec_elements = wrapper::vloadq(in_ptr + x); + vec_elements = wrapper::vqsub(vec_max, vec_elements); + float32x4x4_t vec_elements_flt = convert_int_to_float<float32x4x4_t>(vec_elements); + + if (IS_LOG) + { + vec_elements_flt.val[0] = vmulq_f32(vec_elements_flt.val[0], scale_beta_vec); + vec_elements_flt.val[1] = vmulq_f32(vec_elements_flt.val[1], scale_beta_vec); + vec_elements_flt.val[2] = vmulq_f32(vec_elements_flt.val[2], scale_beta_vec); + vec_elements_flt.val[3] = vmulq_f32(vec_elements_flt.val[3], scale_beta_vec); + vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vexpq_f32(vec_elements_flt.val[0])); + vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vexpq_f32(vec_elements_flt.val[1])); + vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vexpq_f32(vec_elements_flt.val[2])); + vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vexpq_f32(vec_elements_flt.val[3])); + } + else + { + vec_elements_flt.val[0] = vexpq_f32(vmulq_f32(vec_elements_flt.val[0], scale_beta_vec)); + vec_elements_flt.val[1] = 
vexpq_f32(vmulq_f32(vec_elements_flt.val[1], scale_beta_vec)); + vec_elements_flt.val[2] = vexpq_f32(vmulq_f32(vec_elements_flt.val[2], scale_beta_vec)); + vec_elements_flt.val[3] = vexpq_f32(vmulq_f32(vec_elements_flt.val[3], scale_beta_vec)); + vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vec_elements_flt.val[0]); + vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vec_elements_flt.val[1]); + vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vec_elements_flt.val[2]); + vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vec_elements_flt.val[3]); + } + + vst4q_f32(tmp_ptr + x, vec_elements_flt); + } + + /* Reduce sum */ + const float32x4_t sum_16_byte = + vaddq_f32(vaddq_f32(vec_sum.val[0], vec_sum.val[1]), vaddq_f32(vec_sum.val[2], vec_sum.val[3])); + + float sum; + +#ifdef __aarch64__ + sum = wrapper::vaddv(sum_16_byte); +#else // __aarch64__ + auto sum_res = vpadd_f32(vget_high_f32(sum_16_byte), vget_low_f32(sum_16_byte)); + sum_res = vpadd_f32(sum_res, sum_res); + sum = wrapper::vgetlane(sum_res, 0); +#endif // __aarch64__ + + /* Run remaining elements */ + for (; x < input_width; ++x) + { + float element{}; + if (IS_LOG) + { + element = (max_val - in_ptr[x]) * scale_beta; + sum += std::exp(element); + } + else + { + element = std::exp((max_val - in_ptr[x]) * scale_beta); + sum += element; + } + + tmp_ptr[x] = element; + } + + if (!IS_LOG) + { + sum_transformed = 256.f / sum; + } + else + { + sum_transformed = std::log(sum); + } + } // Compute exponentials and sum + + /* Normalize exponentials */ + { + constexpr bool is_qasymm8_signed = std::is_same<T, qasymm8_signed_t>::value; + + const float32x4_t sum_vec = vdupq_n_f32(sum_transformed); + + /* Loop over row and compute softmax */ + int x = 0; + for (; x <= (input_width - vec_size); x += vec_size) + { + using int_vec_type = wrapper::traits::neon_vector_t<T, 16>; + float32x4x4_t vec_in = vld4q_f32(tmp_ptr + x); + int_vec_type normalized_value{}; + if (IS_LOG) + { + const float32x4x4_t sub = { + vsubq_f32(vec_in.val[0], sum_vec), + vsubq_f32(vec_in.val[1], sum_vec), + vsubq_f32(vec_in.val[2], sum_vec), + vsubq_f32(vec_in.val[3], sum_vec), + }; + normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(sub); + } + else + { + float32x4x4_t mul = { + vmulq_f32(vec_in.val[0], sum_vec), + vmulq_f32(vec_in.val[1], sum_vec), + vmulq_f32(vec_in.val[2], sum_vec), + vmulq_f32(vec_in.val[3], sum_vec), + }; + + if (is_qasymm8_signed) + { + const auto offset_vec = wrapper::vdup_n(128.f, wrapper::traits::vector_128_tag{}); + mul.val[0] = wrapper::vsub(mul.val[0], offset_vec); + mul.val[1] = wrapper::vsub(mul.val[1], offset_vec); + mul.val[2] = wrapper::vsub(mul.val[2], offset_vec); + mul.val[3] = wrapper::vsub(mul.val[3], offset_vec); + } + + normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(mul); + } + wrapper::vstore(out_ptr + x, normalized_value); + } + /* Run remaining elements */ + for (; x < input_width; ++x) + { + if (IS_LOG) + { + out_ptr[x] = utils::cast::saturate_cast<T>(tmp_ptr[x] - sum_transformed); + } + else + { + out_ptr[x] = utils::cast::saturate_cast<T>((tmp_ptr[x] * sum_transformed) - + (is_qasymm8_signed ? 
128.f : 0)); + } + } + } // Normalize exponentials + }, + in_it, out_it); +} + +template <typename T, bool IS_LOG> +void neon_softmax_non_x_quantized( + const ITensor *in, void *const tmp, ITensor *out, float beta, int axis, const Window &window) +{ + static_assert(std::is_same<T, qasymm8_t>::value || std::is_same<T, qasymm8_signed_t>::value, + "quantized type should be either qasymm8_t or qasymm8_signed_t."); + + const float scale_beta = -beta * in->info()->quantization_info().uniform().scale; + const float32x4_t scale_beta_vec = vdupq_n_f32(scale_beta); + + Iterator in_it(in, window); + Iterator out_it(out, window); + + /** SIMD vector tag type. */ + using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>; + + constexpr int vec_size = 16; + const ITensorInfo *in_info = in->info(); + const ITensorInfo *out_info = out->info(); + const int x_width = in_info->valid_region().shape.x(); + const int in_axis_stride = in_info->strides_in_bytes()[axis]; + const int out_axis_stride = out_info->strides_in_bytes()[axis]; + const int tmp_axis_stride = in_axis_stride; + const int axis_width = in_info->dimension(axis); + const int end_actual = std::min(window[0].end(), x_width); + + execute_window_loop( + window, + [&](const Coordinates &winCoords) + { + const bool vector_exceeds_bounds = ((winCoords[0] + vec_size) > end_actual); + + int num_remaining = (end_actual - winCoords[0]); + int num_remaining_full = num_remaining / 4; + int num_remaining_partial = num_remaining % 4; + + /* Get pointers */ + const uint8_t *in_ptr = in_it.ptr(); + uint8_t *out_ptr = out_it.ptr(); + uint8_t *tmp_ptr = reinterpret_cast<uint8_t *>(tmp); + + auto vec_max = wrapper::vdup_n(support::cpp11::lowest<T>(), ExactTagType{}); + + /* Compute Max */ + { + if (!vector_exceeds_bounds) + { + int i = 0; + for (; i < axis_width; ++i) + { + const auto current_value = + wrapper::vloadq((i * in_axis_stride) + reinterpret_cast<const T *>(in_ptr)); + vec_max = wrapper::vmax(vec_max, current_value); + } + } + else + { + int i = 0; + for (; i < axis_width; ++i) + { + const T *const base_ptr_in = ((i * in_axis_stride) + reinterpret_cast<const T *>(in_ptr)); + int j = 0; + for (; j < num_remaining; ++j) + { + const T current_value = *(base_ptr_in + j); + vec_max[j] = std::max(vec_max[j], current_value); + } + } + } + } // Compute Max + + float32x4x4_t vec_sum_transformed = { + vdupq_n_f32(0.f), + vdupq_n_f32(0.f), + vdupq_n_f32(0.f), + vdupq_n_f32(0.f), + }; + + /* Compute exponentials and sum */ + { + /* Init sum to zero */ + float32x4x4_t vec_sum = vec_sum_transformed; + + auto vec_elements = wrapper::vdup_n(static_cast<T>(0), ExactTagType{}); + + float32x4x4_t vec_elements_flt; + + if (!vector_exceeds_bounds) + { + int i = 0; + for (; i < axis_width; ++i) + { + vec_elements = wrapper::vloadq((i * in_axis_stride) + reinterpret_cast<const T *>(in_ptr)); + vec_elements = wrapper::vqsub(vec_max, vec_elements); + vec_elements_flt = convert_int_to_float<float32x4x4_t>(vec_elements); + + if (IS_LOG) + { + vec_elements_flt.val[0] = vmulq_f32(vec_elements_flt.val[0], scale_beta_vec); + vec_elements_flt.val[1] = vmulq_f32(vec_elements_flt.val[1], scale_beta_vec); + vec_elements_flt.val[2] = vmulq_f32(vec_elements_flt.val[2], scale_beta_vec); + vec_elements_flt.val[3] = vmulq_f32(vec_elements_flt.val[3], scale_beta_vec); + vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vexpq_f32(vec_elements_flt.val[0])); + vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vexpq_f32(vec_elements_flt.val[1])); + vec_sum.val[2] = 
vaddq_f32(vec_sum.val[2], vexpq_f32(vec_elements_flt.val[2])); + vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vexpq_f32(vec_elements_flt.val[3])); + } + else + { + vec_elements_flt.val[0] = vexpq_f32(vmulq_f32(vec_elements_flt.val[0], scale_beta_vec)); + vec_elements_flt.val[1] = vexpq_f32(vmulq_f32(vec_elements_flt.val[1], scale_beta_vec)); + vec_elements_flt.val[2] = vexpq_f32(vmulq_f32(vec_elements_flt.val[2], scale_beta_vec)); + vec_elements_flt.val[3] = vexpq_f32(vmulq_f32(vec_elements_flt.val[3], scale_beta_vec)); + vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vec_elements_flt.val[0]); + vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vec_elements_flt.val[1]); + vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vec_elements_flt.val[2]); + vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vec_elements_flt.val[3]); + } + vst4q_f32((i * tmp_axis_stride) + reinterpret_cast<float *>(tmp_ptr), vec_elements_flt); + } + + auto vec_256 = wrapper::vdup_n(static_cast<float32_t>(256.f), ExactTagType{}); + if (!IS_LOG) + { + vec_sum_transformed.val[0] = wrapper::vdiv(vec_256, vec_sum.val[0]); + vec_sum_transformed.val[1] = wrapper::vdiv(vec_256, vec_sum.val[1]); + vec_sum_transformed.val[2] = wrapper::vdiv(vec_256, vec_sum.val[2]); + vec_sum_transformed.val[3] = wrapper::vdiv(vec_256, vec_sum.val[3]); + } + else + { + vec_sum_transformed.val[0] = wrapper::vlog(vec_sum.val[0]); + vec_sum_transformed.val[1] = wrapper::vlog(vec_sum.val[1]); + vec_sum_transformed.val[2] = wrapper::vlog(vec_sum.val[2]); + vec_sum_transformed.val[3] = wrapper::vlog(vec_sum.val[3]); + } + } + else + { + int i = 0; + for (; i < axis_width; ++i) + { + const T *const base_ptr_in = (i * in_axis_stride) + reinterpret_cast<const T *>(in_ptr); + auto vec_elements = wrapper::vdup_n(static_cast<T>(0), ExactTagType{}); + //vec_els is functionally redundant but is needed as a workaround for a toolchain bug. 
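+                    // The leftover elements are gathered into this zero-initialised buffer and then
+                    // copied lane by lane into vec_elements instead of being written into the NEON
+                    // register directly; only the first num_remaining results are stored back below.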
+ std::vector<T> vec_els(16); + + for (int k = 0; k < num_remaining_full; ++k) + { + for (int j = 0; j < 4; ++j) + { + vec_els[k * 4 + j] = *(base_ptr_in + (4 * k + j)); + } + } + for (int j = 0; j < num_remaining_partial; ++j) + { + vec_els[num_remaining_full * 4 + j] = *(base_ptr_in + (4 * num_remaining_full + j)); + } + for (int q = 0; q < 16; q++) + { + vec_elements[q] = vec_els[q]; + } + vec_elements = wrapper::vqsub(vec_max, vec_elements); + float32x4x4_t vec_elements_flt = convert_int_to_float<float32x4x4_t>(vec_elements); + + if (IS_LOG) + { + vec_elements_flt.val[0] = vmulq_f32(vec_elements_flt.val[0], scale_beta_vec); + vec_elements_flt.val[1] = vmulq_f32(vec_elements_flt.val[1], scale_beta_vec); + vec_elements_flt.val[2] = vmulq_f32(vec_elements_flt.val[2], scale_beta_vec); + vec_elements_flt.val[3] = vmulq_f32(vec_elements_flt.val[3], scale_beta_vec); + vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vexpq_f32(vec_elements_flt.val[0])); + vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vexpq_f32(vec_elements_flt.val[1])); + vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vexpq_f32(vec_elements_flt.val[2])); + vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vexpq_f32(vec_elements_flt.val[3])); + } + else + { + vec_elements_flt.val[0] = vexpq_f32(vmulq_f32(vec_elements_flt.val[0], scale_beta_vec)); + vec_elements_flt.val[1] = vexpq_f32(vmulq_f32(vec_elements_flt.val[1], scale_beta_vec)); + vec_elements_flt.val[2] = vexpq_f32(vmulq_f32(vec_elements_flt.val[2], scale_beta_vec)); + vec_elements_flt.val[3] = vexpq_f32(vmulq_f32(vec_elements_flt.val[3], scale_beta_vec)); + vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vec_elements_flt.val[0]); + vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vec_elements_flt.val[1]); + vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vec_elements_flt.val[2]); + vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vec_elements_flt.val[3]); + } + + float *const base_ptr_tmp = (i * tmp_axis_stride) + reinterpret_cast<float *>(tmp_ptr); + for (int k = 0; k < num_remaining_full; ++k) + { + for (int j = 0; j < 4; ++j) + { + *(base_ptr_tmp + (4 * k + j)) = vec_elements_flt.val[k][j]; + } + } + + for (int j = 0; j < num_remaining_partial; ++j) + { + *(base_ptr_tmp + (4 * num_remaining_full + j)) = + vec_elements_flt.val[num_remaining_full][j]; + } + } + + auto vec_256 = wrapper::vdup_n(static_cast<float32_t>(256), ExactTagType{}); + if (!IS_LOG) + { + vec_sum_transformed.val[0] = wrapper::vdiv(vec_256, vec_sum.val[0]); + vec_sum_transformed.val[1] = wrapper::vdiv(vec_256, vec_sum.val[1]); + vec_sum_transformed.val[2] = wrapper::vdiv(vec_256, vec_sum.val[2]); + vec_sum_transformed.val[3] = wrapper::vdiv(vec_256, vec_sum.val[3]); + } + else + { + vec_sum_transformed.val[0] = wrapper::vlog(vec_sum.val[0]); + vec_sum_transformed.val[1] = wrapper::vlog(vec_sum.val[1]); + vec_sum_transformed.val[2] = wrapper::vlog(vec_sum.val[2]); + vec_sum_transformed.val[3] = wrapper::vlog(vec_sum.val[3]); + } + } + } // Compute exponentials and sum + + /* Normalize exponentials */ + { + constexpr bool is_qasymm8_signed = std::is_same<T, qasymm8_signed_t>::value; + if (!vector_exceeds_bounds) + { + int i = 0; + for (; i < axis_width; ++i) + { + using int_vec_type = wrapper::traits::neon_vector_t<T, 16>; + float32x4x4_t vec_in = vld4q_f32((i * tmp_axis_stride) + reinterpret_cast<float *>(tmp_ptr)); + + int_vec_type normalized_value{}; + + if (IS_LOG) + { + const float32x4x4_t sub = { + vsubq_f32(vec_in.val[0], vec_sum_transformed.val[0]), + vsubq_f32(vec_in.val[1], vec_sum_transformed.val[1]), + vsubq_f32(vec_in.val[2], 
vec_sum_transformed.val[2]), + vsubq_f32(vec_in.val[3], vec_sum_transformed.val[3]), + }; + normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(sub); + } + else + { + float32x4x4_t mul = { + vmulq_f32(vec_in.val[0], vec_sum_transformed.val[0]), + vmulq_f32(vec_in.val[1], vec_sum_transformed.val[1]), + vmulq_f32(vec_in.val[2], vec_sum_transformed.val[2]), + vmulq_f32(vec_in.val[3], vec_sum_transformed.val[3]), + }; + + if (is_qasymm8_signed) + { + const auto offset_vec = wrapper::vdup_n(128.f, wrapper::traits::vector_128_tag{}); + mul.val[0] = wrapper::vsub(mul.val[0], offset_vec); + mul.val[1] = wrapper::vsub(mul.val[1], offset_vec); + mul.val[2] = wrapper::vsub(mul.val[2], offset_vec); + mul.val[3] = wrapper::vsub(mul.val[3], offset_vec); + } + + normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(mul); + } + wrapper::vstore((i * out_axis_stride) + reinterpret_cast<T *>(out_ptr), normalized_value); + } + } + else + { + int i = 0; + for (; i < axis_width; ++i) + { + T *const base_ptr_out = (i * out_axis_stride) + reinterpret_cast<T *>(out_ptr); + float *const base_ptr_tmp = (i * tmp_axis_stride) + reinterpret_cast<float *>(tmp_ptr); + if (IS_LOG) + { + for (int k = 0; k < num_remaining_full; ++k) + { + for (int j = 0; j < 4; ++j) + { + *(base_ptr_out + (4 * k + j)) = utils::cast::saturate_cast<T>( + (*(base_ptr_tmp + (4 * k + j)) - vec_sum_transformed.val[k][j])); + } + } + for (int j = 0; j < num_remaining_partial; ++j) + { + *(base_ptr_out + (4 * num_remaining_full + j)) = + utils::cast::saturate_cast<T>(*(base_ptr_tmp + (4 * num_remaining_full + j)) - + vec_sum_transformed.val[num_remaining_full][j]); + } + } + else + { + for (int k = 0; k < num_remaining_full; ++k) + { + for (int j = 0; j < 4; ++j) + { + *(base_ptr_out + (4 * k + j)) = utils::cast::saturate_cast<T>( + *(base_ptr_tmp + (4 * k + j)) * vec_sum_transformed.val[k][j] - + (is_qasymm8_signed ? 128.f : 0)); + } + } + for (int j = 0; j < num_remaining_partial; ++j) + { + *(base_ptr_out + (4 * num_remaining_full + j)) = + utils::cast::saturate_cast<T>(*(base_ptr_tmp + (4 * num_remaining_full + j)) * + vec_sum_transformed.val[num_remaining_full][j] - + (is_qasymm8_signed ? 
128.f : 0)); + } + } + } + } + } // Normalize exponentials + }, + in_it, out_it); +} + +template void neon_softmax_x_quantized<qasymm8_signed_t, true>( + const ITensor *in, void *const tmp, ITensor *out, float beta, int axis, const Window &window); + +template void neon_softmax_x_quantized<qasymm8_signed_t, false>( + const ITensor *in, void *const tmp, ITensor *out, float beta, int axis, const Window &window); + +template void neon_softmax_x_quantized<qasymm8_t, true>( + const ITensor *in, void *const tmp, ITensor *out, float beta, int axis, const Window &window); + +template void neon_softmax_x_quantized<qasymm8_t, false>( + const ITensor *in, void *const tmp, ITensor *out, float beta, int axis, const Window &window); + +template void neon_softmax_non_x_quantized<qasymm8_signed_t, true>( + const ITensor *in, void *const tmp, ITensor *out, float beta, int axis, const Window &window); + +template void neon_softmax_non_x_quantized<qasymm8_signed_t, false>( + const ITensor *in, void *const tmp, ITensor *out, float beta, int axis, const Window &window); + +template void neon_softmax_non_x_quantized<qasymm8_t, true>( + const ITensor *in, void *const tmp, ITensor *out, float beta, int axis, const Window &window); + +template void neon_softmax_non_x_quantized<qasymm8_t, false>( + const ITensor *in, void *const tmp, ITensor *out, float beta, int axis, const Window &window); +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/neon/impl.h b/src/cpu/kernels/softmax/generic/neon/impl.h new file mode 100644 index 0000000000..e417271d0e --- /dev/null +++ b/src/cpu/kernels/softmax/generic/neon/impl.h @@ -0,0 +1,428 @@ +/* + * Copyright (c) 2021-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_CPU_KERNELS_SOFTMAX_GENERIC_NEON_IMPL_H +#define ACL_SRC_CPU_KERNELS_SOFTMAX_GENERIC_NEON_IMPL_H + +#include "arm_compute/core/Helpers.h" + +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" + +namespace arm_compute +{ +namespace cpu +{ + +#ifdef __aarch64__ +namespace +{ +// These helper functions are added because vaddv does not exist for fp16, +// and, therefore, is not part of the wrapper::vaddv interface. 
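+// The fp16 overload below reduces the vector with repeated pairwise additions (vpadd),
+// halving the number of partial sums at each of the sum_stages steps before extracting
+// lane 0, while the fp32 overload simply forwards to wrapper::vaddv.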
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +inline float16_t wrapper_vaddv(const float16x8_t &a, int sum_stages) +{ + auto sum_res = wrapper::vpadd(wrapper::vgethigh(a), wrapper::vgetlow(a)); + for (int i = 0; i < sum_stages; ++i) + { + sum_res = wrapper::vpadd(sum_res, sum_res); + } + return wrapper::vgetlane(sum_res, 0); +} +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +inline float wrapper_vaddv(const float32x4_t &a, int sum_stages) +{ + ARM_COMPUTE_UNUSED(sum_stages); + return wrapper::vaddv(a); +} +} // namespace +#endif // __aarch64__ + +// The template implementation for float data types is stored in the header file because +// we need all fp16 instantiated code to live in fp16.cpp files. +template <typename T, bool IS_LOG> +void neon_softmax_x_float(const ITensor *in, void *const tmp, ITensor *out, float beta, int axis, const Window &window) +{ + ARM_COMPUTE_UNUSED(axis); + ARM_COMPUTE_UNUSED(tmp); + + const int input_width = in->info()->valid_region().shape.x(); + + Iterator in_it(in, window); + Iterator out_it(out, window); + + /** SIMD vector tag type. */ + using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>; + + constexpr int vec_size = 16 / sizeof(T); + + const int sum_stages = log2(vec_size >> 1); + + const auto beta_vec = wrapper::vdup_n(static_cast<T>(beta), ExactTagType{}); + + execute_window_loop( + window, + [&](const Coordinates &) + { + /* Get pointers */ + const T *in_ptr = reinterpret_cast<const T *>(in_it.ptr()); + T *out_ptr = reinterpret_cast<T *>(out_it.ptr()); + + T max_val; + + /* Compute Max */ + { + // Init max value + auto vec_max = wrapper::vdup_n(support::cpp11::lowest<T>(), ExactTagType{}); + int x = 0; + + for (; x <= (input_width - vec_size); x += vec_size) + { + const auto current_value = wrapper::vloadq(in_ptr + x); + vec_max = wrapper::vmax(vec_max, current_value); + } + +#ifdef __aarch64__ + max_val = wrapper::vmaxv(vec_max); +#else // __aarch64__ + auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max)); + + for (int i = 0; i < sum_stages; ++i) + { + carry_max = wrapper::vpmax(carry_max, carry_max); + } + + max_val = wrapper::vgetlane(carry_max, 0); +#endif // __aarch64__ + + // Compute left-over elements + for (; x < input_width; ++x) + { + max_val = std::max(*(in_ptr + x), max_val); + } + } // compute max + + T sum_transformed{}; + + /* Compute exponentials and sum */ + { + /* Get max value */ + const auto vec_max = wrapper::vdup_n(max_val, ExactTagType{}); + + /* Init sum to zero */ + auto vec_sum = wrapper::vdup_n(static_cast<T>(0), ExactTagType{}); + + /* Loop over row and compute exponentials and sum */ + int x = 0; + for (; x <= (input_width - vec_size); x += vec_size) + { + auto vec_elements = wrapper::vloadq(in_ptr + x); + vec_elements = wrapper::vsub(vec_elements, vec_max); + if (IS_LOG) + { + vec_elements = wrapper::vmul(vec_elements, beta_vec); + vec_sum = wrapper::vadd(vec_sum, wrapper::vexpq(vec_elements)); + } + else + { + vec_elements = wrapper::vexpq(wrapper::vmul(vec_elements, beta_vec)); + vec_sum = wrapper::vadd(vec_sum, vec_elements); + } + wrapper::vstore(out_ptr + x, vec_elements); + } + + /* Reduce sum */ + T sum{}; +#ifdef __aarch64__ + sum = wrapper_vaddv(vec_sum, sum_stages); +#else // __aarch64__ + auto sum_res = wrapper::vpadd(wrapper::vgethigh(vec_sum), wrapper::vgetlow(vec_sum)); + for (int i = 0; i < sum_stages; ++i) + { + sum_res = wrapper::vpadd(sum_res, sum_res); + } + sum = wrapper::vgetlane(sum_res, 0); +#endif // 
__aarch64__ + + /* Run remaining elements */ + for (; x < input_width; ++x) + { + T element{}; + + if (IS_LOG) + { + element = (in_ptr[x] - max_val) * beta; + sum += std::exp(element); + } + else + { + element = std::exp((in_ptr[x] - max_val) * beta); + sum += element; + } + + out_ptr[x] = element; + } + + if (!IS_LOG) + { + sum_transformed = T(1) / sum; + } + else + { + sum_transformed = static_cast<T>(std::log(sum)); + } + } // Compute exponentials and sum + + /* Normalize exponentials */ + { + const auto sum_vec = wrapper::vdup_n(static_cast<T>(sum_transformed), ExactTagType{}); + + /* Loop over row and compute softmax */ + int x = 0; + for (; x <= (input_width - vec_size); x += vec_size) + { + const auto vec_in = wrapper::vloadq(out_ptr + x); + if (IS_LOG) + { + wrapper::vstore(out_ptr + x, wrapper::vsub(vec_in, sum_vec)); + } + else + { + wrapper::vstore(out_ptr + x, wrapper::vmul(vec_in, sum_vec)); + } + } + + /* Run remaining elements */ + for (; x < input_width; ++x) + { + if (IS_LOG) + { + out_ptr[x] = out_ptr[x] - sum_transformed; + } + else + { + out_ptr[x] = out_ptr[x] * sum_transformed; + } + } + } // Normalize exponentials + }, + in_it, out_it); +} +template <typename T, bool IS_LOG> +void neon_softmax_non_x_float( + const ITensor *in, void *const tmp, ITensor *out, float beta, int axis, const Window &window) +{ + ARM_COMPUTE_UNUSED(tmp); + + Iterator in_it(in, window); + Iterator out_it(out, window); + + /** SIMD vector tag type. */ + using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>; + + const auto beta_vec = wrapper::vdup_n(static_cast<T>(beta), ExactTagType{}); + constexpr int vec_size = 16 / sizeof(T); + const ITensorInfo *in_info = in->info(); + const ITensorInfo *out_info = out->info(); + const int x_width = in_info->valid_region().shape.x(); + const unsigned int in_axis_stride = in_info->strides_in_bytes()[axis]; + const unsigned int out_axis_stride = out_info->strides_in_bytes()[axis]; + const int axis_width = in_info->dimension(axis); + + execute_window_loop( + window, + [&](const Coordinates &winCoords) + { + const bool vector_exceeds_bounds = (winCoords[0] + vec_size) > x_width; + + /* Get pointers */ + const uint8_t *in_ptr = in_it.ptr(); + uint8_t *out_ptr = out_it.ptr(); + + // Init max value + auto vec_max = wrapper::vdup_n(support::cpp11::lowest<T>(), ExactTagType{}); + + /* Compute Max */ + { + if (!vector_exceeds_bounds) + { + int i = 0; + for (; i < axis_width; ++i) + { + const auto current_value = + wrapper::vloadq(reinterpret_cast<const T *>((i * in_axis_stride) + in_ptr)); + vec_max = wrapper::vmax(vec_max, current_value); + } + } + else + { + int i = 0; + for (; i < axis_width; ++i) + { + const T *const base_ptr_in = reinterpret_cast<const T *>((i * in_axis_stride) + in_ptr); + int j = 0; + for (; j < (x_width - winCoords[0]); ++j) + { + const auto current_value = *(base_ptr_in + j); + vec_max[j] = std::max(vec_max[j], current_value); + } + } + } + } // compute max + + auto vec_sum_transformed = wrapper::vdup_n(static_cast<T>(0), ExactTagType{}); + + auto vec_elements = wrapper::vdup_n(static_cast<T>(0), ExactTagType{}); + /* Init sum to zero */ + auto vec_sum = wrapper::vdup_n(static_cast<T>(0), ExactTagType{}); + + /* Compute exponentials and sum */ + { + if (!vector_exceeds_bounds) + { + const auto vec_one = wrapper::vdup_n(static_cast<T>(1), ExactTagType{}); + /* Loop over row and compute exponentials and sum */ + int i = 0; + for (; i < axis_width; ++i) + { + vec_elements = 
wrapper::vloadq(reinterpret_cast<const T *>((i * in_axis_stride) + in_ptr)); + vec_elements = wrapper::vsub(vec_elements, vec_max); + if (IS_LOG) + { + vec_elements = wrapper::vmul(vec_elements, beta_vec); + vec_sum = wrapper::vadd(vec_sum, wrapper::vexpq(vec_elements)); + } + else + { + vec_elements = wrapper::vexpq(wrapper::vmul(vec_elements, beta_vec)); + vec_sum = wrapper::vadd(vec_sum, vec_elements); + } + + wrapper::vstore(reinterpret_cast<T *>((i * out_axis_stride) + out_ptr), vec_elements); + } + + if (!IS_LOG) + { + vec_sum_transformed = wrapper::vdiv(vec_one, vec_sum); + } + else + { + vec_sum_transformed = wrapper::vlog(vec_sum); + } + } + else + { + int i = 0; + for (; i < axis_width; ++i) + { + const T *const base_ptr_in = reinterpret_cast<const T *>((i * in_axis_stride) + in_ptr); + T *const base_ptr_out = reinterpret_cast<T *>((i * out_axis_stride) + out_ptr); + int j = 0; + for (; j < (x_width - winCoords[0]); ++j) + { + vec_elements[j] = *(base_ptr_in + j); + vec_elements[j] -= vec_max[j]; + if (IS_LOG) + { + vec_elements[j] *= beta; + vec_sum[j] += std::exp(vec_elements[j]); + } + else + { + vec_elements[j] = std::exp(vec_elements[j] * beta); + vec_sum[j] += vec_elements[j]; + } + *(base_ptr_out + j) = vec_elements[j]; + } + } + int j = 0; + for (; j < (x_width - winCoords[0]); ++j) + { + if (!IS_LOG) + { + vec_sum_transformed[j] = 1 / vec_sum[j]; + } + else + { + vec_sum_transformed[j] = std::log(vec_sum[j]); + } + } + } + } // Compute exponentials and sum + + /* Normalize exponentials */ + { + if (!vector_exceeds_bounds) + { + /* Loop over row and compute softmax */ + int i = 0; + for (; i < axis_width; ++i) + { + T *const base_ptr_out = reinterpret_cast<T *>((i * out_axis_stride) + out_ptr); + auto vec_in = wrapper::vloadq(base_ptr_out); + if (IS_LOG) + { + wrapper::vstore(base_ptr_out, wrapper::vsub(vec_in, vec_sum_transformed)); + } + else + { + wrapper::vstore(base_ptr_out, wrapper::vmul(vec_in, vec_sum_transformed)); + } + } + } + else + { + int i = 0; + for (; i < axis_width; ++i) + { + T *const base_ptr_out = reinterpret_cast<T *>((i * out_axis_stride) + out_ptr); + int j = 0; + for (; j < (x_width - winCoords[0]); ++j) + { + if (IS_LOG) + { + *(base_ptr_out + j) -= vec_sum_transformed[j]; + } + else + { + *(base_ptr_out + j) *= vec_sum_transformed[j]; + } + } + } + } + } // Normalize exponentials + }, + in_it, out_it); +} +template <typename T, bool IS_LOG> +void neon_softmax_x_quantized( + const ITensor *in, void *const tmp, ITensor *out, float beta, int axis, const Window &window); + +template <typename T, bool IS_LOG> +void neon_softmax_non_x_quantized( + const ITensor *in, void *const tmp, ITensor *out, float beta, int axis, const Window &window); +} // namespace cpu +} // namespace arm_compute + +#endif // ACL_SRC_CPU_KERNELS_SOFTMAX_GENERIC_NEON_IMPL_H diff --git a/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp b/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp new file mode 100644 index 0000000000..369f9bb005 --- /dev/null +++ b/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2021-2024 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/Helpers.h" + +#include "src/cpu/kernels/softmax/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +template <bool IS_LOG> +void neon_qasymm8_softmax(const ITensor *in, + void *const tmp, + ITensor *out, + const float beta, + int axis, + const Window &window, + const float *lut_ptr) +{ + ARM_COMPUTE_UNUSED(lut_ptr); + if (axis == 0) + { + return neon_softmax_x_quantized<qasymm8_t, IS_LOG>(in, tmp, out, beta, axis, window); + } + else + { + return neon_softmax_non_x_quantized<qasymm8_t, IS_LOG>(in, tmp, out, beta, axis, window); + } +} + +template void neon_qasymm8_softmax<true>(const ITensor *in, + void *const tmp, + ITensor *out, + const float beta, + int axis, + const Window &window, + const float *lut_ptr); +template void neon_qasymm8_softmax<false>(const ITensor *in, + void *const tmp, + ITensor *out, + const float beta, + int axis, + const Window &window, + const float *lut_ptr); + +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp new file mode 100644 index 0000000000..594ceb7654 --- /dev/null +++ b/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2021-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/Helpers.h" + +#include "src/cpu/kernels/softmax/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +template <bool IS_LOG> +void neon_qasymm8_signed_softmax(const ITensor *in, + void *const tmp, + ITensor *out, + const float beta, + int axis, + const Window &window, + const float *lut_ptr) +{ + ARM_COMPUTE_UNUSED(lut_ptr); + if (axis == 0) + { + return neon_softmax_x_quantized<qasymm8_signed_t, IS_LOG>(in, tmp, out, beta, axis, window); + } + else + { + return neon_softmax_non_x_quantized<qasymm8_signed_t, IS_LOG>(in, tmp, out, beta, axis, window); + } +} + +template void neon_qasymm8_signed_softmax<true>(const ITensor *in, + void *const tmp, + ITensor *out, + const float beta, + int axis, + const Window &window, + const float *lut_ptr); +template void neon_qasymm8_signed_softmax<false>(const ITensor *in, + void *const tmp, + ITensor *out, + const float beta, + int axis, + const Window &window, + const float *lut_ptr); + +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sme2/fp16.cpp b/src/cpu/kernels/softmax/generic/sme2/fp16.cpp new file mode 100644 index 0000000000..e70c9f4793 --- /dev/null +++ b/src/cpu/kernels/softmax/generic/sme2/fp16.cpp @@ -0,0 +1,781 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifdef ARM_COMPUTE_ENABLE_SME2 + +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Window.h" + +namespace arm_compute +{ +namespace cpu +{ + +// SoftMax +// +// Steps: +// * Find max: max_value = max(src) +// * Regularize: dst[i] = exp(src[i] - max_value) +// sum_value = sum(dst) +// * Normalize: dst[i] = dst[i] / sum_value +void sme2_f16_softmax_kernel( // + const float16_t *src, + float16_t *dst, + float beta, + const uintptr_t shape[4], + const uintptr_t src_strides[4], + const uintptr_t dst_strides[4]) +{ + __asm__ volatile( + R"( + .inst 0xd503477f // smstart + + // Registers + // + // * x9: temporary, index + // * x10: temporary, -inf + // * x11: temporary, 0 + // * x12: temporary, 1.0f + // * x13: temporary, body_length + // + // * x20: index_3 + // * x21: src_3 + // * x22: dst_3 + // * x23: index_2 + // * x24: src_2 + // * x25: dst_2 + // * x26: index_1 + // * x27: src_1 + // * x28: dst_1 + // + // * z0: c1 + // * z1: c2 + // * z2: c3 + // * z3: c4 + // * z4: c5 + // * z5: shift + // * z6: inv_ln2 + // * z7: neg_ln2_hi + // * z8: neg_ln2_lo + // * z9: min_input + // * z10: 23, 0 + // * z11: max_value + // * z12-z15: x, x_fp32_lower_halves, r_hi, r, r2 + // * z16-z19: max_value, shift, z, scale, poly + // * z20-z21: n, p1, p12345 + // * z22-z23: n, p23, p2345 + // * z24-z25: p45 + // * z26: beta + // * z28-z31: sum_value, x_fp32_upper_halves + // + // * za0-za3: sum_value + // + // * p0: all-true + // * p1: left-over predicate for find-max & normalize loops + // * p2-p4: left-over predicates for regularize loop + // * p4-p7: underflow in vector loop + // * p5-p6: underflow in leftover loop + // * + // * pn9: all-true + + // Prepares all constant values + + ptrue p0.b + .inst 0x25207811 // ptrue pn9.b + + mov w9, #0xfff6 // c1: 0x1.ffffecp-1f = 0x3f7ffff6 + mov w10, #0xfedb // c2: 0x1.fffdb6p-2f = 0x3efffedb + mov w11, #0xaf33 // c3: 0x1.555e66p-3f = 0x3e2aaf33 + mov w12, #0x9f17 // c4: 0x1.573e2ep-5f = 0x3d2b9f17 + mov w13, #0x2010 // c5: 0x1.0e4020p-7f = 0x3c072010 + + movk w9, #0x3f7f, LSL #16 // c1: 0x1.ffffecp-1f = 0x3f7ffff6 + movk w10, #0x3eff, LSL #16 // c2: 0x1.fffdb6p-2f = 0x3efffedb + movk x11, #0x3e2a, LSL #16 // c3: 0x1.555e66p-3f = 0x3e2aaf33 + movk w12, #0x3d2b, LSL #16 // c4: 0x1.573e2ep-5f = 0x3d2b9f17 + movk w13, #0x3c07, LSL #16 // c5: 0x1.0e4020p-7f = 0x3c072010 + + dup z0.s, w9 // c1. + dup z1.s, w10 // c2. + dup z2.s, w11 // c3. + dup z3.s, w12 // c4. + dup z4.s, w13 // c5. 
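+        // c1-c5 are the coefficients of the degree-5 polynomial approximating exp(r) - 1 on the
+        // reduced argument r (range reduction x = n * ln(2) + r); the exponential is rebuilt
+        // further down as poly = scale + p12345 * scale, i.e. 2^n * (1 + p(r)).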
+ + mov w9, #0x007f // shift: 2^23 + 127 = 0x1.0000fep23f = 0x4b00007f + mov w10, #0xaa3b // inv_ln2: 1 / ln(2) = 0x1.715476p+0f = 0x3fb8aa3b + mov w11, #0x7200 // neg_ln2_hi: -ln(2) from bits -1 to -19 = -0x1.62e400p-1f = 0xbf317200 + mov w12, #0xbe8e // neg_ln2_lo: -ln(2) from bits -20 to -42 = -0x1.7f7d1cp-20f = 0xb5bfbe8e + mov w13, #0x47ae // min_input (Approximately ln 2^-125): -86.64 = 0xc2ad47ae + + movk w9, #0x4b00, LSL #16 // shift: 2^23 + 127 = 0x1.0000fep23f = 0x4b00007f + movk w10, #0x3fb8, LSL #16 // inv_ln2: 1 / ln(2) = 0x1.715476p+0f = 0x3fb8aa3b + movk w11, #0xbf31, LSL #16 // neg_ln2_hi: -ln(2) from bits -1 to -19 = -0x1.62e400p-1f = 0xbf317200 + movk w12, #0xb5bf, LSL #16 // neg_ln2_lo: -ln(2) from bits -20 to -42 = -0x1.7f7d1cp-20f = 0xb5bfbe8e + movk w13, #0xc2ad, LSL #16 // min_input (Approximately ln 2^-125): -86.64 = 0xc2ad47ae + + dup z5.s, w9 // shift + dup z6.s, w10 // inv_ln2 + dup z7.s, w11 // neg_ln2_hi + dup z8.s, w12 // neg_ln2_lo + dup z9.s, w13 // min_input + + dup z26.s, %w[beta] // beta + fcvt h26, s26 + dup z26.h, z26.h[0] + + mov w10, #0xfc00 // -inf: 0xfc00 for fp16 + + mov w11, #0 // 0 + + // ---------------------------------------------------------------- x13: body_length = (length / vl) * vl + cnth x13, ALL, MUL #4 + udiv x9, %x[length], x13 + mul x13, x13, x9 + + // ================================================== + // 3D loop opening + // ================================================== + + mov x20, %x[shape_3] + mov x21, %x[src] + mov x22, %x[dst] + +loop_3_start%=: + // for index_3 in shape_3 downto 1 + cmp x20, #0 + b.eq loop_3_end%= + sub x20, x20, #1 + + mov x23, %x[shape_2] + mov x24, x21 + mov x25, x22 + +loop_2_start%=: + // for index_2 in shape_2 downto 1 + cmp x23, #0 + b.eq loop_2_end%= + sub x23, x23, #1 + + mov x26, %x[shape_1] + mov x27, x24 + mov x28, x25 + +loop_1_start%=: + // for index_1 in shape_2 downto 1 + cmp x26, #0 + b.eq loop_1_end%= + sub x26, x26, #1 + + // ================================================== + // Step 1: Find max + // ================================================== + + // ---------------------------------------------------------------- z16-z19: max_value = -inf + dup z16.h, w10 + dup z17.h, w10 + dup z18.h, w10 + dup z19.h, w10 + + // Loop for processing 4 vectors per iteration. + mov x9, #0 // x9: index + dup z11.h, w10 // z11: max_value = -inf + +find_max_body_start%=: + cmp x9, x13 + b.eq find_max_body_end%= + + .inst 0xa009a76c // ld1h {z12.h-z15.h}, pn9/z, [x27, x9, LSL #1] // z12-z15: x + .inst 0xc16cb910 // fmax {z16.h-z19.h}, {z16.h-z19.h}, {z12.h-z15.h} // z16-z19: max_value = max(max_value, x) + + inch x9, ALL, MUL #4 + b find_max_body_start%= +find_max_body_end%=: + + // Loop for processing the leftover part. +find_max_leftover_start%=: + whilelo p1.h, x9, %x[length] + b.none find_max_leftover_end%= + + ld1h z12.h, p1/z, [x27, x9, LSL #1] // z12: x + fmax z16.h, p1/m, z16.h, z12.h // z16: max_value = max(max_value, x) + + inch x9 + b find_max_leftover_start%= +find_max_leftover_end%=: + + // ---------------------------------------------------------------- z16: max_value + .inst 0xc172b110 // fmax {z16.h-z17.h}, {z16.h-z17.h}, {z18.s-z19.h} + fmax z16.h, p0/m, z16.h, z17.h + fmaxv h16, p0, z16.h + + // ---------------------------------------------------------------- z11: max_value + dup z11.h, z16.h[0] + + // ================================================== + // Step 2: Regularize, i.e. 
Calculate exp(x - max(x) + // ================================================== + + .inst 0xc00800ff // zero {za0.s, za1.s, za2.s, za3.s} za0-za3: sum_value (in fp32) + + // Loop for processing 4 vectors per iteration. + mov x9, #0 // ---------------------------------------------------- x9: index + +regularize_body_start%=: + cmp x9, x13 + b.eq regularize_body_end%= + + // Loads the input data to 4 consecutive registers ---------------- z12-z15: input_data + .inst 0xa009a76c // ld1h {z12.h-z15.h}, pn9/z, [x27, x9, LSL #1] // z12-z15: x + + // ---------------------------------------------------------------- z12-z15: x = input_data - max_value + fsub z12.h, z12.h, z11.h + fsub z13.h, z13.h, z11.h + fsub z14.h, z14.h, z11.h + fsub z15.h, z15.h, z11.h + + // ---------------------------------------------------------------- z12-z15: x = (input_data - max_value) * beta + fmul z12.h, z12.h, z26.h + fmul z13.h, z13.h, z26.h + fmul z14.h, z14.h, z26.h + fmul z15.h, z15.h, z26.h + + // ---------------------------------------------------------------- + // Convert fp16 values to fp32. This results in four more registers. + // z12 --> z12, z28 + fcvtlt z28.s, p0/m, z12.h + fcvt z12.s, p0/m, z12.h + + // z13 --> z13, z29 + fcvtlt z29.s, p0/m, z13.h + fcvt z13.s, p0/m, z13.h + + // z14 --> z14, z30 + fcvtlt z30.s, p0/m, z14.h + fcvt z14.s, p0/m, z14.h + + // z15 --> z15, z31 + fcvtlt z31.s, p0/m, z15.h + fcvt z15.s, p0/m, z15.h + + // ---------------------------------------------------------------- + // Process z12-z15 + // ---------------------------------------------------------------- + // ---------------------------------------------------------------- z16-z19: shift + mov z16.d, z5.d + mov z17.d, z5.d + mov z18.d, z5.d + mov z19.d, z5.d + + // ---------------------------------------------------------------- p4-p7: underflow = x < min_input + fcmlt p4.s, p0/z, z12.s, z9.s + fcmlt p5.s, p0/z, z13.s, z9.s + fcmlt p6.s, p0/z, z14.s, z9.s + fcmlt p7.s, p0/z, z15.s, z9.s + + // ---------------------------------------------------------------- z16-z19: z = shift + x * inv_ln2 + fmla z16.s, p0/m, z12.s, z6.s + fmla z17.s, p0/m, z13.s, z6.s + fmla z18.s, p0/m, z14.s, z6.s + fmla z19.s, p0/m, z15.s, z6.s + + // ---------------------------------------------------------------- z20-z23: n = z - shift + fsub z20.s, z16.s, z5.s + fsub z21.s, z17.s, z5.s + fsub z22.s, z18.s, z5.s + fsub z23.s, z19.s, z5.s + + // ---------------------------------------------------------------- z12-z15: r_hi = x + n * neg_ln2_hi + fmla z12.s, p0/m, z20.s, z7.s + fmla z13.s, p0/m, z21.s, z7.s + fmla z14.s, p0/m, z22.s, z7.s + fmla z15.s, p0/m, z23.s, z7.s + + // ---------------------------------------------------------------- z12-z15: r = r_hi + n * neg_ln2_lo + fmla z12.s, p0/m, z20.s, z8.s + fmla z13.s, p0/m, z21.s, z8.s + fmla z14.s, p0/m, z22.s, z8.s + fmla z15.s, p0/m, z23.s, z8.s + + // ---------------------------------------------------------------- z16-z19: scale = z << 23 (2^n) + dup z10.s, #23 + urshl z16.s, p0/m, z16.s, z10.s + urshl z17.s, p0/m, z17.s, z10.s + urshl z18.s, p0/m, z18.s, z10.s + urshl z19.s, p0/m, z19.s, z10.s + + // Processes the first 2 vectors. 
(z12-z13) + + // ---------------------------------------------------------------- z20-z21: p1 = r * c1 + fmul z20.s, z12.s, z0.s + fmul z21.s, z13.s, z0.s + + // ---------------------------------------------------------------- z22-z23: p23 = c2 + mov z22.d, z1.d + mov z23.d, z1.d + + // ---------------------------------------------------------------- z22-z23: p23 = c2 + r * c3 + fmla z22.s, p0/m, z12.s, z2.s + fmla z23.s, p0/m, z13.s, z2.s + + // ---------------------------------------------------------------- z24-z35: c4 + mov z24.d, z3.d + mov z25.d, z3.d + + // ---------------------------------------------------------------- z24-z25: p45 = c4 + r * c5 + fmla z24.s, p0/m, z12.s, z4.s + fmla z25.s, p0/m, z13.s, z4.s + + // ---------------------------------------------------------------- z12-z13: r2 = r * r + fmul z12.s, z12.s, z12.s + fmul z13.s, z13.s, z13.s + + // ---------------------------------------------------------------- z22-z23: p2345 = p23 + r2 * p45 + fmla z22.s, p0/m, z12.s, z24.s + fmla z23.s, p0/m, z13.s, z25.s + + // ---------------------------------------------------------------- z20-z21: p12345 = p1 + r2 * p2345 + fmla z20.s, p0/m, z12.s, z22.s + fmla z21.s, p0/m, z13.s, z23.s + + // ---------------------------------------------------------------- z16-z17: poly = scale + p12345 * scale + fmla z16.s, p0/m, z20.s, z16.s + fmla z17.s, p0/m, z21.s, z17.s + + // Processes the last 2 vectors (z14-z15) + + // ---------------------------------------------------------------- z20-z21: p1 = r * c1 + fmul z20.s, z14.s, z0.s + fmul z21.s, z15.s, z0.s + + // ---------------------------------------------------------------- z22-z23: p23 = c2 + mov z22.d, z1.d + mov z23.d, z1.d + + // ---------------------------------------------------------------- z22-z23: p23 = c2 + r * c3 + fmla z22.s, p0/m, z14.s, z2.s + fmla z23.s, p0/m, z15.s, z2.s + + // ---------------------------------------------------------------- z24-z35: c4 + mov z24.d, z3.d + mov z25.d, z3.d + + // ---------------------------------------------------------------- z24-z25: p45 = c4 + r * c5 + fmla z24.s, p0/m, z14.s, z4.s + fmla z25.s, p0/m, z15.s, z4.s + + // ---------------------------------------------------------------- z14-z15: r2 = r * r + fmul z14.s, z14.s, z14.s + fmul z15.s, z15.s, z15.s + + // ---------------------------------------------------------------- z22-z23: p2345 = p23 + r2 * p45 + fmla z22.s, p0/m, z14.s, z24.s + fmla z23.s, p0/m, z15.s, z25.s + + // ---------------------------------------------------------------- z20-z21: p12345 = p1 + r2 * p2345 + fmla z20.s, p0/m, z14.s, z22.s + fmla z21.s, p0/m, z15.s, z23.s + + // ---------------------------------------------------------------- z18-z19: poly = scale + p12345 * scale + fmla z18.s, p0/m, z20.s, z18.s + fmla z19.s, p0/m, z21.s, z19.s + + // ---------------------------------------------------------------- z16-z19: poly = underflow ? 
0 : poly + dup z10.s, #0 + sel z12.s, p4, z10.s, z16.s + sel z13.s, p5, z10.s, z17.s + sel z14.s, p6, z10.s, z18.s + sel z15.s, p7, z10.s, z19.s + + // ---------------------------------------------------------------- sum in fp32 + .inst 0xc1a17d80 // fadd za.s[w11, #0, VGx4], {z12.s-z15.s} za0-za3: sum_value = sum_value + poly + + // ---------------------------------------------------------------- + // Process z28-z31 + // ---------------------------------------------------------------- + // ---------------------------------------------------------------- z16-z19: shift + mov z16.d, z5.d + mov z17.d, z5.d + mov z18.d, z5.d + mov z19.d, z5.d + + // ---------------------------------------------------------------- p4-p7: underflow = x < min_input + fcmlt p4.s, p0/z, z28.s, z9.s + fcmlt p5.s, p0/z, z29.s, z9.s + fcmlt p6.s, p0/z, z30.s, z9.s + fcmlt p7.s, p0/z, z31.s, z9.s + + // ---------------------------------------------------------------- z16-z19: z = shift + x * inv_ln2 + fmla z16.s, p0/m, z28.s, z6.s + fmla z17.s, p0/m, z29.s, z6.s + fmla z18.s, p0/m, z30.s, z6.s + fmla z19.s, p0/m, z31.s, z6.s + + // ---------------------------------------------------------------- z20-z23: n = z - shift + fsub z20.s, z16.s, z5.s + fsub z21.s, z17.s, z5.s + fsub z22.s, z18.s, z5.s + fsub z23.s, z19.s, z5.s + + // ---------------------------------------------------------------- z24-z27: r_hi = x + n * neg_ln2_hi + fmla z28.s, p0/m, z20.s, z7.s + fmla z29.s, p0/m, z21.s, z7.s + fmla z30.s, p0/m, z22.s, z7.s + fmla z31.s, p0/m, z23.s, z7.s + + // ---------------------------------------------------------------- z27-z30: r = r_hi + n * neg_ln2_lo + fmla z28.s, p0/m, z20.s, z8.s + fmla z29.s, p0/m, z21.s, z8.s + fmla z30.s, p0/m, z22.s, z8.s + fmla z31.s, p0/m, z23.s, z8.s + + // ---------------------------------------------------------------- z16-z19: scale = z << 23 (2^n) + dup z10.s, #23 + urshl z16.s, p0/m, z16.s, z10.s + urshl z17.s, p0/m, z17.s, z10.s + urshl z18.s, p0/m, z18.s, z10.s + urshl z19.s, p0/m, z19.s, z10.s + + // Processes the first 2 vectors. 
(z28-z29) + + // ---------------------------------------------------------------- z20-z21: p1 = r * c1 + fmul z20.s, z28.s, z0.s + fmul z21.s, z29.s, z0.s + + // ---------------------------------------------------------------- z22-z23: p23 = c2 + mov z22.d, z1.d + mov z23.d, z1.d + + // ---------------------------------------------------------------- z22-z23: p23 = c2 + r * c3 + fmla z22.s, p0/m, z28.s, z2.s + fmla z23.s, p0/m, z29.s, z2.s + + // ---------------------------------------------------------------- z24-z25: c4 + mov z24.d, z3.d + mov z25.d, z3.d + + // ---------------------------------------------------------------- z24-z25: p45 = c4 + r * c5 + fmla z24.s, p0/m, z28.s, z4.s + fmla z25.s, p0/m, z29.s, z4.s + + // ---------------------------------------------------------------- z28-z29: r2 = r * r + fmul z28.s, z28.s, z28.s + fmul z29.s, z29.s, z29.s + + // ---------------------------------------------------------------- z22-z23: p2345 = p23 + r2 * p45 + fmla z22.s, p0/m, z28.s, z24.s + fmla z23.s, p0/m, z29.s, z25.s + + // ---------------------------------------------------------------- z20-z21: p12345 = p1 + r2 * p2345 + fmla z20.s, p0/m, z28.s, z22.s + fmla z21.s, p0/m, z29.s, z23.s + + // ---------------------------------------------------------------- z16-z17: poly = scale + p12345 * scale + fmla z16.s, p0/m, z20.s, z16.s + fmla z17.s, p0/m, z21.s, z17.s + + // Processes the last 2 vectors (z30-z31) + + // ---------------------------------------------------------------- z20-z21: p1 = r * c1 + fmul z20.s, z30.s, z0.s + fmul z21.s, z31.s, z0.s + + // ---------------------------------------------------------------- z22-z23: p23 = c2 + mov z22.d, z1.d + mov z23.d, z1.d + + // ---------------------------------------------------------------- z22-z23: p23 = c2 + r * c3 + fmla z22.s, p0/m, z30.s, z2.s + fmla z23.s, p0/m, z31.s, z2.s + + // ---------------------------------------------------------------- z24-z35: c4 + mov z24.d, z3.d + mov z25.d, z3.d + + // ---------------------------------------------------------------- z24-z25: p45 = c4 + r * c5 + fmla z24.s, p0/m, z30.s, z4.s + fmla z25.s, p0/m, z31.s, z4.s + + // ---------------------------------------------------------------- z30-z31: r2 = r * r + fmul z30.s, z30.s, z30.s + fmul z31.s, z31.s, z31.s + + // ---------------------------------------------------------------- z22-z23: p2345 = p23 + r2 * p45 + fmla z22.s, p0/m, z30.s, z24.s + fmla z23.s, p0/m, z31.s, z25.s + + // ---------------------------------------------------------------- z20-z21: p12345 = p1 + r2 * p2345 + fmla z20.s, p0/m, z30.s, z22.s + fmla z21.s, p0/m, z31.s, z23.s + + // ---------------------------------------------------------------- z18-z19: poly = scale + p12345 * scale + fmla z18.s, p0/m, z20.s, z18.s + fmla z19.s, p0/m, z21.s, z19.s + + // ---------------------------------------------------------------- z16-z19: poly = underflow ? 
0 : poly + dup z10.s, #0 + sel z28.s, p4, z10.s, z16.s + sel z29.s, p5, z10.s, z17.s + sel z30.s, p6, z10.s, z18.s + sel z31.s, p7, z10.s, z19.s + + // ---------------------------------------------------------------- sum in fp32 + .inst 0xc1a17f80 // fadd za.s[w11, #0, VGx4], {z28.s-z31.s} za0-za3: sum_value = sum_value + poly + + fcvt z12.h, p0/m, z12.s + fcvtnt z12.h, p0/m, z28.s + + fcvt z13.h, p0/m, z13.s + fcvtnt z13.h, p0/m, z29.s + + fcvt z14.h, p0/m, z14.s + fcvtnt z14.h, p0/m, z30.s + + fcvt z15.h, p0/m, z15.s + fcvtnt z15.h, p0/m, z31.s + + // Stores 4 consecutive registers to the output + .inst 0xa029a78c // st1h {z12.h-z15.h}, pn9, [x28, x9, LSL #1] + + inch x9, ALL, MUL #4 + b regularize_body_start%= +regularize_body_end%=: + + // ---------------------------------------------------------------- z28: sum_value + .inst 0xc0066c1c // mova {z28.s-z31.s}, za.s[w11, #0, VGx4] + fadd z28.s, z28.s, z29.s + fadd z30.s, z30.s, z31.s + fadd z28.s, z28.s, z30.s + + // Loop for processing the leftover part. +regularize_leftover_start%=: + whilelo p2.h, x9, %x[length] + b.none regularize_leftover_end%= + + ld1h z12.h, p2/z, [x27, x9, LSL #1] // x12: input_data + + fsub z12.h, z12.h, z11.h // z12: x = input_data - max_value + fmul z12.h, z12.h, z26.h // z12: x = (input_data - max_value) * beta + + // ---------------------------------------------------------------- z12.h --> z12.s, z13.s + fcvtlt z13.s, p2/m, z12.h + fcvt z12.s, p2/m, z12.h + + // ---------------------------------------------------------------- p3, p4: predicates for z12, z14 + pfalse p1.b + trn1 p3.h, p2.h, p1.h // for z12 + trn2 p4.h, p2.h, p1.h // for z13 + + mov z16.d, z5.d // z16: shift + mov z17.d, z5.d // z17: shift + fcmlt p5.s, p3/z, z12.s, z9.s // p5: underflow = x < min_input + fcmlt p6.s, p4/z, z13.s, z9.s // p6: underflow = x < min_input + fmla z16.s, p3/m, z12.s, z6.s // z16: z = shift + x * inv_ln2 + fmla z17.s, p4/m, z13.s, z6.s // z17: z = shift + x * inv_ln2 + fsub z20.s, z16.s, z5.s // z20: n = z - shift + fsub z21.s, z17.s, z5.s // z21: n = z - shift + fmla z12.s, p3/m, z20.s, z7.s // z12: r_hi = x + n * neg_ln2_hi + fmla z13.s, p4/m, z21.s, z7.s // z13: r_hi = x + n * neg_ln2_hi + fmla z12.s, p3/m, z20.s, z8.s // z12: r = r_hi + n * neg_ln2_lo + fmla z13.s, p4/m, z21.s, z8.s // z13: r = r_hi + n * neg_ln2_lo + dup z10.s, #23 // z10: 23 + urshl z16.s, p3/m, z16.s, z10.s // z16: scale = z << 23 (2^n) + urshl z17.s, p4/m, z17.s, z10.s // z17: scale = z << 23 (2^n) + fmul z20.s, z12.s, z0.s // z20: p1 = r * c1 + fmul z21.s, z13.s, z0.s // z21: p1 = r * c1 + mov z22.d, z1.d // z22: p23 = c2 + mov z23.d, z1.d // z23: p23 = c2 + fmla z22.s, p3/m, z12.s, z2.s // z22: p23 = c2 + r * c3 + fmla z23.s, p4/m, z13.s, z2.s // z23: p23 = c2 + r * c3 + mov z24.d, z3.d // z24: c4 + mov z25.d, z3.d // z25: c4 + fmla z24.s, p3/m, z12.s, z4.s // z24: p45 = c4 + r * c5 + fmla z25.s, p4/m, z13.s, z4.s // z25: p45 = c4 + r * c5 + fmul z12.s, z12.s, z12.s // z12: r2 = r * r + fmul z13.s, z13.s, z13.s // z13: r2 = r * r + fmla z22.s, p3/m, z12.s, z24.s // z22: p2345 = p23 + r2 * p45 + fmla z23.s, p4/m, z13.s, z25.s // z23: p2345 = p23 + r2 * p45 + fmla z20.s, p3/m, z12.s, z22.s // z20: p12345 = p1 + r2 * p2345 + fmla z21.s, p4/m, z13.s, z23.s // z21: p12345 = p1 + r2 * p2345 + fmla z16.s, p3/m, z20.s, z16.s // z16: poly = scale + p12345 * scale + fmla z17.s, p4/m, z21.s, z17.s // z17: poly = scale + p12345 * scale + dup z10.s, #0 // z10: 0 + sel z16.s, p5, z10.s, z16.s // z16: poly = underflow ? 
0 : poly + sel z17.s, p6, z10.s, z17.s // z17: poly = underflow ? 0 : poly + fadd z28.s, p3/m, z28.s, z16.s // z28: sum_value = sum_value + poly + fadd z28.s, p4/m, z28.s, z17.s // z28: sum_value = sum_value + poly + + fcvt z16.h, p3/m, z16.s + fcvtnt z16.h, p4/m, z17.s + st1h z16.h, p2, [x28, x9, LSL #1] + + inch x9 + b regularize_leftover_start%= +regularize_leftover_end%=: + + // ================================================== + // Step 3: Normalize + // ================================================== + + // ---------------------------------------------------------------- z28: inv_sum_value = 1 / sum_value + faddv s28, p0, z28.s + fmov s29, #1.0 // 1.0f + fdiv s28, s29, s28 + fcvt h28, s28 + + dup z28.h, z28.h[0] + + // Loop for processing 4 vectors per iteration. + mov x9, #0 // x9: index + +normalize_body_start%=: + cmp x9, x13 + b.eq normalize_body_end%= + + .inst 0xa009a78c // ld1h {z12.h-z15.h}, pn9/z, [x28, x9, LSL #1] + + // ---------------------------------------------------------------- z12-z15: result = x * inv_sum_value + fmul z12.h, z12.h, z28.h + fmul z13.h, z13.h, z28.h + fmul z14.h, z14.h, z28.h + fmul z15.h, z15.h, z28.h + + .inst 0xa029a78c // st1h {z12.h-z15.h}, pn9, [x28, x9, LSL #1] + + inch x9, ALL, MUL #4 + b normalize_body_start%= +normalize_body_end%=: + + // Loop for processing the leftover part. +normalize_leftover_start%=: + whilelo p1.h, x9, %x[length] + b.none normalize_leftover_end%= + + ld1h z12.h, p1/z, [x28, x9, LSL #1] // z12: x + fmul z12.h, z12.h, z28.h // z12: result = x * inv_sum_value + + st1h z12.h, p1, [x28, x9, LSL #1] + + inch x9 + b normalize_leftover_start%= +normalize_leftover_end%=: + + // ================================================== + // 3D loop closing + // ================================================== + + add x27, x27, %x[src_stride_1] + add x28, x28, %x[dst_stride_1] + b loop_1_start%= +loop_1_end%=: + + add x24, x24, %x[src_stride_2] + add x25, x25, %x[dst_stride_2] + b loop_2_start%= +loop_2_end%=: + + add x21, x21, %x[src_stride_3] + add x22, x22, %x[dst_stride_3] + b loop_3_start%= +loop_3_end%=: + + .inst 0xd503467f // smstop + )" + : + : [src] "r"(src), [dst] "r"(dst), [beta] "r"(beta), // + [shape_1] "r"(shape[1]), [shape_2] "r"(shape[2]), [shape_3] "r"(shape[3]), // + [src_stride_1] "r"(src_strides[1]), [src_stride_2] "r"(src_strides[2]), + [src_stride_3] "r"(src_strides[3]), // + [dst_stride_1] "r"(dst_strides[1]), [dst_stride_2] "r"(dst_strides[2]), + [dst_stride_3] "r"(dst_strides[3]), // + [length] "r"(shape[0]) // + : "cc", "memory", // + "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p9", // + "x9", "x10", "x11", "x12", "x13", "x14", // + "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", // + "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", // + "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", // + "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", // + "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" // + ); +} + +void sme2_fp16_softmax(const ITensor *in, + void *const, + ITensor *out, + const float beta, + int axis, + const Window &window, + const float *lut_ptr) +{ + ARM_COMPUTE_UNUSED(lut_ptr); + ARM_COMPUTE_UNUSED(axis); + + const auto *src_info = in->info(); + const auto *dst_info = out->info(); + + const auto &full_shape = dst_info->tensor_shape(); + const auto &src_strides = src_info->strides_in_bytes(); + const auto &dst_strides = dst_info->strides_in_bytes(); + + const uintptr_t k_shape[] = { + full_shape[0], + window.num_iterations(1), + window.num_iterations(2), + 
window.num_iterations(3), + }; + + const uintptr_t k_src_strides[] = { + src_strides[0], + src_strides[1], + src_strides[2], + src_strides[3], + }; + + const uintptr_t k_dst_strides[] = { + dst_strides[0], + dst_strides[1], + dst_strides[2], + dst_strides[3], + }; + + const uintptr_t k_src_offset = window[0].start() * src_strides[0] + // + window[1].start() * src_strides[1] + // + window[2].start() * src_strides[2] + // + window[3].start() * src_strides[3]; + + const uintptr_t k_dst_offset = window[0].start() * dst_strides[0] + // + window[1].start() * dst_strides[1] + // + window[2].start() * dst_strides[2] + // + window[3].start() * dst_strides[3]; + + const auto *k_src = reinterpret_cast<const float16_t *>(in->buffer() + k_src_offset); + auto *k_dst = reinterpret_cast<float16_t *>(out->buffer() + k_dst_offset); + + sme2_f16_softmax_kernel(k_src, k_dst, beta, k_shape, k_src_strides, k_dst_strides); +} + +} // namespace cpu +} // namespace arm_compute + +#endif // ARM_COMPUTE_ENABLE_SME2 diff --git a/src/cpu/kernels/softmax/generic/sme2/fp32.cpp b/src/cpu/kernels/softmax/generic/sme2/fp32.cpp new file mode 100644 index 0000000000..5e29d51746 --- /dev/null +++ b/src/cpu/kernels/softmax/generic/sme2/fp32.cpp @@ -0,0 +1,585 @@ +/* + * Copyright (c) 2023-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifdef ARM_COMPUTE_ENABLE_SME2 + +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Window.h" + +namespace arm_compute +{ +namespace cpu +{ + +// SoftMax +// +// Steps: +// * Find max: max_value = max(src) +// * Regularize: dst[i] = exp(src[i] - max_value) +// sum_value = sum(dst) +// * Normalize: dst[i] = dst[i] / sum_value +void sme2_f32_softmax_kernel( // + const float *src, + float *dst, + float beta, + const uintptr_t shape[4], + const uintptr_t src_strides[4], + const uintptr_t dst_strides[4]) +{ + // Precondition: + // * src_strides[0] == sizeof(float) + // * dst_strides[0] == sizeof(float) + + __asm__ volatile( + R"( + .inst 0xd503477f // smstart + + // Registers + // + // * x9: temporary, index + // * x10: temporary, -inf + // * x11: temporary, 0 + // * x12: temporary, 1.0f + // * x13: temporary, body_length + // + // * x20: index_3 + // * x21: src_3 + // * x22: dst_3 + // * x23: index_2 + // * x24: src_2 + // * x25: dst_2 + // * x26: index_1 + // * x27: src_1 + // * x28: dst_1 + // + // * z0: c1 + // * z1: c2 + // * z2: c3 + // * z3: c4 + // * z4: c5 + // * z5: shift + // * z6: inv_ln2 + // * z7: neg_ln2_hi + // * z8: neg_ln2_lo + // * z9: min_input + // * z10: 23, 0 + // * z11: max_value + // * z12-z15: x, r_hi, r, r2 + // * z16-z19: max_value, shift, z, scale, poly + // * z20-z21: n, p1, p12345 + // * z22-z23: n, p23, p2345 + // * z24-z25: p45 + // * z26: beta + // * z28-z31: sum_value + // + // * za0-za3: sum_value + // + // * p0: all-true + // * p1: left-over predicate + // * p4-p7: underflow + // * pn9: all-true + + // Prepares all constant values + + ptrue p0.b + .inst 0x25207811 // ptrue pn9.b + + mov w9, #0xfff6 // c1: 0x1.ffffecp-1f = 0x3f7ffff6 + mov w10, #0xfedb // c2: 0x1.fffdb6p-2f = 0x3efffedb + mov w11, #0xaf33 // c3: 0x1.555e66p-3f = 0x3e2aaf33 + mov w12, #0x9f17 // c4: 0x1.573e2ep-5f = 0x3d2b9f17 + mov w13, #0x2010 // c5: 0x1.0e4020p-7f = 0x3c072010 + + movk w9, #0x3f7f, LSL #16 // c1: 0x1.ffffecp-1f = 0x3f7ffff6 + movk w10, #0x3eff, LSL #16 // c2: 0x1.fffdb6p-2f = 0x3efffedb + movk x11, #0x3e2a, LSL #16 // c3: 0x1.555e66p-3f = 0x3e2aaf33 + movk w12, #0x3d2b, LSL #16 // c4: 0x1.573e2ep-5f = 0x3d2b9f17 + movk w13, #0x3c07, LSL #16 // c5: 0x1.0e4020p-7f = 0x3c072010 + + dup z0.s, w9 // c1. + dup z1.s, w10 // c2. + dup z2.s, w11 // c3. + dup z3.s, w12 // c4. + dup z4.s, w13 // c5. 
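+
+ // The regularize step below computes exp(x) per lane using a standard range
+ // reduction: pick n ~= round(x / ln(2)) and r = x - n * ln(2), so that
+ // exp(x) = 2^n * exp(r) with r small. A rough scalar sketch of what each lane
+ // computes (all names match the register comments in this kernel):
+ //
+ //     z     = shift + x * inv_ln2;                  // shift = 2^23 + 127 forces rounding into the mantissa
+ //     n     = z - shift;                            // n ~= round(x / ln2)
+ //     r     = x + n * neg_ln2_hi + n * neg_ln2_lo;  // -ln(2) split in two parts for precision
+ //     scale = z_bits << 23;                         // z reinterpreted as integer bits; exponent becomes n + 127, i.e. 2^n
+ //     poly  = scale * (1 + c1*r + c2*r^2 + c3*r^3 + c4*r^4 + c5*r^5);
+ //     dst   = (x < min_input) ? 0 : poly;           // inputs that would underflow are forced to 0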
+ + mov w9, #0x007f // shift: 2^23 + 127 = 0x1.0000fep23f = 0x4b00007f + mov w10, #0xaa3b // inv_ln2: 1 / ln(2) = 0x1.715476p+0f = 0x3fb8aa3b + mov w11, #0x7200 // neg_ln2_hi: -ln(2) from bits -1 to -19 = -0x1.62e400p-1f = 0xbf317200 + mov w12, #0xbe8e // neg_ln2_lo: -ln(2) from bits -20 to -42 = -0x1.7f7d1cp-20f = 0xb5bfbe8e + mov w13, #0x47ae // min_input (Approximately ln 2^-125): -86.64 = 0xc2ad47ae + + movk w9, #0x4b00, LSL #16 // shift: 2^23 + 127 = 0x1.0000fep23f = 0x4b00007f + movk w10, #0x3fb8, LSL #16 // inv_ln2: 1 / ln(2) = 0x1.715476p+0f = 0x3fb8aa3b + movk w11, #0xbf31, LSL #16 // neg_ln2_hi: -ln(2) from bits -1 to -19 = -0x1.62e400p-1f = 0xbf317200 + movk w12, #0xb5bf, LSL #16 // neg_ln2_lo: -ln(2) from bits -20 to -42 = -0x1.7f7d1cp-20f = 0xb5bfbe8e + movk w13, #0xc2ad, LSL #16 // min_input (Approximately ln 2^-125): -86.64 = 0xc2ad47ae + + dup z5.s, w9 // shift + dup z6.s, w10 // inv_ln2 + dup z7.s, w11 // neg_ln2_hi + dup z8.s, w12 // neg_ln2_lo + dup z9.s, w13 // min_input + + dup z26.s, %w[beta] // beta + + mov w10, #0x0000 // -inf: 0xff800000 + movk w10, #0xff80 // -inf: 0xff800000 + + mov w11, #0 // 0 + + // ---------------------------------------------------------------- x13: body_length = (length / vl) * vl + cntw x13, ALL, MUL #4 + udiv x9, %x[length], x13 + mul x13, x13, x9 + + // ================================================== + // 3D loop opening + // ================================================== + + mov x20, %x[shape_3] + mov x21, %x[src] + mov x22, %x[dst] + +loop_3_start%=: + // for index_3 in shape_3 downto 1 + cmp x20, #0 + b.eq loop_3_end%= + sub x20, x20, #1 + + mov x23, %x[shape_2] + mov x24, x21 + mov x25, x22 + +loop_2_start%=: + // for index_2 in shape_2 downto 1 + cmp x23, #0 + b.eq loop_2_end%= + sub x23, x23, #1 + + mov x26, %x[shape_1] + mov x27, x24 + mov x28, x25 + +loop_1_start%=: + // for index_1 in shape_2 downto 1 + cmp x26, #0 + b.eq loop_1_end%= + sub x26, x26, #1 + + // ================================================== + // Step 1: Find max + // ================================================== + + // Loop for processing 4 vectors per iteration. + mov x9, #0 // x9: index + dup z11.s, w10 // z11: max_value = -inf + + // ---------------------------------------------------------------- z16-z19: max_value = -inf + mov z16.d, z11.d + mov z17.d, z11.d + mov z18.d, z11.d + mov z19.d, z11.d + +find_max_body_start%=: + cmp x9, x13 + b.eq find_max_body_end%= + + .inst 0xa009c76c // ld1w {z12.s-z15.s}, pn9/z, [x27, x9, LSL #2] // z12-z15: x + .inst 0xc1acb910 // fmax {z16.s-z19.s}, {z16.s-z19.s}, {z12.s-z15.s} // z16-z19: max_value = max(max_value, x) + + incw x9, ALL, MUL #4 + b find_max_body_start%= +find_max_body_end%=: + + // Loop for processing the leftover part. 
+find_max_leftover_start%=: + whilelo p1.s, x9, %x[length] + b.none find_max_leftover_end%= + + ld1w z12.s, p1/z, [x27, x9, LSL #2] // z12: x + fmax z16.s, p1/m, z16.s, z12.s // z16: max_value = max(max_value, x) + + incw x9 + b find_max_leftover_start%= +find_max_leftover_end%=: + + // ---------------------------------------------------------------- z16: max_value + .inst 0xc1b2b110 // fmax {z16.s-z17.s}, {z16.s-z17.s}, {z18.s-z19.s} + fmax z16.s, p0/m, z16.s, z17.s + fmaxv s16, p0, z16.s + + // ---------------------------------------------------------------- z11: max_value + dup z11.s, z16.s[0] + + // ================================================== + // Step 2: Regularize + // ================================================== + + .inst 0xc00800ff // zero {za0.s, za1.s, za2.s, za3.s} za0-za3: sum_value + + // Loop for processing 4 vectors per iteration. + mov x9, #0 // ---------------------------------------------------- x9: index + +regularize_body_start%=: + cmp x9, x13 + b.eq regularize_body_end%= + + // Loads the input data to 4 consecutive registers ---------------- z12-z15: input_data + .inst 0xa009c76c // ld1w {z12.s-z15.s}, pn9/z, [x27, x9, LSL #2] + + // ---------------------------------------------------------------- z12-z15: x = input_data - max_value + fsub z12.s, z12.s, z11.s + fsub z13.s, z13.s, z11.s + fsub z14.s, z14.s, z11.s + fsub z15.s, z15.s, z11.s + + // ---------------------------------------------------------------- z12-z15: x = (input_data - max_value) * beta + fmul z12.s, z12.s, z26.s + fmul z13.s, z13.s, z26.s + fmul z14.s, z14.s, z26.s + fmul z15.s, z15.s, z26.s + + // ---------------------------------------------------------------- z16-z19: shift + mov z16.d, z5.d + mov z17.d, z5.d + mov z18.d, z5.d + mov z19.d, z5.d + + // ---------------------------------------------------------------- p4-p7: underflow = x < min_input + fcmlt p4.s, p0/z, z12.s, z9.s + fcmlt p5.s, p0/z, z13.s, z9.s + fcmlt p6.s, p0/z, z14.s, z9.s + fcmlt p7.s, p0/z, z15.s, z9.s + + // ---------------------------------------------------------------- z16-z19: z = shift + x * inv_ln2 + fmla z16.s, p0/m, z12.s, z6.s + fmla z17.s, p0/m, z13.s, z6.s + fmla z18.s, p0/m, z14.s, z6.s + fmla z19.s, p0/m, z15.s, z6.s + + // ---------------------------------------------------------------- z20-z23: n = z - shift + fsub z20.s, z16.s, z5.s + fsub z21.s, z17.s, z5.s + fsub z22.s, z18.s, z5.s + fsub z23.s, z19.s, z5.s + + // ---------------------------------------------------------------- z12-z15: r_hi = x + n * neg_ln2_hi + fmla z12.s, p0/m, z20.s, z7.s + fmla z13.s, p0/m, z21.s, z7.s + fmla z14.s, p0/m, z22.s, z7.s + fmla z15.s, p0/m, z23.s, z7.s + + // ---------------------------------------------------------------- z12-z15: r = r_hi + n * neg_ln2_lo + fmla z12.s, p0/m, z20.s, z8.s + fmla z13.s, p0/m, z21.s, z8.s + fmla z14.s, p0/m, z22.s, z8.s + fmla z15.s, p0/m, z23.s, z8.s + + // ---------------------------------------------------------------- z16-z19: scale = z << 23 (2^n) + dup z10.s, #23 + urshl z16.s, p0/m, z16.s, z10.s + urshl z17.s, p0/m, z17.s, z10.s + urshl z18.s, p0/m, z18.s, z10.s + urshl z19.s, p0/m, z19.s, z10.s + + // Processes the first 2 vectors. 
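+ // Each pair of vectors below evaluates the degree-5 polynomial with a
+ // Horner-style regrouping on r2 = r * r, mirroring the per-step register
+ // comments (p1, p23, p45, p2345, p12345) that follow:
+ //
+ //     p12345 = c1*r + r2 * ((c2 + c3*r) + r2 * (c4 + c5*r))
+ //
+ // which is then folded into the result as poly = scale + p12345 * scale.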
+ + // ---------------------------------------------------------------- z20-z21: p1 = r * c1 + fmul z20.s, z12.s, z0.s + fmul z21.s, z13.s, z0.s + + // ---------------------------------------------------------------- z22-z23: p23 = c2 + mov z22.d, z1.d + mov z23.d, z1.d + + // ---------------------------------------------------------------- z22-z23: p23 = c2 + r * c3 + fmla z22.s, p0/m, z12.s, z2.s + fmla z23.s, p0/m, z13.s, z2.s + + // ---------------------------------------------------------------- z24-z35: c4 + mov z24.d, z3.d + mov z25.d, z3.d + + // ---------------------------------------------------------------- z24-z25: p45 = c4 + r * c5 + fmla z24.s, p0/m, z12.s, z4.s + fmla z25.s, p0/m, z13.s, z4.s + + // ---------------------------------------------------------------- z12-z13: r2 = r * r + fmul z12.s, z12.s, z12.s + fmul z13.s, z13.s, z13.s + + // ---------------------------------------------------------------- z22-z23: p2345 = p23 + r2 * p45 + fmla z22.s, p0/m, z12.s, z24.s + fmla z23.s, p0/m, z13.s, z25.s + + // ---------------------------------------------------------------- z20-z21: p12345 = p1 + r2 * p2345 + fmla z20.s, p0/m, z12.s, z22.s + fmla z21.s, p0/m, z13.s, z23.s + + // ---------------------------------------------------------------- z16-z17: poly = scale + p12345 * scale + fmla z16.s, p0/m, z20.s, z16.s + fmla z17.s, p0/m, z21.s, z17.s + + // Processes the last 2 vectors + + // ---------------------------------------------------------------- z20-z21: p1 = r * c1 + fmul z20.s, z14.s, z0.s + fmul z21.s, z15.s, z0.s + + // ---------------------------------------------------------------- z22-z23: p23 = c2 + mov z22.d, z1.d + mov z23.d, z1.d + + // ---------------------------------------------------------------- z22-z23: p23 = c2 + r * c3 + fmla z22.s, p0/m, z14.s, z2.s + fmla z23.s, p0/m, z15.s, z2.s + + // ---------------------------------------------------------------- z24-z35: c4 + mov z24.d, z3.d + mov z25.d, z3.d + + // ---------------------------------------------------------------- z24-z25: p45 = c4 + r * c5 + fmla z24.s, p0/m, z14.s, z4.s + fmla z25.s, p0/m, z15.s, z4.s + + // ---------------------------------------------------------------- z14-z15: r2 = r * r + fmul z14.s, z14.s, z14.s + fmul z15.s, z15.s, z15.s + + // ---------------------------------------------------------------- z22-z23: p2345 = p23 + r2 * p45 + fmla z22.s, p0/m, z14.s, z24.s + fmla z23.s, p0/m, z15.s, z25.s + + // ---------------------------------------------------------------- z20-z21: p12345 = p1 + r2 * p2345 + fmla z20.s, p0/m, z14.s, z22.s + fmla z21.s, p0/m, z15.s, z23.s + + // ---------------------------------------------------------------- z18-z19: poly = scale + p12345 * scale + fmla z18.s, p0/m, z20.s, z18.s + fmla z19.s, p0/m, z21.s, z19.s + + // ---------------------------------------------------------------- z16-z19: poly = underflow ? 
0 : poly + dup z10.s, #0 + sel z16.s, p4, z10.s, z16.s + sel z17.s, p5, z10.s, z17.s + sel z18.s, p6, z10.s, z18.s + sel z19.s, p7, z10.s, z19.s + + // Stores 4 consecutive registers to the output + .inst 0xa029c790 // st1w {z16.s-z19.s}, pn9, [x28, x9, LSL #2] + + .inst 0xc1a17e00 // fadd za.s[w11, #0, VGx4], {z16.s-z19.s} za0-za3: sum_value = sum_value + poly + + incw x9, ALL, MUL #4 + b regularize_body_start%= +regularize_body_end%=: + + // ---------------------------------------------------------------- z28: sum_value + .inst 0xc0066c1c // mova {z28.s-z31.s}, za.s[w11, #0, VGx4] + fadd z28.s, z28.s, z29.s + fadd z30.s, z30.s, z31.s + fadd z28.s, z28.s, z30.s + + // Loop for processing the leftover part. +regularize_leftover_start%=: + whilelo p1.s, x9, %x[length] + b.none regularize_leftover_end%= + + ld1w z12.s, p1/z, [x27, x9, LSL #2] // x12: input_data + + fsub z12.s, z12.s, z11.s // z12: x = input_data - max_value + fmul z12.s, z12.s, z26.s // z12: x = (input_data - max_value) * beta + + mov z16.d, z5.d // z16: shift + fcmlt p4.s, p1/z, z12.s, z9.s // p4: underflow = x < min_input + fmla z16.s, p1/m, z12.s, z6.s // z16: z = shift + x * inv_ln2 + fsub z20.s, z16.s, z5.s // z20: n = z - shift + fmla z12.s, p1/m, z20.s, z7.s // z12: r_hi = x + n * neg_ln2_hi + fmla z12.s, p1/m, z20.s, z8.s // z12: r = r_hi + n * neg_ln2_lo + dup z10.s, #23 // z10: 23 + urshl z16.s, p1/m, z16.s, z10.s // z16: scale = z << 23 (2^n) + fmul z20.s, z12.s, z0.s // z20: p1 = r * c1 + mov z22.d, z1.d // z22: p23 = c2 + fmla z22.s, p1/m, z12.s, z2.s // z22: p23 = c2 + r * c3 + mov z24.d, z3.d // z24: c4 + fmla z24.s, p1/m, z12.s, z4.s // z24: p45 = c4 + r * c5 + fmul z12.s, z12.s, z12.s // z12: r2 = r * r + fmla z22.s, p1/m, z12.s, z24.s // z22: p2345 = p23 + r2 * p45 + fmla z20.s, p1/m, z12.s, z22.s // z20: p12345 = p1 + r2 * p2345 + fmla z16.s, p1/m, z20.s, z16.s // z16: poly = scale + p12345 * scale + dup z10.s, #0 // z10: 0 + sel z16.s, p4, z10.s, z16.s // z16: poly = underflow ? 0 : poly + + st1w z16.s, p1, [x28, x9, LSL #2] + + fadd z28.s, p1/m, z28.s, z16.s // z28: sum_value = sum_value + poly + + incw x9 + b regularize_leftover_start%= +regularize_leftover_end%=: + + // ================================================== + // Step 3: Normalize + // ================================================== + + // ---------------------------------------------------------------- z28: inv_sum_value = 1 / sum_value + fmov s29, #1.0 // 1.0f + faddv s28, p0, z28.s + fdiv s28, s29, s28 + dup z28.s, z28.s[0] + + // Loop for processing 4 vectors per iteration. + mov x9, #0 // x9: index + +normalize_body_start%=: + cmp x9, x13 + b.eq normalize_body_end%= + + .inst 0xa009c78c // ld1w {z12.s-z15.s}, pn9/z, [x28, x9, LSL #2] // z12-z15: x + + // ---------------------------------------------------------------- z12-z15: result = x * inv_sum_value + fmul z12.s, z12.s, z28.s + fmul z13.s, z13.s, z28.s + fmul z14.s, z14.s, z28.s + fmul z15.s, z15.s, z28.s + + .inst 0xa029c78c // st1w {z12.s-z15.s}, pn9, [x28, x9, LSL #2] + + incw x9, ALL, MUL #4 + b normalize_body_start%= +normalize_body_end%=: + + // Loop for processing the leftover part. 
+normalize_leftover_start%=: + whilelo p1.s, x9, %x[length] + b.none normalize_leftover_end%= + + ld1w z12.s, p1/z, [x28, x9, LSL #2] // z12: x + fmul z12.s, z12.s, z28.s // z12: result = x * inv_sum_value + + st1w z12.s, p1, [x28, x9, LSL #2] + + incw x9 + b normalize_leftover_start%= +normalize_leftover_end%=: + + // ================================================== + // 3D loop closing + // ================================================== + + add x27, x27, %x[src_stride_1] + add x28, x28, %x[dst_stride_1] + b loop_1_start%= +loop_1_end%=: + + add x24, x24, %x[src_stride_2] + add x25, x25, %x[dst_stride_2] + b loop_2_start%= +loop_2_end%=: + + add x21, x21, %x[src_stride_3] + add x22, x22, %x[dst_stride_3] + b loop_3_start%= +loop_3_end%=: + + .inst 0xd503467f // smstop + )" + : + : [src] "r"(src), [dst] "r"(dst), [beta] "r"(beta), // + [shape_1] "r"(shape[1]), [shape_2] "r"(shape[2]), [shape_3] "r"(shape[3]), // + [src_stride_1] "r"(src_strides[1]), [src_stride_2] "r"(src_strides[2]), + [src_stride_3] "r"(src_strides[3]), // + [dst_stride_1] "r"(dst_strides[1]), [dst_stride_2] "r"(dst_strides[2]), + [dst_stride_3] "r"(dst_strides[3]), // + [length] "r"(shape[0]) // + : "cc", "memory", // + "p0", "p4", "p5", "p6", "p7", "p9", // + "x9", "x10", "x11", "x12", "x13", // + "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", // + "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", // + "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", // + "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", // + "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" // + ); +} + +void sme2_fp32_softmax(const ITensor *in, + void *const, + ITensor *out, + const float beta, + int axis, + const Window &window, + const float *lut_ptr) +{ + ARM_COMPUTE_UNUSED(lut_ptr); + ARM_COMPUTE_UNUSED(axis); + + const auto *src_info = in->info(); + const auto *dst_info = out->info(); + + const auto &full_shape = dst_info->tensor_shape(); + const auto &src_strides = src_info->strides_in_bytes(); + const auto &dst_strides = dst_info->strides_in_bytes(); + + const uintptr_t k_shape[] = { + full_shape[0], + window.num_iterations(1), + window.num_iterations(2), + window.num_iterations(3), + }; + + const uintptr_t k_src_strides[] = { + src_strides[0], + src_strides[1], + src_strides[2], + src_strides[3], + }; + + const uintptr_t k_dst_strides[] = { + dst_strides[0], + dst_strides[1], + dst_strides[2], + dst_strides[3], + }; + + const uintptr_t k_src_offset = window[0].start() * src_strides[0] + // + window[1].start() * src_strides[1] + // + window[2].start() * src_strides[2] + // + window[3].start() * src_strides[3]; + + const uintptr_t k_dst_offset = window[0].start() * dst_strides[0] + // + window[1].start() * dst_strides[1] + // + window[2].start() * dst_strides[2] + // + window[3].start() * dst_strides[3]; + + const auto *k_src = reinterpret_cast<const float *>(in->buffer() + k_src_offset); + auto *k_dst = reinterpret_cast<float *>(out->buffer() + k_dst_offset); + + sme2_f32_softmax_kernel(k_src, k_dst, beta, k_shape, k_src_strides, k_dst_strides); +} + +} // namespace cpu +} // namespace arm_compute + +#endif // ARM_COMPUTE_ENABLE_SME2 diff --git a/src/cpu/kernels/softmax/generic/sme2/qasymm8.cpp b/src/cpu/kernels/softmax/generic/sme2/qasymm8.cpp new file mode 100644 index 0000000000..9feb669f7c --- /dev/null +++ b/src/cpu/kernels/softmax/generic/sme2/qasymm8.cpp @@ -0,0 +1,634 @@ +/* + * Copyright (c) 2023-2024 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifdef ARM_COMPUTE_ENABLE_SME2 + +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Window.h" + +namespace arm_compute +{ +namespace cpu +{ + +// SoftMax +// +// Steps: +// * Find max: max_value = max(src) +// * Regularize: dst[i] = exp(src[i] - max_value) +// sum_value = sum(dst) +// * Normalize: dst[i] = dst[i] / sum_value +void sme2_qasymm8_softmax_kernel_512VL( // + const uint8_t *src, + uint8_t *dst, + float beta, + const uintptr_t shape[4], + const uintptr_t src_strides[4], + const uintptr_t dst_strides[4], + const float *lut, + float *tmp) +{ + // Precondition: + // * src_strides[0] == sizeof(uint8_t) + // * dst_strides[0] == sizeof(uint8_t) + // * tmp_strides[0] == sizeof(float) + + __asm__ volatile( + R"( + .inst 0xd503477f // smstart + + // Registers + // + // * x1: Loop index + // * x2: LUT index + // * x13: temporary, body_length + // + // * x20: index_3 + // * x21: src_3 + // * x22: dst_3 + // * x23: index_2 + // * x24: src_2 + // * x25: dst_2 + // * x26: index_1 + // * x27: src_1 + // * x28: dst_1 + // * x29 tmp + // + // + // * p0: all-true + // * p1: predicate for QASYMM8 values + // * p2: predicate 0 for FP32 values (first quarter of expanded/unpacked p1) + // * p3: predicate 1 for FP32 values (second quarter of expanded/unpacked p1) + // * p4: predicate 2 for FP32 values (third quarter of expanded/unpacked p1) + // * p5: predicate 3 for FP32 values (fourth quarter of expanded/unpacked p1) + // * pn9: all-true for 32 bit values + // * pn8: all-true for 8-bit values + // + // * z0-z15 the 256 LUT values of exp(-scale*beta*x) for x in QASYMM8, stored as FP32 values + + // Prepares all constant values + + ptrue p0.b + .inst 0x25a07811 // ptrue pn9.s + .inst 0x25207810 // ptrue pn8.b + + // ---------------------------------------------------------------- x13: body_length = (length / vl) * vl + cntb x13, ALL, MUL #4 + udiv x9, %x[length], x13 + mul x13, x13, x9 + + // ================================================== + // 3D loop opening + // ================================================== + + mov x20, %x[shape_3] + mov x21, %x[src] + mov x22, %x[dst] + mov x19, %x[lut] + mov x29, %x[tmp] + + // Load the LUT to the register file. 
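+ // The look-up table replaces the exp() evaluation used by the floating-point
+ // kernels: it holds 256 FP32 entries, one per possible QASYMM8 difference
+ // x = max_value - input_data, each storing exp(-scale*beta*x). At the 512-bit
+ // vector length this kernel is written for, every Z register holds 16 FP32
+ // lanes, so the whole table fits in z0-z15. The TBX sequence below walks it in
+ // 16-entry windows: the index is reduced by 16 between windows, and TBX leaves
+ // lanes whose index falls outside the current window unchanged.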
+ mov x2, %x[lut] + .inst 0xa040c440 //ld1w { z0.s - z3.s }, pn9/z, [x2] + add x2, x2, #256 + .inst 0xa040c444 //ld1w { z4.s - z7.s }, pn9/z, [x2] + add x2, x2, #256 + .inst 0xa040c448 //ld1w { z8.s - z11.s }, pn9/z, [x2] + add x2, x2, #256 + .inst 0xa040c44c //ld1w { z12.s - z15.s }, pn9/z, [x2] + + +loop_3_start%=: + // for index_3 in shape_3 downto 1 + cmp x20, #0 + b.eq loop_3_end%= + sub x20, x20, #1 + + mov x23, %x[shape_2] + mov x24, x21 + mov x25, x22 + +loop_2_start%=: + // for index_2 in shape_2 downto 1 + cmp x23, #0 + b.eq loop_2_end%= + sub x23, x23, #1 + + mov x26, %x[shape_1] + mov x27, x24 + mov x28, x25 + +loop_1_start%=: + // for index_1 in shape_2 downto 1 + cmp x26, #0 + b.eq loop_1_end%= + sub x26, x26, #1 + + // ================================================== + // Step 1: Find max + // ================================================== + // z16-z19 = minimum QASYMM8 value (0) to allow for it to be used for comparison to find the max. + dup z16.b, #0 + dup z17.b, #0 + dup z18.b, #0 + dup z19.b, #0 + mov x1, #0 // x1: index +find_max_body_start%=: + cmp x1, x13 + b.eq find_max_body_end%= + .inst 0xa0018374 // ld1b { z20.b - z23.b }, pn8/z, [x27, x1] z20-z23: x + .inst 0xc134b811 // umax { z16.b - z19.b }, { z16.b - z19.b }, { z20.b - z23.b } z16-z19: max_value = max(max_value, x) + add x1, x1, #256 // Advance index by 256 bytes/integers: Z registers = 2048-bit data = 256 8-bit integers. + b find_max_body_start%= +find_max_body_end%=: + + // Loop for processing the leftover part. +find_max_leftover_start%=: + whilelo p1.b, x1, %x[length] + b.none find_max_leftover_end%= + + ld1b z30.b, p1/z, [x27, x1] // z30: x + umax z16.b, p1/m, z16.b, z30.b // z16: max_value = max(max_value, x) + + add x1, x1, #64 + + b find_max_leftover_start%= +find_max_leftover_end%=: + + .inst 0xc132b011 // umax { z16.b, z17.b }, { z16.b, z17.b }, { z18.b, z19.b } + umax z16.b, p0/m, z16.b, z17.b + umaxv b16, p0, z16.b // Reduction unsigned max operation to get maximum_value + dup z16.b, z16.b[0] + uunpklo z16.h, z16.b // Using unpack instructions to align the max value with the FP32 entries in the LUT for use in the TBX instruction + uunpklo z16.s, z16.h + + mov x1, #0 // reset index + dup z25.s, #0 + + mov x1, #0 + +regularize_start%=: + whilelo p1.b, x1, %x[length] + b.none regularize_end%= + + // p2-p5 are - together - the 32-bit version of p1, the instructions below unpack p1 into those four predicate registers to allow for the 32-bit loads below to be correctly predicated + punpklo p2.h, p1.b + punpkhi p4.h, p1.b + + punpkhi p3.h, p2.b + punpklo p2.h, p2.b + + punpkhi p5.h, p4.b + punpklo p4.h, p4.b + + ld1b z17.b, p1/z, [x27, x1] //z17: input data + + uunpklo z18.h, z17.b //Using unpack instructions to align the input QASYMM8 values with the FP32 entries in the LUT for use in the TBX instruction + uunpkhi z19.h, z17.b + + uunpklo z17.s, z18.h // z17 = low low input QASYMM8 values + uunpkhi z18.s, z18.h // z18 = low high input QASYMM8 values + + uunpkhi z20.s, z19.h // z20 = high high input QASYMM8 values + uunpklo z19.s, z19.h // z19 = high low input QASYMM8 values + + sub z17.s, z16.s, z17.s // z12: x = max_value - input_data + sub z18.s, z16.s, z18.s // z13: x = max_value - input_data + sub z19.s, z16.s, z19.s // z14: x = max_value - input_data + sub z20.s, z16.s, z20.s // z15: x = max_value - input_data + + tbx z21.s, z0.s, z17.s // Look-up entries 0-15 in the LUT. 
+ tbx z22.s, z0.s, z18.s + tbx z23.s, z0.s, z19.s + tbx z24.s, z0.s, z20.s + + sub z17.s, z17.s, #16 + sub z18.s, z18.s, #16 + sub z19.s, z19.s, #16 + sub z20.s, z20.s, #16 + + tbx z21.s, z1.s, z17.s // Look-up entries 16-31 in the LUT. + tbx z22.s, z1.s, z18.s + tbx z23.s, z1.s, z19.s + tbx z24.s, z1.s, z20.s + + sub z17.s, z17.s, #16 + sub z18.s, z18.s, #16 + sub z19.s, z19.s, #16 + sub z20.s, z20.s, #16 + + tbx z21.s, z2.s, z17.s // Look-up entries 32-47 in the LUT. + tbx z22.s, z2.s, z18.s + tbx z23.s, z2.s, z19.s + tbx z24.s, z2.s, z20.s + + sub z17.s, z17.s, #16 + sub z18.s, z18.s, #16 + sub z19.s, z19.s, #16 + sub z20.s, z20.s, #16 + + tbx z21.s, z3.s, z17.s // Look-up entries 48-63 in the LUT. + tbx z22.s, z3.s, z18.s + tbx z23.s, z3.s, z19.s + tbx z24.s, z3.s, z20.s + + sub z17.s, z17.s, #16 + sub z18.s, z18.s, #16 + sub z19.s, z19.s, #16 + sub z20.s, z20.s, #16 + + tbx z21.s, z4.s, z17.s // Look-up entries 64-79 in the LUT. + tbx z22.s, z4.s, z18.s + tbx z23.s, z4.s, z19.s + tbx z24.s, z4.s, z20.s + + sub z17.s, z17.s, #16 + sub z18.s, z18.s, #16 + sub z19.s, z19.s, #16 + sub z20.s, z20.s, #16 + + tbx z21.s, z5.s, z17.s // Look-up entries 80-95 in the LUT. + tbx z22.s, z5.s, z18.s + tbx z23.s, z5.s, z19.s + tbx z24.s, z5.s, z20.s + + sub z17.s, z17.s, #16 + sub z18.s, z18.s, #16 + sub z19.s, z19.s, #16 + sub z20.s, z20.s, #16 + + tbx z21.s, z6.s, z17.s // Look-up entries 96-111 in the LUT. + tbx z22.s, z6.s, z18.s + tbx z23.s, z6.s, z19.s + tbx z24.s, z6.s, z20.s + + sub z17.s, z17.s, #16 + sub z18.s, z18.s, #16 + sub z19.s, z19.s, #16 + sub z20.s, z20.s, #16 + + tbx z21.s, z7.s, z17.s // Look-up entries 112-127 in the LUT. + tbx z22.s, z7.s, z18.s + tbx z23.s, z7.s, z19.s + tbx z24.s, z7.s, z20.s + + sub z17.s, z17.s, #16 + sub z18.s, z18.s, #16 + sub z19.s, z19.s, #16 + sub z20.s, z20.s, #16 + + tbx z21.s, z8.s, z17.s // Look-up entries 128-143 in the LUT. + tbx z22.s, z8.s, z18.s + tbx z23.s, z8.s, z19.s + tbx z24.s, z8.s, z20.s + + sub z17.s, z17.s, #16 + sub z18.s, z18.s, #16 + sub z19.s, z19.s, #16 + sub z20.s, z20.s, #16 + + tbx z21.s, z9.s, z17.s // Look-up entries 144-159 in the LUT. + tbx z22.s, z9.s, z18.s + tbx z23.s, z9.s, z19.s + tbx z24.s, z9.s, z20.s + + sub z17.s, z17.s, #16 + sub z18.s, z18.s, #16 + sub z19.s, z19.s, #16 + sub z20.s, z20.s, #16 + + tbx z21.s, z10.s, z17.s // Look-up entries 160-175 in the LUT. + tbx z22.s, z10.s, z18.s + tbx z23.s, z10.s, z19.s + tbx z24.s, z10.s, z20.s + + sub z17.s, z17.s, #16 + sub z18.s, z18.s, #16 + sub z19.s, z19.s, #16 + sub z20.s, z20.s, #16 + + tbx z21.s, z11.s, z17.s // Look-up entries 176-191 in the LUT. + tbx z22.s, z11.s, z18.s + tbx z23.s, z11.s, z19.s + tbx z24.s, z11.s, z20.s + + sub z17.s, z17.s, #16 + sub z18.s, z18.s, #16 + sub z19.s, z19.s, #16 + sub z20.s, z20.s, #16 + + tbx z21.s, z12.s, z17.s // Look-up entries 192-207 in the LUT. + tbx z22.s, z12.s, z18.s + tbx z23.s, z12.s, z19.s + tbx z24.s, z12.s, z20.s + + sub z17.s, z17.s, #16 + sub z18.s, z18.s, #16 + sub z19.s, z19.s, #16 + sub z20.s, z20.s, #16 + + tbx z21.s, z13.s, z17.s // Look-up entries 208-223 in the LUT. + tbx z22.s, z13.s, z18.s + tbx z23.s, z13.s, z19.s + tbx z24.s, z13.s, z20.s + + sub z17.s, z17.s, #16 + sub z18.s, z18.s, #16 + sub z19.s, z19.s, #16 + sub z20.s, z20.s, #16 + + tbx z21.s, z14.s, z17.s // Look-up entries 224-239 in the LUT. 
+ tbx z22.s, z14.s, z18.s + tbx z23.s, z14.s, z19.s + tbx z24.s, z14.s, z20.s + + sub z17.s, z17.s, #16 + sub z18.s, z18.s, #16 + sub z19.s, z19.s, #16 + sub z20.s, z20.s, #16 + + tbx z21.s, z15.s, z17.s // Look-up entries 240-255 in the LUT. + tbx z22.s, z15.s, z18.s + tbx z23.s, z15.s, z19.s + tbx z24.s, z15.s, z20.s + + + st1w z21.s, p2, [x29, x1, LSL #2]// z21 store exp(-scale*beta*x) into the tmp tensor + fadd z25.s, p2/m, z25.s, z21.s + add x1, x1, #16 + + st1w z22.s, p3, [x29, x1, LSL #2]// z22 store exp(-scale*beta*x) into the tmp tensor + fadd z25.s, p3/m, z25.s, z22.s + add x1, x1, #16 + + st1w z23.s, p4, [x29, x1, LSL #2]// z23 store exp(-scale*beta*x) into the tmp tensor + fadd z25.s, p4/m, z25.s, z23.s + add x1, x1, #16 + + st1w z24.s, p5, [x29, x1, LSL #2]// z24 store exp(-scale*beta*x) into the tmp tensor + fadd z25.s, p5/m, z25.s, z24.s + add x1, x1, #16 + + b regularize_start%= +regularize_end%=: + + mov w9, 0x0000 + movk w9, 0x4380, LSL #16 // Moving 256.f into w9 to scale - via multiplication (division by reciprocal) - the floating point [0,1] range of the results to the [0,255] integer range of QASYMM8 + dup z29.s, w9 + faddv s25, p0, z25.s + fdiv s25, s29, s25 + dup z25.s, z25.s[0] // z25: 256.f/sum. 256 is needed to get the full range and 1/sum is part of softmax. + + // ================================================== + // Step 3: Normalize + // ================================================== + mov x1, #0 +normalize_body_start%=: + cmp x1, x13 + b.eq normalize_body_end%= + + mov x2, x1 // Preserve the index into x2 for the final store to dst. + .inst 0xa001c7b0 // ld1w { z16.s - z19.s }, pn9/z, [x29, x1, lsl #2] + add x1, x1, #64 + .inst 0xa001c7b4 // ld1w { z20.s - z23.s }, pn9/z, [x29, x1, lsl #2] + add x1, x1, #64 + + // z16-z23: effectively divides exp(-scale*beta*x) by the sum of the exponentials for the current row and multiplies by 256. + fmul z16.s, z25.s, z16.s + fmul z17.s, z25.s, z17.s + fmul z18.s, z25.s, z18.s + fmul z19.s, z25.s, z19.s + fmul z20.s, z25.s, z20.s + fmul z21.s, z25.s, z21.s + fmul z22.s, z25.s, z22.s + fmul z23.s, z25.s, z23.s + + // z16-z23: convert the FP32 values from the tmp tensor to uint32. + fcvtzu z16.s, p0/m, z16.s + fcvtzu z17.s, p0/m, z17.s + fcvtzu z18.s, p0/m, z18.s + fcvtzu z19.s, p0/m, z19.s + fcvtzu z20.s, p0/m, z20.s + fcvtzu z21.s, p0/m, z21.s + fcvtzu z22.s, p0/m, z22.s + fcvtzu z23.s, p0/m, z23.s + + // z16-z17: narrow the uint32 values into uint8 and saturate them. + .inst 0xc133e230 // uqcvt z16.b, { z16.s - z19.s } + .inst 0xc133e2b1 // uqcvt z17.b, { z20.s - z23.s } + + dup z20.s, z25.s[0] // Juggling the value to z20 as z25 will be overwritten by the load below + + .inst 0xa001c7b8 // ld1w { z24.s - z27.s }, pn9/z, [x29, x1, lsl #2] + add x1, x1, #64 + .inst 0xa001c7bc // ld1w { z28.s - z31.s }, pn9/z, [x29, x1, lsl #2] + add x1, x1, #64 + + // z24-z31: effectively divides exp(-scale*beta*x) by the sum of the exponentials for the current row and multiplies by 256. + fmul z24.s, z20.s, z24.s + fmul z25.s, z20.s, z25.s + fmul z26.s, z20.s, z26.s + fmul z27.s, z20.s, z27.s + fmul z28.s, z20.s, z28.s + fmul z29.s, z20.s, z29.s + fmul z30.s, z20.s, z30.s + fmul z31.s, z20.s, z31.s + + // z24-z31: convert the FP32 values from the tmp tensor to uint32. 
+ fcvtzu z24.s, p0/m, z24.s + fcvtzu z25.s, p0/m, z25.s + fcvtzu z26.s, p0/m, z26.s + fcvtzu z27.s, p0/m, z27.s + fcvtzu z28.s, p0/m, z28.s + fcvtzu z29.s, p0/m, z29.s + fcvtzu z30.s, p0/m, z30.s + fcvtzu z31.s, p0/m, z31.s + + // z18-z19: narrow the uint32 values into uint8 and saturate them. + .inst 0xc133e332 // uqcvt z18.b, { z24.s - z27.s } + .inst 0xc133e3b3 // uqcvt z19.b, { z28.s - z31.s } + + .inst 0xa0228390 // st1b { z16.b - z19.b }, pn8, [x28, x2] + + dup z25.s, z20.s[0] // Juggling the value back to z25 as z20 will be overwritten by the next iteration or z25 will be used below. + +b normalize_body_start%= +normalize_body_end%=: + +normalize_leftover_start%=: + whilelo p1.b, x1, %x[length] + b.none normalize_leftover_end%= + + // p2-p5 are - together - the 32-bit version of p1, the instructions below unpack p1 into those four predicate registers to allow for the 32-bit loads below to be correctly predicated + punpklo p2.h, p1.b + punpkhi p4.h, p1.b + + punpkhi p3.h, p2.b + punpklo p2.h, p2.b + + punpkhi p5.h, p4.b + punpklo p4.h, p4.b + + mov x2, x1 // Preserve the index into x2 for the final store to dst. + + // z20-z23: load exp(-scale*beta*x) from the tmp tensor + ld1w z20.s, p2/z, [x29, x1, LSL #2] + add x1, x1, #16 + + ld1w z21.s, p3/z, [x29, x1, LSL #2] + add x1, x1, #16 + + ld1w z22.s, p4/z, [x29, x1, LSL #2] + add x1, x1, #16 + + ld1w z23.s, p5/z, [x29, x1, LSL #2] + add x1, x1, #16 + + // z20-z23: effectively divides exp(-scale*beta*x) by the sum of the exponentials for the current row and multiplies by 256. + fmul z20.s, z25.s, z20.s + fmul z21.s, z25.s, z21.s + fmul z22.s, z25.s, z22.s + fmul z23.s, z25.s, z23.s + + // z20-23: convert the FP32 values from the tmp tensor to uint32. + fcvtzu z20.s, p0/m, z20.s + fcvtzu z21.s, p0/m, z21.s + fcvtzu z22.s, p0/m, z22.s + fcvtzu z23.s, p0/m, z23.s + + .inst 0xc133e2b3 // uqcvt z19.b, { z20.s - z23.s }, narrow the uint32 values into uint8 and saturate them into z19. 
+ + st1b z19.b, p1, [x28, x2] + + b normalize_leftover_start%= +normalize_leftover_end%=: + // ================================================== + // 3D loop closing + // ================================================== + add x27, x27, %x[src_stride_1] + add x28, x28, %x[dst_stride_1] + b loop_1_start%= +loop_1_end%=: + + add x24, x24, %x[src_stride_2] + add x25, x25, %x[dst_stride_2] + b loop_2_start%= +loop_2_end%=: + + add x21, x21, %x[src_stride_3] + add x22, x22, %x[dst_stride_3] + b loop_3_start%= +loop_3_end%=: + .inst 0xd503467f // smstop + )" + : + : [src] "r"(src), [tmp] "r"(tmp), [dst] "r"(dst), [beta] "r"(beta), [lut] "r"(lut), // + [shape_1] "r"(shape[1]), [shape_2] "r"(shape[2]), [shape_3] "r"(shape[3]), // + [src_stride_1] "r"(src_strides[1]), [src_stride_2] "r"(src_strides[2]), + [src_stride_3] "r"(src_strides[3]), // + [dst_stride_1] "r"(dst_strides[1]), [dst_stride_2] "r"(dst_strides[2]), + [dst_stride_3] "r"(dst_strides[3]), // + [length] "r"(shape[0]) // + : "cc", "memory", // + "p0", "p1", "p2", "p3", "p4", // + "x2", "x9", "x13", // + "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x19", // + "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", // + "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", // + "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", // + "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" // + ); +} + +void sme2_qasymm8_softmax_lut_512VL(const ITensor *in, + void *const tmp, + ITensor *out, + const float beta, + int axis, + const Window &window, + const float *lut_ptr) +{ + ARM_COMPUTE_UNUSED(axis); + + const auto *src_info = in->info(); + const auto *dst_info = out->info(); + + const auto &full_shape = dst_info->tensor_shape(); + const auto &src_strides = src_info->strides_in_bytes(); + const auto &dst_strides = dst_info->strides_in_bytes(); + Strides tmp_strides; + + tmp_strides[0] = src_strides[0] * 4; + tmp_strides[1] = src_strides[1] * 4; + tmp_strides[2] = src_strides[2] * 4; + tmp_strides[3] = src_strides[3] * 4; + + const uintptr_t k_shape[] = { + full_shape[0], + window.num_iterations(1), + window.num_iterations(2), + window.num_iterations(3), + }; + + const uintptr_t k_src_strides[] = { + src_strides[0], + src_strides[1], + src_strides[2], + src_strides[3], + }; + + const uintptr_t k_dst_strides[] = { + dst_strides[0], + dst_strides[1], + dst_strides[2], + dst_strides[3], + }; + + const uintptr_t k_src_offset = window[0].start() * src_strides[0] + // + window[1].start() * src_strides[1] + // + window[2].start() * src_strides[2] + // + window[3].start() * src_strides[3]; + + const uintptr_t k_dst_offset = window[0].start() * dst_strides[0] + // + window[1].start() * dst_strides[1] + // + window[2].start() * dst_strides[2] + // + window[3].start() * dst_strides[3]; + + const uintptr_t k_tmp_offset = window[0].start() * tmp_strides[0] + // + window[1].start() * tmp_strides[1] + // + window[2].start() * tmp_strides[2] + // + window[3].start() * tmp_strides[3]; + + const auto *k_src = reinterpret_cast<const uint8_t *>(in->buffer() + k_src_offset); + float *tmp_float_ptr = reinterpret_cast<float *>(tmp); + auto *k_tmp = reinterpret_cast<float *>(tmp_float_ptr + k_tmp_offset); + auto *k_dst = reinterpret_cast<uint8_t *>(out->buffer() + k_dst_offset); + + sme2_qasymm8_softmax_kernel_512VL(k_src, k_dst, beta, k_shape, k_src_strides, k_dst_strides, lut_ptr, k_tmp); +} + +} // namespace cpu +} // namespace arm_compute + +#endif // ARM_COMPUTE_ENABLE_SME2 diff --git a/src/cpu/kernels/softmax/generic/sme2/qasymm8_signed.cpp 
b/src/cpu/kernels/softmax/generic/sme2/qasymm8_signed.cpp new file mode 100644 index 0000000000..14c0f6c327 --- /dev/null +++ b/src/cpu/kernels/softmax/generic/sme2/qasymm8_signed.cpp @@ -0,0 +1,655 @@ +/* + * Copyright (c) 2023-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifdef ARM_COMPUTE_ENABLE_SME2 + +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Window.h" + +namespace arm_compute +{ +namespace cpu +{ + +// SoftMax +// +// Steps: +// * Find max: max_value = max(src) +// * Regularize: dst[i] = exp(src[i] - max_value) +// sum_value = sum(dst) +// * Normalize: dst[i] = dst[i] / sum_value +void sme2_qasymm8_signed_softmax_kernel_512VL( // + const int8_t *src, + int8_t *dst, + float beta, + const uintptr_t shape[4], + const uintptr_t src_strides[4], + const uintptr_t dst_strides[4], + const float *lut, + float *tmp) +{ + // Precondition: + // * src_strides[0] == sizeof(int8_t) + // * dst_strides[0] == sizeof(int8_t) + // * tmp_strides[0] == sizeof(float) + + __asm__ volatile( + R"( + .inst 0xd503477f // smstart + + // For register list explanation refer to qasymm8.cpp. + + // Prepares all constant values + + ptrue p0.b + .inst 0x25a07811 // ptrue pn9.s + .inst 0x25207810 // ptrue pn8.b + + // ---------------------------------------------------------------- x13: body_length = (length / vl) * vl + cntb x13, ALL, MUL #4 + udiv x9, %x[length], x13 + mul x13, x13, x9 + + // ================================================== + // 3D loop opening + // ================================================== + + mov x20, %x[shape_3] + mov x21, %x[src] + mov x22, %x[dst] + mov x19, %x[lut] + mov x29, %x[tmp] + + // Load the LUT to the register file. 
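+ // Same LUT-driven scheme as qasymm8.cpp, adapted to signed inputs: the maximum
+ // is found with smax/smaxv, values are widened with sign-extending unpacks
+ // (sunpklo/sunpkhi), the computed difference is offset by 128 before indexing
+ // the table, and the normalized results are shifted back into the [-128, 127]
+ // QASYMM8_SIGNED range by subtracting 128.f before the saturating narrow
+ // (sqcvt).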
+ mov x2, %x[lut] + .inst 0xa040c440 //ld1w { z0.s - z3.s }, pn9/z, [x2] + add x2, x2, #256 + .inst 0xa040c444 //ld1w { z4.s - z7.s }, pn9/z, [x2] + add x2, x2, #256 + .inst 0xa040c448 //ld1w { z8.s - z11.s }, pn9/z, [x2] + add x2, x2, #256 + .inst 0xa040c44c //ld1w { z12.s - z15.s }, pn9/z, [x2] + + +loop_3_start%=: + // for index_3 in shape_3 downto 1 + cmp x20, #0 + b.eq loop_3_end%= + sub x20, x20, #1 + + mov x23, %x[shape_2] + mov x24, x21 + mov x25, x22 + +loop_2_start%=: + // for index_2 in shape_2 downto 1 + cmp x23, #0 + b.eq loop_2_end%= + sub x23, x23, #1 + + mov x26, %x[shape_1] + mov x27, x24 + mov x28, x25 + +loop_1_start%=: + // for index_1 in shape_2 downto 1 + cmp x26, #0 + b.eq loop_1_end%= + sub x26, x26, #1 + + // ================================================== + // Step 1: Find max + // ================================================== + // z16-z19 = minimum QASYMM8_SIGNED value (-128) to allow for it to be used for comparison to find the max. + dup z16.b, #0x80 + dup z17.b, #0x80 + dup z18.b, #0x80 + dup z19.b, #0x80 + + mov x1, #0 // x1: index +find_max_body_start%=: + cmp x1, x13 + b.eq find_max_body_end%= + .inst 0xa0018374 // ld1b { z20.b - z23.b }, pn8/z, [x27, x1] z16-z19: x + .inst 0xc134b810 // smax { z16.b - z19.b }, { z16.b - z19.b }, { z20.b - z23.b } z16-z19: max_value = max(max_value, x) + add x1, x1, #256 // Advance index by 256 bytes/integers: Z registers = 2048-bit data = 256 8-bit integers. + b find_max_body_start%= +find_max_body_end%=: + + // Loop for processing the leftover part. +find_max_leftover_start%=: + whilelo p1.b, x1, %x[length] + b.none find_max_leftover_end%= + + ld1b z30.b, p1/z, [x27, x1] // z30: x + smax z16.b, p1/m, z16.b, z30.b // z16: max_value = max(max_value, x) + + add x1, x1, #64 + + b find_max_leftover_start%= +find_max_leftover_end%=: + .inst 0xc132b010 // smax { z16.b, z17.b }, { z16.b, z17.b }, { z18.b, z19.b } + smax z16.b, p0/m, z16.b, z17.b + smaxv b16, p0, z16.b // Reduction signed max operation to get maximum_value + mov z16.b, b16 // z16: duplicated max_value for current row + + sunpklo z16.h, z16.b // Using unpack instructions to align the max value with the FP32 entries in the LUT for use in the TBX instruction + sunpklo z16.s, z16.h + + mov x1, #0 // reset index + dup z25.s, #0 + + +regularize_start%=: + whilelo p1.b, x1, %x[length] + b.none regularize_end%= + + mov w9, 0xFF80 + movk w9, 0xFFFF, LSL #16 // Moving -127.f into w9 to set the registers below to the minimum QASYMM8_SIGNED value + dup z17.s, w9 + dup z18.s, w9 + dup z19.s, w9 + dup z20.s, w9 + + dup z21.s, #0x0 + dup z22.s, #0x0 + dup z23.s, #0x0 + dup z24.s, #0x0 + + // p2-p5 are - together - the 32-bit version of p1, the instructions below unpack p1 into those four predicate registers to allow for the 32-bit loads below to be correctly predicated + punpklo p2.h, p1.b + punpkhi p4.h, p1.b + + punpkhi p3.h, p2.b + punpklo p2.h, p2.b + + punpkhi p5.h, p4.b + punpklo p4.h, p4.b + + ld1b z17.b, p1/z, [x27, x1] //z17: input data + + sunpklo z18.h, z17.b // Using unpack instructions to align the input QASYMM8_SIGNED values with the FP32 entries in the LUT for use in the TBX instruction + sunpkhi z19.h, z17.b // + + sunpklo z17.s, z18.h // z17 = low low input QASYMM8_SIGNED values + sunpkhi z18.s, z18.h // z18 = low high input QASYMM8_SIGNED values + + sunpkhi z20.s, z19.h // z20 = high high input QASYMM8_SIGNED values + sunpklo z19.s, z19.h // z19 = high low input QASYMM8_SIGNED values + + sub z17.s, z16.s, z17.s // z12: x = max_value - input_data + sub 
z18.s, z16.s, z18.s // z13: x = max_value - input_data + sub z19.s, z16.s, z19.s // z14: x = max_value - input_data + sub z20.s, z16.s, z20.s // z15: x = max_value - input_data + + add z17.s, z17.s, #128 + add z18.s, z18.s, #128 + add z19.s, z19.s, #128 + add z20.s, z20.s, #128 + + tbx z21.s, z0.s, z17.s // Look-up entries 0-15 in the LUT. + tbx z22.s, z0.s, z18.s + tbx z23.s, z0.s, z19.s + tbx z24.s, z0.s, z20.s + + sub z17.s, z17.s, #16 + sub z18.s, z18.s, #16 + sub z19.s, z19.s, #16 + sub z20.s, z20.s, #16 + + tbx z21.s, z1.s, z17.s // Look-up entries 16-31 in the LUT. + tbx z22.s, z1.s, z18.s + tbx z23.s, z1.s, z19.s + tbx z24.s, z1.s, z20.s + + sub z17.s, z17.s, #16 + sub z18.s, z18.s, #16 + sub z19.s, z19.s, #16 + sub z20.s, z20.s, #16 + + tbx z21.s, z2.s, z17.s // Look-up entries 32-47 in the LUT. + tbx z22.s, z2.s, z18.s + tbx z23.s, z2.s, z19.s + tbx z24.s, z2.s, z20.s + + sub z17.s, z17.s, #16 + sub z18.s, z18.s, #16 + sub z19.s, z19.s, #16 + sub z20.s, z20.s, #16 + + tbx z21.s, z3.s, z17.s // Look-up entries 48-63 in the LUT. + tbx z22.s, z3.s, z18.s + tbx z23.s, z3.s, z19.s + tbx z24.s, z3.s, z20.s + + sub z17.s, z17.s, #16 + sub z18.s, z18.s, #16 + sub z19.s, z19.s, #16 + sub z20.s, z20.s, #16 + + tbx z21.s, z4.s, z17.s // Look-up entries 64-79 in the LUT. + tbx z22.s, z4.s, z18.s + tbx z23.s, z4.s, z19.s + tbx z24.s, z4.s, z20.s + + sub z17.s, z17.s, #16 + sub z18.s, z18.s, #16 + sub z19.s, z19.s, #16 + sub z20.s, z20.s, #16 + + tbx z21.s, z5.s, z17.s // Look-up entries 80-95 in the LUT. + tbx z22.s, z5.s, z18.s + tbx z23.s, z5.s, z19.s + tbx z24.s, z5.s, z20.s + + sub z17.s, z17.s, #16 + sub z18.s, z18.s, #16 + sub z19.s, z19.s, #16 + sub z20.s, z20.s, #16 + + tbx z21.s, z6.s, z17.s // Look-up entries 96-111 in the LUT. + tbx z22.s, z6.s, z18.s + tbx z23.s, z6.s, z19.s + tbx z24.s, z6.s, z20.s + + sub z17.s, z17.s, #16 + sub z18.s, z18.s, #16 + sub z19.s, z19.s, #16 + sub z20.s, z20.s, #16 + + tbx z21.s, z7.s, z17.s // Look-up entries 112-127 in the LUT. + tbx z22.s, z7.s, z18.s + tbx z23.s, z7.s, z19.s + tbx z24.s, z7.s, z20.s + + sub z17.s, z17.s, #16 + sub z18.s, z18.s, #16 + sub z19.s, z19.s, #16 + sub z20.s, z20.s, #16 + + tbx z21.s, z8.s, z17.s // Look-up entries 128-143 in the LUT. + tbx z22.s, z8.s, z18.s + tbx z23.s, z8.s, z19.s + tbx z24.s, z8.s, z20.s + + sub z17.s, z17.s, #16 + sub z18.s, z18.s, #16 + sub z19.s, z19.s, #16 + sub z20.s, z20.s, #16 + + tbx z21.s, z9.s, z17.s // Look-up entries 144-159 in the LUT. + tbx z22.s, z9.s, z18.s + tbx z23.s, z9.s, z19.s + tbx z24.s, z9.s, z20.s + + sub z17.s, z17.s, #16 + sub z18.s, z18.s, #16 + sub z19.s, z19.s, #16 + sub z20.s, z20.s, #16 + + tbx z21.s, z10.s, z17.s // Look-up entries 160-175 in the LUT. + tbx z22.s, z10.s, z18.s + tbx z23.s, z10.s, z19.s + tbx z24.s, z10.s, z20.s + + sub z17.s, z17.s, #16 + sub z18.s, z18.s, #16 + sub z19.s, z19.s, #16 + sub z20.s, z20.s, #16 + + tbx z21.s, z11.s, z17.s // Look-up entries 176-191 in the LUT. + tbx z22.s, z11.s, z18.s + tbx z23.s, z11.s, z19.s + tbx z24.s, z11.s, z20.s + + sub z17.s, z17.s, #16 + sub z18.s, z18.s, #16 + sub z19.s, z19.s, #16 + sub z20.s, z20.s, #16 + + tbx z21.s, z12.s, z17.s // Look-up entries 192-207 in the LUT. + tbx z22.s, z12.s, z18.s + tbx z23.s, z12.s, z19.s + tbx z24.s, z12.s, z20.s + + sub z17.s, z17.s, #16 + sub z18.s, z18.s, #16 + sub z19.s, z19.s, #16 + sub z20.s, z20.s, #16 + + tbx z21.s, z13.s, z17.s // Look-up entries 208-223 in the LUT. 
+ tbx z22.s, z13.s, z18.s + tbx z23.s, z13.s, z19.s + tbx z24.s, z13.s, z20.s + + sub z17.s, z17.s, #16 + sub z18.s, z18.s, #16 + sub z19.s, z19.s, #16 + sub z20.s, z20.s, #16 + + tbx z21.s, z14.s, z17.s // Look-up entries 224-239 in the LUT. + tbx z22.s, z14.s, z18.s + tbx z23.s, z14.s, z19.s + tbx z24.s, z14.s, z20.s + + sub z17.s, z17.s, #16 + sub z18.s, z18.s, #16 + sub z19.s, z19.s, #16 + sub z20.s, z20.s, #16 + + tbx z21.s, z15.s, z17.s // Look-up entries 240-255 in the LUT. + tbx z22.s, z15.s, z18.s + tbx z23.s, z15.s, z19.s + tbx z24.s, z15.s, z20.s + + + st1w z21.s, p2, [x29, x1, LSL #2]// z21 store exp(-scale*beta*x) into the tmp tensor + fadd z25.s, p2/m, z25.s, z21.s + add x1, x1, #16 + + st1w z22.s, p3, [x29, x1, LSL #2]// z22 store exp(-scale*beta*x) into the tmp tensor + fadd z25.s, p3/m, z25.s, z22.s + add x1, x1, #16 + + st1w z23.s, p4, [x29, x1, LSL #2]// z23 store exp(-scale*beta*x) into the tmp tensor + fadd z25.s, p4/m, z25.s, z23.s + add x1, x1, #16 + + st1w z24.s, p5, [x29, x1, LSL #2]// z24 store exp(-scale*beta*x) into the tmp tensor + fadd z25.s, p5/m, z25.s, z24.s + add x1, x1, #16 + + b regularize_start%= +regularize_end%=: + + mov w9, 0x0000 + movk w9, 0x4380, LSL #16 // Moving 256.f into w9 to scale - via multiplication (division by reciprocal) - the floating point [0,1] range of the results to the [-128, 127] integer range of QASYMM8_SIGNED + mov w10, 0x0000 + movk w10, 0x4300, LSL #16 // Moving 128.f into w10 for the subtraction to move the results - via subtraction - from the [0,255] range to the [-128,127] range + dup z29.s, w9 + dup z30.s, w10 + faddv s25, p0, z25.s + fdiv s25, s29, s25 + dup z25.s, z25.s[0] // z25: 256.f/sum. 256 is needed to get the full range and 1/sum is part of softmax. + + // ================================================== + // Step 3: Normalize + // ================================================== + mov x1, #0 +normalize_body_start%=: + cmp x1, x13 + b.eq normalize_body_end%= + + mov x2, x1 // Preserve the index into x2 for the final store to dst. + .inst 0xa001c7b0 // ld1w { z16.s - z19.s }, pn9/z, [x29, x1, lsl #2] + add x1, x1, #64 + .inst 0xa001c7b4 // ld1w { z20.s - z23.s }, pn9/z, [x29, x1, lsl #2] + add x1, x1, #64 + + // z16-z23: effectively divides exp(-scale*beta*x) by the sum of the exponentials for the current row and multiplies by 256. + fmul z16.s, z25.s, z16.s + fmul z17.s, z25.s, z17.s + fmul z18.s, z25.s, z18.s + fmul z19.s, z25.s, z19.s + fmul z20.s, z25.s, z20.s + fmul z21.s, z25.s, z21.s + fmul z22.s, z25.s, z22.s + fmul z23.s, z25.s, z23.s + + // z16-z23: subtract 128.f. + fsub z16.s, z16.s, z30.s // Subtract 128.f + fsub z17.s, z17.s, z30.s // Subtract 128.f + fsub z18.s, z18.s, z30.s // Subtract 128.f + fsub z19.s, z19.s, z30.s // Subtract 128.f + fsub z20.s, z20.s, z30.s // Subtract 128.f + fsub z21.s, z21.s, z30.s // Subtract 128.f + fsub z22.s, z22.s, z30.s // Subtract 128.f + fsub z23.s, z23.s, z30.s // Subtract 128.f + + // z16-z23: convert the FP32 values from the tmp tensor to int32. + fcvtzs z16.s, p0/m, z16.s + fcvtzs z17.s, p0/m, z17.s + fcvtzs z18.s, p0/m, z18.s + fcvtzs z19.s, p0/m, z19.s + fcvtzs z20.s, p0/m, z20.s + fcvtzs z21.s, p0/m, z21.s + fcvtzs z22.s, p0/m, z22.s + fcvtzs z23.s, p0/m, z23.s + + // z16-z17: narrow the int32 values into int8 and saturate them. + .inst 0xc133e210 // sqcvt z16.b, { z16.s - z19.s } + .inst 0xc133e291 // sqcvt z17.b, { z20.s - z23.s } + + // Juggling the value to z20 (resp. 21) as z25 (resp. z30) will be overwritten by the load below. 
+ dup z20.s, z25.s[0] + dup z21.s, z30.s[0] + + .inst 0xa001c7b8 // ld1w { z24.s - z27.s }, pn9/z, [x29, x1, lsl #2] + add x1, x1, #64 + .inst 0xa001c7bc // ld1w { z28.s - z31.s }, pn9/z, [x29, x1, lsl #2] + add x1, x1, #64 + + // z24-z31: effectively divides exp(-scale*beta*x) by the sum of the exponentials for the current row and multiplies by 256. + fmul z24.s, z20.s, z24.s + fmul z25.s, z20.s, z25.s + fmul z26.s, z20.s, z26.s + fmul z27.s, z20.s, z27.s + fmul z28.s, z20.s, z28.s + fmul z29.s, z20.s, z29.s + fmul z30.s, z20.s, z30.s + fmul z31.s, z20.s, z31.s + + // z24-z31: subtract 128.f. + fsub z24.s, z24.s, z21.s + fsub z25.s, z25.s, z21.s + fsub z26.s, z26.s, z21.s + fsub z27.s, z27.s, z21.s + fsub z28.s, z28.s, z21.s + fsub z29.s, z29.s, z21.s + fsub z30.s, z30.s, z21.s + fsub z31.s, z31.s, z21.s + + // z24-z31: convert the FP32 values from the tmp tensor to int32. + fcvtzs z24.s, p0/m, z24.s + fcvtzs z25.s, p0/m, z25.s + fcvtzs z26.s, p0/m, z26.s + fcvtzs z27.s, p0/m, z27.s + fcvtzs z28.s, p0/m, z28.s + fcvtzs z29.s, p0/m, z29.s + fcvtzs z30.s, p0/m, z30.s + fcvtzs z31.s, p0/m, z31.s + + // z18-z19: narrow the int32 values into int8 and saturate them. + .inst 0xc133e312 // sqcvt z18.b, { z24.s - z27.s } + .inst 0xc133e393 // sqcvt z19.b, { z28.s - z31.s } + + .inst 0xa0228390 // st1b { z16.b - z19.b }, pn8, [x28, x2] + + // Juggling the values back to z25 (resp. z30) as z20 (resp. z21) will be overwritten by the next iteration or z25 (resp. z30) will be used below. + dup z25.s, z20.s[0] + dup z30.s, z21.s[0] +b normalize_body_start%= +normalize_body_end%=: +normalize_leftover_start%=: + whilelo p1.b, x1, %x[length] + b.none normalize_leftover_end%= + + // p2-p5 are - together - the 32-bit version of p1, the instructions below unpack p1 into those four predicate registers to allow for the 32-bit loads below to be correctly predicated + punpklo p2.h, p1.b + punpkhi p4.h, p1.b + + punpkhi p3.h, p2.b + punpklo p2.h, p2.b + + punpkhi p5.h, p4.b + punpklo p4.h, p4.b + + mov x2, x1 // Preserve the index into x2 for the final store to dst. + + // z20-z23: load exp(-scale*beta*x) from the tmp tensor + ld1w z20.s, p2/z, [x29, x1, LSL #2] + add x1, x1, #16 + + ld1w z21.s, p3/z, [x29, x1, LSL #2] + add x1, x1, #16 + + ld1w z22.s, p4/z, [x29, x1, LSL #2] + add x1, x1, #16 + + ld1w z23.s, p5/z, [x29, x1, LSL #2] + add x1, x1, #16 + + // z20-z23: effectively divides exp(-scale*beta*x) by the sum of the exponentials for the current row and multiplies by 256. + fmul z20.s, z25.s, z20.s + fmul z21.s, z25.s, z21.s + fmul z22.s, z25.s, z22.s + fmul z23.s, z25.s, z23.s + + //z20-z23: Subtract 128.f. + fsub z20.s, z20.s, z30.s + fsub z21.s, z21.s, z30.s + fsub z22.s, z22.s, z30.s + fsub z23.s, z23.s, z30.s + + // z20-23: convert the FP32 values from the tmp tensor to int32. + fcvtzs z20.s, p0/m, z20.s + fcvtzs z21.s, p0/m, z21.s + fcvtzs z22.s, p0/m, z22.s + fcvtzs z23.s, p0/m, z23.s + + .inst 0xc133e293 // sqcvt z19.b, { z20.s - z23.s }, narrow the int32 values into int8 and saturate them into z19. 
+ + st1b z19.b, p1, [x28, x2] + + b normalize_leftover_start%= +normalize_leftover_end%=: + // ================================================== + // 3D loop closing + // ================================================== + add x27, x27, %x[src_stride_1] + add x28, x28, %x[dst_stride_1] + b loop_1_start%= +loop_1_end%=: + + add x24, x24, %x[src_stride_2] + add x25, x25, %x[dst_stride_2] + b loop_2_start%= +loop_2_end%=: + + add x21, x21, %x[src_stride_3] + add x22, x22, %x[dst_stride_3] + b loop_3_start%= +loop_3_end%=: + .inst 0xd503467f // smstop + )" + : + : [src] "r"(src), [tmp] "r"(tmp), [dst] "r"(dst), [beta] "r"(beta), [lut] "r"(lut), // + [shape_1] "r"(shape[1]), [shape_2] "r"(shape[2]), [shape_3] "r"(shape[3]), // + [src_stride_1] "r"(src_strides[1]), [src_stride_2] "r"(src_strides[2]), + [src_stride_3] "r"(src_strides[3]), // + [dst_stride_1] "r"(dst_strides[1]), [dst_stride_2] "r"(dst_strides[2]), + [dst_stride_3] "r"(dst_strides[3]), // + [length] "r"(shape[0]) // + : "cc", "memory", // + "p0", "p1", "p2", "p3", "p4", // + "x2", "x9", "x13", // + "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x19", // + "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", // + "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", // + "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", // + "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" // + ); +} + +void sme2_qasymm8_signed_softmax_lut_512VL(const ITensor *in, + void *const tmp, + ITensor *out, + const float beta, + int axis, + const Window &window, + const float *lut_ptr) +{ + ARM_COMPUTE_UNUSED(axis); + + const auto *src_info = in->info(); + const auto *dst_info = out->info(); + + const auto &full_shape = dst_info->tensor_shape(); + const auto &src_strides = src_info->strides_in_bytes(); + const auto &dst_strides = dst_info->strides_in_bytes(); + Strides tmp_strides; + + tmp_strides[0] = src_strides[0] * 4; + tmp_strides[1] = src_strides[1] * 4; + tmp_strides[2] = src_strides[2] * 4; + tmp_strides[3] = src_strides[3] * 4; + + const uintptr_t k_shape[] = { + full_shape[0], + window.num_iterations(1), + window.num_iterations(2), + window.num_iterations(3), + }; + + const uintptr_t k_src_strides[] = { + src_strides[0], + src_strides[1], + src_strides[2], + src_strides[3], + }; + + const uintptr_t k_dst_strides[] = { + dst_strides[0], + dst_strides[1], + dst_strides[2], + dst_strides[3], + }; + + const uintptr_t k_src_offset = window[0].start() * src_strides[0] + // + window[1].start() * src_strides[1] + // + window[2].start() * src_strides[2] + // + window[3].start() * src_strides[3]; + + const uintptr_t k_dst_offset = window[0].start() * dst_strides[0] + // + window[1].start() * dst_strides[1] + // + window[2].start() * dst_strides[2] + // + window[3].start() * dst_strides[3]; + + const uintptr_t k_tmp_offset = window[0].start() * tmp_strides[0] + // + window[1].start() * tmp_strides[1] + // + window[2].start() * tmp_strides[2] + // + window[3].start() * tmp_strides[3]; + + const auto *k_src = reinterpret_cast<const int8_t *>(in->buffer() + k_src_offset); + float *tmp_float_ptr = reinterpret_cast<float *>(tmp); + auto *k_tmp = reinterpret_cast<float *>(tmp_float_ptr + k_tmp_offset); + auto *k_dst = reinterpret_cast<int8_t *>(out->buffer() + k_dst_offset); + + sme2_qasymm8_signed_softmax_kernel_512VL(k_src, k_dst, beta, k_shape, k_src_strides, k_dst_strides, lut_ptr, k_tmp); +} + +} // namespace cpu +} // namespace arm_compute + +#endif // ARM_COMPUTE_ENABLE_SME2 diff --git a/src/cpu/kernels/softmax/generic/sve/impl.cpp 
b/src/cpu/kernels/softmax/generic/sve/impl.cpp new file mode 100644 index 0000000000..0d4b7f4509 --- /dev/null +++ b/src/cpu/kernels/softmax/generic/sve/impl.cpp @@ -0,0 +1,179 @@ +/* + * Copyright (c) 2021-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/cpu/kernels/softmax/generic/sve/impl.h" + +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" + +namespace arm_compute +{ +namespace cpu +{ +/// TODO: (COMPMID-6505) Similar to Neon(TM), this implementation be converted to +/// a single kernel that performs softmax operation. Leaving the SVE code here for +/// future references. Implementation for Neon(TM) is introduced in COMPMID-6500 +template <typename ScalarType> +void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window) +{ + const auto all_true_pg = wrapper::svptrue<ScalarType>(); + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + Window win{window}; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + Iterator input(in, win); + Iterator output(out, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + // Get pointers + const auto in_ptr = reinterpret_cast<const ScalarType *>(input.ptr()); + const auto out_ptr = reinterpret_cast<ScalarType *>(output.ptr()); + + // Init max value + auto vec_max = wrapper::svdup_n(support::cpp11::lowest<ScalarType>()); + + int x = window_start_x; + svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x); + do + { + const auto current_value = svld1(pg, in_ptr + x); + vec_max = svmax_m(pg, vec_max, current_value); + + x += wrapper::svcnt<ScalarType>(); + pg = wrapper::svwhilelt<ScalarType>(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); + + auto max_val = svmaxv(all_true_pg, vec_max); + + *out_ptr = max_val; + }, + input, output); +} + +template <typename ScalarType> +void sve_softmax_logits_1d_float(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + const float beta, + bool is_log, + const Window &window) +{ + const int start_x = in->info()->valid_region().anchor.x(); + const int input_width = in->info()->valid_region().shape.x(); + + Iterator in_it(in, window); + Iterator max_it(max, window); + Iterator out_it(out, window); + + const auto all_true_pg = wrapper::svptrue<ScalarType>(); + + execute_window_loop( + window, + [&](const Coordinates &) + { + /* Get pointers */ + const auto 
in_ptr = reinterpret_cast<const ScalarType *>(in_it.ptr()) + start_x; + const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr()) + start_x; + const auto tmp_ptr = reinterpret_cast<ScalarType *>(tmp); + + ScalarType sum{0}; + + /* Compute exponentials and sum */ + { + /* Get max value */ + const auto max_val = *reinterpret_cast<const ScalarType *>(max_it.ptr()); + const auto vec_max = wrapper::svdup_n(max_val); + const auto vec_beta = wrapper::svdup_n(static_cast<ScalarType>(beta)); + + /* Init sum to zero */ + auto vec_sum = wrapper::svdup_n(static_cast<ScalarType>(0)); + + /* Loop over row and compute exponentials and sum */ + int x = 0; + svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width); + do + { + auto vec_elements = svld1(pg, in_ptr + x); + vec_elements = svmul_z(pg, svsub_z(pg, vec_elements, vec_max), vec_beta); + if (!is_log) + { + vec_elements = wrapper::svexp_z(pg, vec_elements); + vec_sum = svadd_m(pg, vec_sum, vec_elements); + } + svst1(pg, tmp_ptr + x, vec_elements); + + if (is_log) + { + vec_sum = svadd_m(pg, vec_sum, wrapper::svexp_z(pg, vec_elements)); + } + + x += wrapper::svcnt<ScalarType>(); + pg = wrapper::svwhilelt<ScalarType>(x, input_width); + } while (svptest_any(all_true_pg, pg)); + + /* Reduce sum */ + sum = svaddv(all_true_pg, vec_sum); + + if (is_log) + { + sum = static_cast<ScalarType>(std::log(sum)); + } + else + { + sum = ScalarType(1) / sum; + } + } + + /* Normalize exponentials */ + { + /* Loop over row and compute softmax */ + int x = 0; + svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width); + do + { + auto vec_in = svld1(pg, tmp_ptr + x); + auto normalized_value = wrapper::svdup_n(static_cast<ScalarType>(0)); + if (is_log) + { + normalized_value = svsub_z(pg, vec_in, wrapper::svdup_n(static_cast<ScalarType>(sum))); + } + else + { + normalized_value = svmul_z(pg, vec_in, wrapper::svdup_n(static_cast<ScalarType>(sum))); + } + svst1(pg, out_ptr + x, normalized_value); + + x += wrapper::svcnt<ScalarType>(); + pg = wrapper::svwhilelt<ScalarType>(x, input_width); + } while (svptest_any(all_true_pg, pg)); + } + }, + in_it, max_it, out_it); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sve/impl.h b/src/cpu/kernels/softmax/generic/sve/impl.h new file mode 100644 index 0000000000..89a30d042f --- /dev/null +++ b/src/cpu/kernels/softmax/generic/sve/impl.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2021-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CORE_SVE_KERNELS_SOFTMAX_IMPL_H +#define SRC_CORE_SVE_KERNELS_SOFTMAX_IMPL_H + +#include "arm_compute/core/Helpers.h" +namespace arm_compute +{ +namespace cpu +{ +template <typename ScalarType> +void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window); + +template <typename ScalarType> +void sve_softmax_logits_1d_float(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + const float beta, + bool is_log, + const Window &window); +} // namespace cpu +} // namespace arm_compute + +#endif /* SRC_CORE_SVE_KERNELS_SOFTMAX_IMPL_H */ diff --git a/src/cpu/kernels/softmax/generic/sve2/impl.cpp b/src/cpu/kernels/softmax/generic/sve2/impl.cpp new file mode 100644 index 0000000000..a8fb1d4adf --- /dev/null +++ b/src/cpu/kernels/softmax/generic/sve2/impl.cpp @@ -0,0 +1,212 @@ +/* + * Copyright (c) 2021-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/cpu/kernels/softmax/generic/sve2/impl.h" + +#include "arm_compute/core/Types.h" + +#include "src/core/NEON/wrapper/wrapper.h" + +namespace arm_compute +{ +namespace cpu +{ +/// TODO: (COMPMID-6505) Similar to Neon(TM), this implementation be converted to +/// a single kernel that performs softmax operation. Leaving the SVE2 code here for +/// future references. 
Implementation for Neon(TM) is introduced in COMPMID-6500 +template <typename ScalarType> +void sve2_softmax_logits_1d_quantized( + const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, float beta, bool is_log, const Window &window) +{ + const int start_x = in->info()->valid_region().anchor.x(); + const int input_width = in->info()->valid_region().shape.x(); + + const float scale_beta = -beta * in->info()->quantization_info().uniform().scale; + const auto scale_beta_vec = svdup_n_f32(scale_beta); + + Iterator in_it(in, window); + Iterator max_it(max, window); + Iterator out_it(out, window); + const auto all_true_pg = wrapper::svptrue<ScalarType>(); + using SVEType = typename wrapper::traits::sve_vector<ScalarType>::type; + + const int inc_1 = static_cast<int>(svcntw()); + const int inc_2 = static_cast<int>(2 * svcntw()); + const int inc_3 = static_cast<int>(3 * svcntw()); + + execute_window_loop( + window, + [&](const Coordinates &) + { + /* Get pointers */ + const auto in_ptr = reinterpret_cast<const ScalarType *>(in_it.ptr()) + start_x; + const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr()) + start_x; + const auto tmp_ptr = reinterpret_cast<float *>(tmp); + + float sum{}; + + /* Compute exponentials and sum */ + { + /* Get max value */ + const auto max_val = *reinterpret_cast<const ScalarType *>(max_it.ptr()); + const auto vec_max = wrapper::svdup_n(max_val); + + /* Init sum to zero */ + auto vec_sum_0 = svdup_n_f32(0.f); + auto vec_sum_1 = svdup_n_f32(0.f); + auto vec_sum_2 = svdup_n_f32(0.f); + auto vec_sum_3 = svdup_n_f32(0.f); + + /* Loop over row and compute exponentials and sum */ + int x = 0; + svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width); + svbool_t pg_0 = svunpklo(svunpklo(pg)); + svbool_t pg_1 = svunpkhi(svunpklo(pg)); + svbool_t pg_2 = svunpklo(svunpkhi(pg)); + svbool_t pg_3 = svunpkhi(svunpkhi(pg)); + do + { + const auto vec_elements = svld1(pg, in_ptr + x); + const auto vec_elements_sub = svreinterpret_u8(svsub_z(pg, vec_max, vec_elements)); + + auto vec_elements_flt_0 = svcvt_f32_z(pg_0, svunpklo(svunpklo(vec_elements_sub))); + auto vec_elements_flt_1 = svcvt_f32_z(pg_1, svunpkhi(svunpklo(vec_elements_sub))); + auto vec_elements_flt_2 = svcvt_f32_z(pg_2, svunpklo(svunpkhi(vec_elements_sub))); + auto vec_elements_flt_3 = svcvt_f32_z(pg_3, svunpkhi(svunpkhi(vec_elements_sub))); + + if (is_log) + { + vec_elements_flt_0 = svmul_f32_z(pg_0, vec_elements_flt_0, scale_beta_vec); + vec_elements_flt_1 = svmul_f32_z(pg_1, vec_elements_flt_1, scale_beta_vec); + vec_elements_flt_2 = svmul_f32_z(pg_2, vec_elements_flt_2, scale_beta_vec); + vec_elements_flt_3 = svmul_f32_z(pg_3, vec_elements_flt_3, scale_beta_vec); + vec_sum_0 = svadd_f32_m(pg_0, vec_sum_0, svexp_f32_z(pg_0, vec_elements_flt_0)); + vec_sum_1 = svadd_f32_m(pg_1, vec_sum_1, svexp_f32_z(pg_1, vec_elements_flt_1)); + vec_sum_2 = svadd_f32_m(pg_2, vec_sum_2, svexp_f32_z(pg_2, vec_elements_flt_2)); + vec_sum_3 = svadd_f32_m(pg_3, vec_sum_3, svexp_f32_z(pg_3, vec_elements_flt_3)); + } + else + { + vec_elements_flt_0 = svexp_f32_z(pg_0, svmul_f32_z(pg_0, vec_elements_flt_0, scale_beta_vec)); + vec_elements_flt_1 = svexp_f32_z(pg_1, svmul_f32_z(pg_1, vec_elements_flt_1, scale_beta_vec)); + vec_elements_flt_2 = svexp_f32_z(pg_2, svmul_f32_z(pg_2, vec_elements_flt_2, scale_beta_vec)); + vec_elements_flt_3 = svexp_f32_z(pg_3, svmul_f32_z(pg_3, vec_elements_flt_3, scale_beta_vec)); + vec_sum_0 = svadd_f32_m(pg_0, vec_sum_0, vec_elements_flt_0); + vec_sum_1 = svadd_f32_m(pg_1, vec_sum_1, 
vec_elements_flt_1); + vec_sum_2 = svadd_f32_m(pg_2, vec_sum_2, vec_elements_flt_2); + vec_sum_3 = svadd_f32_m(pg_3, vec_sum_3, vec_elements_flt_3); + } + + svst1_f32(pg_0, tmp_ptr + x, vec_elements_flt_0); + svst1_f32(pg_1, tmp_ptr + x + inc_1, vec_elements_flt_1); + svst1_f32(pg_2, tmp_ptr + x + inc_2, vec_elements_flt_2); + svst1_f32(pg_3, tmp_ptr + x + inc_3, vec_elements_flt_3); + + x += wrapper::svcnt<ScalarType>(); + pg = wrapper::svwhilelt<ScalarType>(x, input_width); + pg_0 = svunpklo(svunpklo(pg)); + pg_1 = svunpkhi(svunpklo(pg)); + pg_2 = svunpklo(svunpkhi(pg)); + pg_3 = svunpkhi(svunpkhi(pg)); + } while (svptest_any(all_true_pg, pg)); + + /* Reduce sum */ + const auto vec_sum = svadd_f32_z(all_true_pg, svadd_f32_z(all_true_pg, vec_sum_0, vec_sum_1), + svadd_f32_z(all_true_pg, vec_sum_2, vec_sum_3)); + sum = svaddv_f32(all_true_pg, vec_sum); + + /* Run remaining elements */ + x = 0; + if (is_log) + { + sum = std::log(sum); + } + else + { + sum = 256.f / sum; + } + } + + /* Normalize exponentials */ + { + constexpr bool is_qasymm8_signed = std::is_same<ScalarType, qasymm8_signed_t>::value; + /* Loop over row and compute softmax */ + int x = 0; + svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width); + svbool_t pg_0 = svunpklo(svunpklo(pg)); + svbool_t pg_1 = svunpkhi(svunpklo(pg)); + svbool_t pg_2 = svunpklo(svunpkhi(pg)); + svbool_t pg_3 = svunpkhi(svunpkhi(pg)); + do + { + auto vec_in_0 = svld1_f32(pg_0, tmp_ptr + x); + auto vec_in_1 = svld1_f32(pg_1, tmp_ptr + x + inc_1); + auto vec_in_2 = svld1_f32(pg_2, tmp_ptr + x + inc_2); + auto vec_in_3 = svld1_f32(pg_3, tmp_ptr + x + inc_3); + + svfloat32_t res_0{}; + svfloat32_t res_1{}; + svfloat32_t res_2{}; + svfloat32_t res_3{}; + + if (is_log) + { + res_0 = svsub_f32_z(pg_0, vec_in_0, svdup_n_f32(sum)); + res_1 = svsub_f32_z(pg_1, vec_in_1, svdup_n_f32(sum)); + res_2 = svsub_f32_z(pg_2, vec_in_2, svdup_n_f32(sum)); + res_3 = svsub_f32_z(pg_3, vec_in_3, svdup_n_f32(sum)); + } + else + { + res_0 = svmul_f32_z(pg_0, vec_in_0, svdup_n_f32(sum)); + res_1 = svmul_f32_z(pg_1, vec_in_1, svdup_n_f32(sum)); + res_2 = svmul_f32_z(pg_2, vec_in_2, svdup_n_f32(sum)); + res_3 = svmul_f32_z(pg_3, vec_in_3, svdup_n_f32(sum)); + + if (is_qasymm8_signed) + { + const auto offset_vec = svdup_n_f32(128.f); + res_0 = svsub_z(pg_0, res_0, offset_vec); + res_1 = svsub_z(pg_1, res_1, offset_vec); + res_2 = svsub_z(pg_2, res_2, offset_vec); + res_3 = svsub_z(pg_3, res_3, offset_vec); + } + } + + // Store value + const auto out = convert_float_to_int<SVEType>(res_0, res_1, res_2, res_3); + svst1(pg, out_ptr + x, out); + x += wrapper::svcnt<ScalarType>(); + pg = wrapper::svwhilelt<ScalarType>(x, input_width); + pg_0 = svunpklo(svunpklo(pg)); + pg_1 = svunpkhi(svunpklo(pg)); + pg_2 = svunpklo(svunpkhi(pg)); + pg_3 = svunpkhi(svunpkhi(pg)); + } while (svptest_any(all_true_pg, pg)); + } + }, + in_it, max_it, out_it); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sve2/impl.h b/src/cpu/kernels/softmax/generic/sve2/impl.h new file mode 100644 index 0000000000..33fcc26cda --- /dev/null +++ b/src/cpu/kernels/softmax/generic/sve2/impl.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2021-2022 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CORE_SVE2_KERNELS_SOFTMAX_IMPL_H +#define SRC_CORE_SVE2_KERNELS_SOFTMAX_IMPL_H + +#include "arm_compute/core/Helpers.h" + +namespace arm_compute +{ +namespace cpu +{ +template <typename ScalarType> +void sve2_softmax_logits_1d_quantized(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + float beta, + bool is_log, + const Window &window); +} // namespace cpu +} // namespace arm_compute +#endif /* SRC_CORE_SVE2_KERNELS_SOFTMAX_IMPL_H */ diff --git a/src/cpu/kernels/softmax/list.h b/src/cpu/kernels/softmax/list.h new file mode 100644 index 0000000000..7bbb265022 --- /dev/null +++ b/src/cpu/kernels/softmax/list.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2021-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ACL_SRC_CPU_KERNELS_SOFTMAX_LIST_H +#define ACL_SRC_CPU_KERNELS_SOFTMAX_LIST_H + +namespace arm_compute +{ +namespace cpu +{ +#define DECLARE_SOFTMAX_KERNEL(func_name) \ + template <bool IS_LOG> \ + void func_name(const ITensor *in, void *const tmp, ITensor *out, const float beta, int axis, const Window &window, \ + const float *lut_ptr) + +DECLARE_SOFTMAX_KERNEL(neon_fp32_softmax); +DECLARE_SOFTMAX_KERNEL(neon_fp16_softmax); +DECLARE_SOFTMAX_KERNEL(neon_qasymm8_softmax); +DECLARE_SOFTMAX_KERNEL(neon_qasymm8_signed_softmax); + +#ifdef ARM_COMPUTE_ENABLE_SME2 + +void sme2_fp32_softmax(const ITensor *in, + void *const tmp, + ITensor *out, + const float beta, + int axis, + const Window &window, + const float *lut_ptr); + +void sme2_fp16_softmax(const ITensor *in, + void *const tmp, + ITensor *out, + const float beta, + int axis, + const Window &window, + const float *lut_ptr); + +void sme2_qasymm8_softmax_lut_512VL(const ITensor *in, + void *const tmp, + ITensor *out, + const float beta, + int axis, + const Window &window, + const float *lut_ptr); + +void sme2_qasymm8_signed_softmax_lut_512VL(const ITensor *in, + void *const tmp, + ITensor *out, + const float beta, + int axis, + const Window &window, + const float *lut_ptr); + +#endif // ARM_COMPUTE_ENABLE_SME2 + +#undef DECLARE_SOFTMAX_KERNEL +} // namespace cpu +} // namespace arm_compute + +#endif // ACL_SRC_CPU_KERNELS_SOFTMAX_LIST_H diff --git a/src/cpu/kernels/sub/neon/fp16.cpp b/src/cpu/kernels/sub/neon/fp16.cpp new file mode 100644 index 0000000000..023068817b --- /dev/null +++ b/src/cpu/kernels/sub/neon/fp16.cpp @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" + +#include "src/cpu/kernels/sub/neon/impl.h" + +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + +namespace arm_compute +{ +namespace cpu +{ +void sub_same_neon_fp16( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +{ + sub_same_neon<float16_t>(src0, src1, dst, policy, window); +} +} // namespace cpu +} // namespace arm_compute + +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/sub/neon/impl.h b/src/cpu/kernels/sub/neon/impl.h new file mode 100644 index 0000000000..6123f7e25a --- /dev/null +++ b/src/cpu/kernels/sub/neon/impl.h @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2021-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef ACL_SRC_CPU_KERNELS_SUB_NEON_IMPL_H +#define ACL_SRC_CPU_KERNELS_SUB_NEON_IMPL_H + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/Traits.h" + +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" +#include "src/core/NEON/wrapper/scalar/sub.h" + +namespace arm_compute +{ +namespace cpu +{ +template <typename T> +void sub_same_neon( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +{ + /** SIMD vector tag type. 
*/ + using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>; + + bool is_sat = policy == ConvertPolicy::SATURATE; + + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + constexpr int window_step_x = 16 / sizeof(T); + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); + + Iterator input1(src0, window.broadcast_if_dimension_le_one(src0->info()->tensor_shape())); + Iterator input2(src1, window.broadcast_if_dimension_le_one(src1->info()->tensor_shape())); + Iterator output(dst, window); + + if (is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0; + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(dst, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast<const T *>(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast<T *>(output.ptr()); + + const T broadcast_value = *reinterpret_cast<const T *>(broadcast_input.ptr()); + const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{}); + + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x); + auto res = is_sat ? wrapper::vqsub(broadcast_value_vec, non_broadcast_v) + : wrapper::vsub(broadcast_value_vec, non_broadcast_v); + if (is_broadcast_input_2) + { + res = wrapper::vmul(res, wrapper::vdup_n(static_cast<T>(-1), ExactTagType{})); + } + wrapper::vstore(output_ptr + x, res); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto non_broadcast_v = *(non_broadcast_input_ptr + x); + auto res = + is_sat ? 
wrapper::sub_sat(broadcast_value, non_broadcast_v) : broadcast_value - non_broadcast_v; + if (is_broadcast_input_2) + { + res = static_cast<T>(-1) * res; + } + + *(output_ptr + x) = res; + } + }, + broadcast_input, non_broadcast_input, output); + } + else + { + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(src0, input1_win); + Iterator input2(src1, input2_win); + Iterator output(dst, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast<const T *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const T *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<T *>(output.ptr()); + + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto val1 = wrapper::vloadq(input1_ptr + x); + const auto val2 = wrapper::vloadq(input2_ptr + x); + const auto res = is_sat ? wrapper::vqsub(val1, val2) : wrapper::vsub(val1, val2); + wrapper::vstore(output_ptr + x, res); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto val1 = *(input1_ptr + x); + const auto val2 = *(input2_ptr + x); + *(output_ptr + x) = is_sat ? wrapper::sub_sat(val1, val2) : val1 - val2; + } + }, + input1, input2, output); + } +} +} // namespace cpu +} // namespace arm_compute + +#endif // ACL_SRC_CPU_KERNELS_SUB_NEON_IMPL_H diff --git a/src/cpu/kernels/sub/neon/list.h b/src/cpu/kernels/sub/neon/list.h new file mode 100644 index 0000000000..f29571f122 --- /dev/null +++ b/src/cpu/kernels/sub/neon/list.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2021-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ACL_SRC_CPU_KERNELS_SUB_NEON_LIST_H +#define ACL_SRC_CPU_KERNELS_SUB_NEON_LIST_H + +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/Traits.h" + +#include "src/core/NEON/wrapper/wrapper.h" + +namespace arm_compute +{ +namespace cpu +{ +#define DECLARE_SUB_KERNEL(func_name) \ + void func_name(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, \ + const Window &window) + +DECLARE_SUB_KERNEL(sub_qasymm8_neon_fixedpoint); +DECLARE_SUB_KERNEL(sub_qasymm8_signed_neon_fixedpoint); +DECLARE_SUB_KERNEL(sub_qasymm8_neon); +DECLARE_SUB_KERNEL(sub_qasymm8_signed_neon); +DECLARE_SUB_KERNEL(sub_qsymm16_neon); +DECLARE_SUB_KERNEL(sub_same_neon_fp16); + +#undef DECLARE_SUB_KERNEL +} // namespace cpu +} // namespace arm_compute +#endif // ACL_SRC_CPU_KERNELS_SUB_NEON_LIST_H diff --git a/src/cpu/kernels/sub/neon/qasymm8.cpp b/src/cpu/kernels/sub/neon/qasymm8.cpp new file mode 100644 index 0000000000..b750afce6e --- /dev/null +++ b/src/cpu/kernels/sub/neon/qasymm8.cpp @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2021-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" + +#include "src/cpu/kernels/add/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void sub_qasymm8_neon_fixedpoint( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +{ + add_sub_q8_neon_fixedpoint<uint8_t>(src0, src1, dst, policy, window, false /*is_addition*/); +} + +void sub_qasymm8_neon( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +{ + add_sub_qasymm8_neon(src0, src1, dst, policy, window, false /*is_addition*/); +} + +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/sub/neon/qasymm8_signed.cpp b/src/cpu/kernels/sub/neon/qasymm8_signed.cpp new file mode 100644 index 0000000000..fb0bb62682 --- /dev/null +++ b/src/cpu/kernels/sub/neon/qasymm8_signed.cpp @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2021-2022 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" + +#include "src/cpu/kernels/add/generic/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +void sub_qasymm8_signed_neon_fixedpoint( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +{ + add_sub_q8_neon_fixedpoint<int8_t>(src0, src1, dst, policy, window, false /*is_addition*/); +} + +void sub_qasymm8_signed_neon( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +{ + add_sub_qasymm8_signed_neon(src0, src1, dst, policy, window, false /*is_addition*/); +} + +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/sub/neon/qsymm16.cpp b/src/cpu/kernels/sub/neon/qsymm16.cpp new file mode 100644 index 0000000000..23e4b03843 --- /dev/null +++ b/src/cpu/kernels/sub/neon/qsymm16.cpp @@ -0,0 +1,197 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/Traits.h" + +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" + +namespace arm_compute +{ +namespace cpu +{ +void sub_qsymm16_neon( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +{ + ARM_COMPUTE_UNUSED(policy); + + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const int window_step_x = 8; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); + + const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform(); + const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform(); + const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); + + const float32x4_t vscale1 = vdupq_n_f32(iq1_info.scale); + const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale); + const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale); + + if (is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? 
src1 : src0; + const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform(); + const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(dst, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast<const int16_t *>(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr()); + + const int16_t broadcast_value = *reinterpret_cast<const int16_t *>(broadcast_input.ptr()); + const int16x8_t broadcast_value_vec = vdupq_n_s16(broadcast_value); + + const float32x4x2_t bf = {{ + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(broadcast_value_vec))), vscale2), + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(broadcast_value_vec))), vscale2), + }}; + const float bfs = static_cast<int32_t>(broadcast_value) * broadcast_qinfo.scale; + + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int16x8_t a = vld1q_s16(non_broadcast_input_ptr + x); + const float32x4x2_t af = {{ + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1), + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1), + }}; + + const int32x4x4_t rf = {{ +#ifdef __aarch64__ + vcvtnq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) + : vsubq_f32(af.val[0], bf.val[0]), + invvscaleo)), + vcvtnq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) + : vsubq_f32(af.val[1], bf.val[1]), + invvscaleo)), +#else //__aarch64__ + vcvtq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) + : vsubq_f32(af.val[0], bf.val[0]), + invvscaleo)), + vcvtq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) + : vsubq_f32(af.val[1], bf.val[1]), + invvscaleo)), +#endif //__aarch64__ + }}; + + const int16x8_t pa = vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])); + vst1q_s16(output_ptr + x, pa); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const float afs = static_cast<int32_t>(*(non_broadcast_input_ptr + x)) * non_broadcast_qinfo.scale; + *(output_ptr + x) = quantize_qsymm16(is_broadcast_input_2 ? 
(bfs - afs) : (afs - bfs), oq_info); + } + }, + broadcast_input, non_broadcast_input, output); + } + else + { + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(src0, input1_win); + Iterator input2(src1, input2_win); + Iterator output(dst, win); + + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr()); + + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int16x8_t a = vld1q_s16(input1_ptr + x); + const int16x8_t b = vld1q_s16(input2_ptr + x); + + const float32x4x2_t af = {{ + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1), + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1), + }}; + + const float32x4x2_t bf = {{ + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(b))), vscale2), + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(b))), vscale2), + }}; + + const int32x4x2_t rf = {{ +#ifdef __aarch64__ + vcvtnq_s32_f32(vmulq_f32(vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), + vcvtnq_s32_f32(vmulq_f32(vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), +#else //__aarch64__ + vcvtq_s32_f32(vmulq_f32(vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), + vcvtq_s32_f32(vmulq_f32(vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), +#endif //__aarch64__ + }}; + + const int16x8_t pa = vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])); + vst1q_s16(output_ptr + x, pa); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const float afs = static_cast<int32_t>((*(input1_ptr + x))) * iq1_info.scale; + const float bfs = static_cast<int32_t>((*(input2_ptr + x))) * iq2_info.scale; + *(output_ptr + x) = quantize_qsymm16((afs - bfs), dst->info()->quantization_info()); + } + }, + input1, input2, output); + } +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuActivation.cpp b/src/cpu/operators/CpuActivation.cpp new file mode 100644 index 0000000000..44d70cf503 --- /dev/null +++ b/src/cpu/operators/CpuActivation.cpp @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2021-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/operators/CpuActivation.h" + +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/IOperator.h" +#include "src/common/utils/LegacySupport.h" +#include "src/common/utils/Log.h" +#include "src/cpu/CpuContext.h" +#include "src/cpu/kernels/CpuActivationKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +void CpuActivation::configure(const ITensorInfo *input, ITensorInfo *output, const ActivationLayerInfo &activation_info) +{ + ARM_COMPUTE_LOG_PARAMS(input, output, activation_info); + auto k = std::make_unique<kernels::CpuActivationKernel>(); + k->configure(input, output, activation_info); + _kernel = std::move(k); +} + +Status +CpuActivation::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &activation_info) +{ + return kernels::CpuActivationKernel::validate(input, output, activation_info); +} + +void CpuActivation::run(ITensorPack &tensors) +{ + ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); + auto split_dimension = static_cast<kernels::CpuActivationKernel *>(_kernel.get())->get_split_dimension_hint(); + NEScheduler::get().schedule_op(_kernel.get(), split_dimension, _kernel->window(), tensors); +} + +std::tuple<IOperator *, StatusCode> CpuContext::create_activation(const AclTensorDescriptor &src, + const AclTensorDescriptor &dst, + const AclActivationDescriptor &act, + bool is_validate) +{ + TensorInfo src_info = detail::convert_to_legacy_tensor_info(src); + TensorInfo dst_info = detail::convert_to_legacy_tensor_info(dst); + auto info = detail::convert_to_activation_info(act); + + if (is_validate && + !bool(CpuActivation::validate(&src_info.set_is_resizable(false), &dst_info.set_is_resizable(false), info))) + { + return std::make_tuple(nullptr, StatusCode::UnsupportedConfig); + } + + auto act_op = std::make_unique<cpu::CpuActivation>(); + act_op->configure(&src_info, &dst_info, info); + + auto op = new arm_compute::IOperator(static_cast<IContext *>(this)); + if (op == nullptr) + { + ARM_COMPUTE_LOG_ERROR_ACL("Couldn't allocate internal resources"); + return std::make_tuple(nullptr, StatusCode::OutOfMemory); + } + op->set_internal_operator(std::move(act_op)); + + return std::make_tuple(op, StatusCode::Success); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuActivation.h b/src/cpu/operators/CpuActivation.h new file mode 100644 index 0000000000..ec442f92c8 --- /dev/null +++ b/src/cpu/operators/CpuActivation.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2021-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_ACTIVATION_H +#define ARM_COMPUTE_CPU_ACTIVATION_H + +#include "arm_compute/function_info/ActivationLayerInfo.h" + +#include "src/cpu/ICpuOperator.h" + +namespace arm_compute +{ +namespace cpu +{ +/** Basic function to run @ref kernels::CpuActivationKernel */ +class CpuActivation : public ICpuOperator +{ +public: + /** Configure operator for a given list of arguments + * + * @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32. + * @param[out] output Destination tensor info. Data type supported: same as @p src + * @param[in] activation_info Activation layer parameters. + */ + void configure(const ITensorInfo *input, ITensorInfo *output, const ActivationLayerInfo &activation_info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuActivation::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; +}; +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_ACTIVATION_H */ diff --git a/src/cpu/operators/CpuAdd.cpp b/src/cpu/operators/CpuAdd.cpp new file mode 100644 index 0000000000..53cd7fa1b7 --- /dev/null +++ b/src/cpu/operators/CpuAdd.cpp @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2021-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/cpu/operators/CpuAdd.h" + +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/cpu/kernels/CpuAddKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +void CpuAdd::configure(const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_UNUSED(act_info); + ARM_COMPUTE_LOG_PARAMS(src0, src1, dst, policy, act_info); + auto k = std::make_unique<kernels::CpuAddKernel>(); + k->configure(src0, src1, dst, policy); + _kernel = std::move(k); +} + +Status CpuAdd::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); + return kernels::CpuAddKernel::validate(src0, src1, dst, policy); +} + +void CpuAdd::run(ITensorPack &tensors) +{ + const auto split_dimension = static_cast<kernels::CpuAddKernel *>(_kernel.get())->get_split_dimension(); + + NEScheduler::get().schedule_op(_kernel.get(), split_dimension, _kernel->window(), tensors); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuAdd.h b/src/cpu/operators/CpuAdd.h new file mode 100644 index 0000000000..5f60102de2 --- /dev/null +++ b/src/cpu/operators/CpuAdd.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2021-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_ADD_H +#define ARM_COMPUTE_CPU_ADD_H + +#include "arm_compute/function_info/ActivationLayerInfo.h" + +#include "src/cpu/ICpuOperator.h" + +namespace arm_compute +{ +namespace cpu +{ +/** Basic function to run @ref kernels::CpuAddKernel */ +class CpuAdd : public ICpuOperator +{ +public: + /** Initialise the kernel's input, dst and border mode. + * + * Valid configurations (src0,src1) -> dst : + * + * - (U8,U8) -> U8 + * - (S16,S16) -> S16 + * - (S32,S32) -> S32 + * - (F16,F16) -> F16 + * - (F32,F32) -> F32 + * - (QASYMM8,QASYMM8) -> QASYMM8 + * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED + * - (QSYMM16,QSYMM16) -> QSYMM16 + * + * @param[in] src0 First input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32 + * @param[in] src1 Second input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32 + * @param[out] dst The dst tensor info. 
Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32. + * @param[in] policy Overflow policy. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. + * + */ + void configure(const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuAdd::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; +}; +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_ADD_H */ diff --git a/src/cpu/operators/CpuAddMulAdd.cpp b/src/cpu/operators/CpuAddMulAdd.cpp new file mode 100644 index 0000000000..2f19f2f842 --- /dev/null +++ b/src/cpu/operators/CpuAddMulAdd.cpp @@ -0,0 +1,150 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/cpu/operators/CpuAddMulAdd.h" + +#include "arm_compute/core/experimental/Types.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/core/helpers/MemoryHelpers.h" +#include "src/cpu/kernels/CpuAddMulAddKernel.h" +#include "src/cpu/utils/CpuAuxTensorHandler.h" + +namespace arm_compute +{ +namespace cpu +{ +void CpuAddMulAdd::configure(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *bn_mul, + const ITensorInfo *bn_add, + ITensorInfo *add_output, + ITensorInfo *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_LOG_PARAMS(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info); + + auto k = std::make_unique<kernels::CpuAddMulAddKernel>(); + + const DataType data_type = input1->data_type(); + if (is_data_type_quantized(data_type)) + { + _dequantize_bn_mul.configure(bn_mul, &_dequantized_bn_mul); + _dequantize_bn_add.configure(bn_add, &_dequantized_bn_add); + + k->configure(input1, input2, &_dequantized_bn_mul, &_dequantized_bn_add, add_output, final_output, policy, + act_info); + + // Save auxilary memory requirements after configuration + _aux_mem[DequantizedBnMul] = + experimental::MemoryInfo(offset_int_vec(DequantizedBnMul), experimental::MemoryLifetime::Temporary, + _dequantized_bn_mul.total_size()); + _aux_mem[DequantizedBnAdd] = + experimental::MemoryInfo(offset_int_vec(DequantizedBnAdd), experimental::MemoryLifetime::Temporary, + _dequantized_bn_add.total_size()); + } + else + { + k->configure(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info); + } + + _kernel = std::move(k); +} + +Status CpuAddMulAdd::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *bn_mul, + const ITensorInfo *bn_add, + const ITensorInfo *add_output, + const ITensorInfo *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) +{ + const DataType data_type = input1->data_type(); + if (is_data_type_quantized(data_type)) + { + TensorInfo dequantized_bn_mul = bn_mul->clone()->set_data_type(DataType::F32); + TensorInfo dequantized_bn_add = bn_add->clone()->set_data_type(DataType::F32); + + ARM_COMPUTE_RETURN_ON_ERROR(CpuDequantize::validate(bn_mul, &dequantized_bn_mul)); + ARM_COMPUTE_RETURN_ON_ERROR(CpuDequantize::validate(bn_add, &dequantized_bn_add)); + + return kernels::CpuAddMulAddKernel::validate(input1, input2, &dequantized_bn_mul, &dequantized_bn_add, + add_output, final_output, policy, act_info); + } + else + { + return kernels::CpuAddMulAddKernel::validate(input1, input2, bn_mul, bn_add, add_output, final_output, policy, + act_info); + } +} + +void CpuAddMulAdd::run(ITensorPack &tensors) +{ + const DataType data_type = tensors.get_const_tensor(TensorType::ACL_SRC_0)->info()->data_type(); + + if (is_data_type_quantized(data_type)) + { + const ITensor *bn_mul = tensors.get_const_tensor(TensorType::ACL_SRC_2); + const ITensor *bn_add = tensors.get_const_tensor(TensorType::ACL_SRC_3); + + CpuAuxTensorHandler dequantized_bn_mul_handler(offset_int_vec(DequantizedBnMul), _dequantized_bn_mul, tensors, + true); + CpuAuxTensorHandler dequantized_bn_add_handler(offset_int_vec(DequantizedBnAdd), _dequantized_bn_add, tensors, + true); + + ITensorPack dequantize_mul_pack = {{TensorType::ACL_SRC_0, bn_mul}, + {TensorType::ACL_DST_0, dequantized_bn_mul_handler.get()}}; + + ITensorPack dequantize_add_pack = {{TensorType::ACL_SRC_0, bn_add}, + {TensorType::ACL_DST_0, 
dequantized_bn_add_handler.get()}}; + + _dequantize_bn_mul.run(dequantize_mul_pack); + _dequantize_bn_add.run(dequantize_add_pack); + + ITensorPack add_mul_add_pack = { + {TensorType::ACL_SRC_0, tensors.get_const_tensor(TensorType::ACL_SRC_0)}, + {TensorType::ACL_SRC_1, tensors.get_const_tensor(TensorType::ACL_SRC_1)}, + {TensorType::ACL_SRC_2, dequantized_bn_mul_handler.get()}, + {TensorType::ACL_SRC_3, dequantized_bn_add_handler.get()}, + {TensorType::ACL_DST_0, tensors.get_tensor(TensorType::ACL_DST_0)}, + {TensorType::ACL_DST_1, tensors.get_tensor(TensorType::ACL_DST_1)}, + }; + + NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), add_mul_add_pack); + } + else + { + NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors); + } +} + +experimental::MemoryRequirements CpuAddMulAdd::workspace() const +{ + return _aux_mem; +} + +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuAddMulAdd.h b/src/cpu/operators/CpuAddMulAdd.h new file mode 100644 index 0000000000..47db75c37e --- /dev/null +++ b/src/cpu/operators/CpuAddMulAdd.h @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CPU_OPERATORS_CPUADDMULADD +#define SRC_CPU_OPERATORS_CPUADDMULADD + +#include "arm_compute/core/TensorInfo.h" + +#include "src/cpu/ICpuOperator.h" +#include "src/cpu/operators/CpuDequantize.h" + +namespace arm_compute +{ +namespace cpu +{ +/** Basic function to run @ref kernels::CpuAddMulAddKernel */ +class CpuAddMulAdd : public ICpuOperator +{ +public: + /** Initialize the operator's inputs and outputs. 
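+ *
+ * The operator fuses the element-wise addition of @p input1 and @p input2 with a per-channel
+ * multiply-add against the 1D batch-normalization coefficients: @p add_output holds the
+ * intermediate sum, while @p final_output holds that sum scaled by @p bn_mul and shifted by @p bn_add.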
+ * + * Similar to @ref NEAddMulAdd::configure() + * + */ + void configure(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *bn_mul, + const ITensorInfo *bn_add, + ITensorInfo *add_output, + ITensorInfo *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuAddMulAdd::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *bn_mul, + const ITensorInfo *bn_add, + const ITensorInfo *add_output, + const ITensorInfo *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; + + // We need auxilary memory to dequantize batchnorm coefficients + experimental::MemoryRequirements workspace() const override; + +private: + enum AuxTensorIdx + { + DequantizedBnMul = 0, + DequantizedBnAdd, + Count + }; + + CpuDequantize _dequantize_bn_mul{}; + CpuDequantize _dequantize_bn_add{}; + + TensorInfo _dequantized_bn_mul{}; + TensorInfo _dequantized_bn_add{}; + + experimental::MemoryRequirements _aux_mem{Count}; +}; +} // namespace cpu +} // namespace arm_compute +#endif /* SRC_CPU_OPERATORS_CPUADDMULADD */ diff --git a/src/cpu/operators/CpuCast.cpp b/src/cpu/operators/CpuCast.cpp new file mode 100644 index 0000000000..55b9204d71 --- /dev/null +++ b/src/cpu/operators/CpuCast.cpp @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/cpu/operators/CpuCast.h" + +#include "src/common/utils/Log.h" +#include "src/cpu/kernels/CpuCastKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +void CpuCast::configure(const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy) +{ + ARM_COMPUTE_LOG_PARAMS(src, dst, policy); + auto k = std::make_unique<kernels::CpuCastKernel>(); + k->configure(src, dst, policy); + _kernel = std::move(k); +} + +Status CpuCast::validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy) +{ + return kernels::CpuCastKernel::validate(src, dst, policy); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuCast.h b/src/cpu/operators/CpuCast.h new file mode 100644 index 0000000000..1f4da6e2a0 --- /dev/null +++ b/src/cpu/operators/CpuCast.h @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2021, 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_CPU_OPERATORS_CPUCAST_H +#define ACL_SRC_CPU_OPERATORS_CPUCAST_H + +#include "src/cpu/ICpuOperator.h" + +namespace arm_compute +{ +namespace cpu +{ +/** Basic function to run @ref kernels::CpuCastKernel */ +class CpuCast : public ICpuOperator +{ +public: + /** Configure operator for a given list of arguments + * + * Input data type must be different than output data type. + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:-----------------------------------------------| + * |QASYMM8_SIGNED | S16, S32, F32, F16 | + * |QASYMM8 | U16, S16, S32, F32, F16 | + * |U8 | U16, S16, S32, F32, F16 | + * |U16 | U8, U32 | + * |S16 | QASYMM8_SIGNED, U8, S32 | + * |F16 | QASYMM8_SIGNED, QASYMM8, F32, S32, U8 | + * |S32 | QASYMM8_SIGNED, QASYMM8, F16, F32, U8 | + * |F32 | QASYMM8_SIGNED, QASYMM8, F16, S32, U8| + * |S64 | F32 | + * + * @param[in] src The source tensor to convert. Data types supported: U8/S8/U16/S16/U32/S32/S64/F16/F32. + * @param[out] dst The destination tensor. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32. + * @param[in] policy Conversion policy. 
+ * + * + */ + void configure(const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuCast::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy); +}; +} // namespace cpu +} // namespace arm_compute +#endif // ACL_SRC_CPU_OPERATORS_CPUCAST_H diff --git a/src/cpu/operators/CpuConcatenate.cpp b/src/cpu/operators/CpuConcatenate.cpp new file mode 100644 index 0000000000..5f517a8fcb --- /dev/null +++ b/src/cpu/operators/CpuConcatenate.cpp @@ -0,0 +1,170 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/cpu/operators/CpuConcatenate.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/cpu/kernels/CpuConcatenateBatchKernel.h" +#include "src/cpu/kernels/CpuConcatenateDepthKernel.h" +#include "src/cpu/kernels/CpuConcatenateHeightKernel.h" +#include "src/cpu/kernels/CpuConcatenateWidthKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +void CpuConcatenate::configure(const std::vector<const ITensorInfo *> &srcs_vector, ITensorInfo *dst, size_t axis) +{ + ARM_COMPUTE_ERROR_ON(dst == nullptr); + ARM_COMPUTE_LOG_PARAMS(srcs_vector, dst, axis); + + _axis = axis; + _num_srcs = srcs_vector.size(); + + TensorShape dst_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(srcs_vector, axis); + + // Output auto inizialitation if not yet initialized + auto_init_if_empty(*dst, dst_shape, 1, srcs_vector[0]->data_type()); + ARM_COMPUTE_ERROR_THROW_ON(CpuConcatenate::validate(srcs_vector, dst, axis)); + + unsigned int offset = 0; + + for (unsigned int i = 0; i < _num_srcs; ++i) + { + switch (axis) + { + case Window::DimX: + { + auto kernel = std::make_unique<kernels::CpuConcatenateWidthKernel>(); + kernel->configure(srcs_vector.at(i), offset, dst); + _concat_kernels.emplace_back(std::move(kernel)); + break; + } + case Window::DimY: + { + auto kernel = std::make_unique<kernels::CpuConcatenateHeightKernel>(); + kernel->configure(srcs_vector.at(i), offset, dst); + _concat_kernels.emplace_back(std::move(kernel)); + break; + } + case Window::DimZ: + { + auto kernel = std::make_unique<kernels::CpuConcatenateDepthKernel>(); + kernel->configure(srcs_vector.at(i), offset, dst); + _concat_kernels.emplace_back(std::move(kernel)); + break; + } + case 3: + { + auto kernel = std::make_unique<kernels::CpuConcatenateBatchKernel>(); + kernel->configure(srcs_vector.at(i), offset, dst); + _concat_kernels.emplace_back(std::move(kernel)); + break; + } + default: + ARM_COMPUTE_ERROR("Axis not supported"); + } + offset += srcs_vector.at(i)->dimension(axis); + } +} + +Status +CpuConcatenate::validate(const std::vector<const ITensorInfo *> &srcs_vector, const ITensorInfo *dst, size_t axis) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst); + ARM_COMPUTE_RETURN_ERROR_ON(srcs_vector.size() < 2); + + unsigned int offset = 0; + for (const auto &src : srcs_vector) + { + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); + switch (axis) + { + case Window::DimX: + { + ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuConcatenateWidthKernel::validate(src, offset, dst)); + break; + } + case Window::DimY: + { + ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuConcatenateHeightKernel::validate(src, offset, dst)); + break; + } + case Window::DimZ: + { + ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuConcatenateDepthKernel::validate(src, offset, dst)); + break; + } + case 3: + { + ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuConcatenateBatchKernel::validate(src, offset, dst)); + break; + } + default: + ARM_COMPUTE_ERROR("Axis not supported"); + } + offset += src->dimension(axis); + } + + if (dst->total_size() != 0) + { + TensorShape dst_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(srcs_vector, axis); + ARM_COMPUTE_RETURN_ERROR_ON(dst_shape.total_size() != 
dst->tensor_shape().total_size()); + } + + return Status{}; +} + +void CpuConcatenate::run(ITensorPack &tensors) +{ + if (tensors.empty()) + { + ARM_COMPUTE_ERROR("No inputs provided"); + } + + if (static_cast<int>(tensors.size() - 1) != static_cast<int>(_num_srcs)) + { + ARM_COMPUTE_ERROR("Configured with different number of inputs"); + } + + int i = 0; + for (auto &k : _concat_kernels) + { + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, tensors.get_const_tensor(ACL_SRC_VEC + i)); + pack.add_tensor(TensorType::ACL_DST, tensors.get_tensor(ACL_DST)); + NEScheduler::get().schedule_op(k.get(), Window::DimY, k->window(), pack); + ++i; + } +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuConcatenate.h b/src/cpu/operators/CpuConcatenate.h new file mode 100644 index 0000000000..c36977c70f --- /dev/null +++ b/src/cpu/operators/CpuConcatenate.h @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2021-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_CONCATENATE_H +#define ARM_COMPUTE_CPU_CONCATENATE_H + +#include "src/cpu/ICpuKernel.h" +#include "src/cpu/ICpuOperator.h" + +#include <vector> + +namespace arm_compute +{ +namespace cpu +{ +/** Basic function to execute concatenate tensors along a given axis. This function calls the following kernels: + * + * -# @ref kernels::CpuConcatenateWidthKernel (if underlying concatenation axis is 0). + * -# @ref kernels::CpuConcatenateHeightKernel (if underlying concatenation axis is 1). + * -# @ref kernels::CpuConcatenateDepthKernel (if underlying concatenation axis is 2). + * -# @ref kernels::CpuConcatenateBatchKernel (if underlying concatenation axis is 3). + */ +class CpuConcatenate : public ICpuOperator +{ +public: + CpuConcatenate() = default; + /** Configure operator for a given list of arguments + * + * @note Input and output tensor dimensions preconditions defer depending on the concatenation axis. + * @note Preconditions can be found respectively at @ref kernels::CpuConcatenateWidthKernel, @ref kernels::CpuConcatenateHeightKernel, + * @ref kernels::CpuConcatenateDepthKernel and @ref kernels::CpuConcatenateBatchKernel. + * + * @param[in,out] srcs_vector The vectors containing all the tensors to concatenate. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[out] dst Output tensor. Data types supported: Same as @p srcs_vector. + * @param[in] axis Concatenation axis. 
Supported underlying concatenation axis are 0, 1, 2 and 3. + */ + void configure(const std::vector<const ITensorInfo *> &srcs_vector, ITensorInfo *dst, size_t axis); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuConcatenate::configure() + * + * @return a status + */ + static Status validate(const std::vector<const ITensorInfo *> &srcs_vector, const ITensorInfo *dst, size_t axis); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; + +private: + std::vector<std::unique_ptr<ICPPKernel>> _concat_kernels{}; + unsigned int _num_srcs{0}; + unsigned int _axis{0}; +}; +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_CONCATENATE_H */ diff --git a/src/cpu/operators/CpuConv2d.cpp b/src/cpu/operators/CpuConv2d.cpp new file mode 100644 index 0000000000..26ca2ee783 --- /dev/null +++ b/src/cpu/operators/CpuConv2d.cpp @@ -0,0 +1,309 @@ +/* + * Copyright (c) 2017-2021, 2023-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/cpu/operators/CpuConv2d.h" + +#include "arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/cpu/operators/CpuDirectConv2d.h" +#include "src/cpu/operators/CpuGemm.h" +#include "src/cpu/operators/CpuGemmConv2d.h" +#include "src/cpu/operators/CpuGemmDirectConv2d.h" +#include "src/cpu/operators/CpuWinogradConv2d.h" + +namespace arm_compute +{ +namespace cpu +{ +CpuConv2d::CpuConv2d() : _function() +{ +} + +CpuConv2d::~CpuConv2d() = default; + +void CpuConv2d::configure(ITensorInfo *input, + ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) +{ + // Perform validate step + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_UNUSED(num_groups); + ARM_COMPUTE_ERROR_THROW_ON(CpuConv2d::validate(input, weights, biases, output, conv_info, weights_info, dilation, + act_info, enable_fast_math, num_groups)); + + ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info, + enable_fast_math, num_groups); + + const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups); + switch (CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, + enable_fast_math)) + { + case ConvolutionMethod::WINOGRAD: + { + auto f = std::make_unique<CpuWinogradConv2d>(); + f->configure(input, weights, biases, output, conv_info, act_info, enable_fast_math); + _function = std::move(f); + break; + } + case ConvolutionMethod::GEMM: + { + auto f = std::make_unique<CpuGemmConv2d>(); + f->configure(input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math); + _function = std::move(f); + break; + } + case ConvolutionMethod::GEMM_CONV2D: + { + auto f = std::make_unique<CpuGemmDirectConv2d>(); + f->configure(input, weights, biases, output, info); + _function = std::move(f); + break; + } + case ConvolutionMethod::DIRECT: + { + auto f = std::make_unique<CpuDirectConv2d>(); + f->configure(input, weights, biases, output, conv_info, act_info); + _function = std::move(f); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported."); + break; + } + + _aux_mem = _function->workspace(); +} + +Status CpuConv2d::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) +{ + ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1), "Grouping (num_groups != 1) is not supported on Neon"); + + const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups); + switch (CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, + enable_fast_math)) + { + case ConvolutionMethod::WINOGRAD: + ARM_COMPUTE_RETURN_ON_ERROR( + CpuWinogradConv2d::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math)); + break; + case ConvolutionMethod::GEMM: + ARM_COMPUTE_RETURN_ON_ERROR(CpuGemmConv2d::validate(input, weights, biases, output, conv_info, weights_info, + dilation, act_info, enable_fast_math)); + break; + case ConvolutionMethod::GEMM_CONV2D: + 
ARM_COMPUTE_RETURN_ON_ERROR(CpuGemmDirectConv2d::validate(input, weights, biases, output, info)); + break; + case ConvolutionMethod::DIRECT: + ARM_COMPUTE_RETURN_ON_ERROR(CpuDirectConv2d::validate(input, weights, biases, output, conv_info, act_info)); + break; + default: + ARM_COMPUTE_ERROR("Not supported."); + break; + } + + return Status{}; +} + +ConvolutionMethod CpuConv2d::get_convolution_method(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, weights); + ARM_COMPUTE_UNUSED(weights_info); + + const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH); + const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); + const size_t idx_c = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL); + + const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, 1); + + /* Input spatial dims, kernel size, IFM/OFM, conv info*/ + using ConvolutionConfiguration = std::tuple<Size2D, Size2D, Size2D, PadStrideInfo>; + using ConfigurationMethod = std::pair<ConvolutionConfiguration, ConvolutionMethod>; + + const std::vector<ConfigurationMethod> known_configs = { + // Alexnet + ConfigurationMethod(ConvolutionConfiguration(Size2D(27U, 27U), Size2D(5U, 5U), Size2D(48U, 128U), + PadStrideInfo(1U, 1U, 2U, 2U)), + ConvolutionMethod::GEMM), + // VGG16 / VGG19 + ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 64U), + PadStrideInfo(1U, 1U, 1U, 1U)), + ConvolutionMethod::GEMM), + // Mobilenet 224 + ConfigurationMethod( + ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), + PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR)), + ConvolutionMethod::GEMM), + // Mobilenet 160 + ConfigurationMethod( + ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), + PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR)), + ConvolutionMethod::GEMM)}; + + const auto find_config = [&](ConfigurationMethod c) + { + const ConvolutionConfiguration config = c.first; + const PadStrideInfo info = std::get<3>(config); + + return std::get<0>(config) == Size2D(input->dimension(idx_w), input->dimension(idx_h)) && + std::get<1>(config) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h)) && + std::get<2>(config) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && + info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right() && + info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && + info.stride() == conv_info.stride(); + }; + + std::vector<ConfigurationMethod>::const_iterator found; + if ((found = std::find_if(known_configs.begin(), known_configs.end(), find_config)) != known_configs.end()) + { + return (*found).second; + } + + if (dilation != Size2D(1U, 1U)) + { + return ConvolutionMethod::GEMM; + } + else + { + const bool gemmDirectConv2d_validates = + bool(CpuGemmDirectConv2d::validate(input, weights, nullptr, output, info)); + + // SRGAN + // Output might not be initialized when it is an internal tensor of the layer using the convolution + if (input->total_size() > 1e7 && weights->dimension(idx_h) > 7) + { + // This configuration is memory demanding for GEMM method. 
GEMM_CONV2D which uses indirect convolution + // kernels underneath is the best option. + if (gemmDirectConv2d_validates) + { + return ConvolutionMethod::GEMM_CONV2D; + } + else if (bool(CpuDirectConv2d::validate(input, weights, nullptr, output, conv_info, act_info))) + { + // NCHW data layout is not supported by GEMM_CONV2D + return ConvolutionMethod::DIRECT; + } + } + if (input->dimension(idx_c) < 16) + { + return ConvolutionMethod::GEMM; + } + +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + // This heuristics only applies to F16 data type on A55r1 + if (NEScheduler::get().cpu_info().get_cpu_model() == CPUModel::A55r1 && enable_fast_math && + input->data_type() == DataType::F16) + { + // Exclude known bad winograd configs (and defaults to GEMM) + const std::vector<ConvolutionConfiguration> known_bad_winograd_f16_with_fastmath_configs = { + // Squeezenet_V1_1 fire2 and fire3 + ConvolutionConfiguration(Size2D(56U, 56U), Size2D(3U, 3U), Size2D(16U, 64U), + PadStrideInfo(1U, 1U, 1U, 1U)), + // Squeezenet_V1_1 fire6 and fire7 + ConvolutionConfiguration(Size2D(14U, 14U), Size2D(3U, 3U), Size2D(48U, 192U), + PadStrideInfo(1U, 1U, 1U, 1U)), + // Squeezenet_V1_1 fire8 and fire9 + ConvolutionConfiguration(Size2D(14U, 14U), Size2D(3U, 3U), Size2D(64U, 256U), + PadStrideInfo(1U, 1U, 1U, 1U)), + }; + const auto find_conv_config = [&](ConvolutionConfiguration c) + { + const PadStrideInfo info = std::get<3>(c); + + return std::get<0>(c) == Size2D(input->dimension(idx_w), input->dimension(idx_h)) && + std::get<1>(c) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h)) && + std::get<2>(c) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && + info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right() && + info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && + info.stride() == conv_info.stride(); + }; + + bool found_bad = std::find_if(known_bad_winograd_f16_with_fastmath_configs.begin(), + known_bad_winograd_f16_with_fastmath_configs.end(), + find_conv_config) != known_bad_winograd_f16_with_fastmath_configs.end(); + if (found_bad) + { + return ConvolutionMethod::GEMM; + } + } +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + + // For 1x1 convolutions run the default GEMM + if (weights->dimension(idx_w) == 1 && weights->dimension(idx_h) == 1) + { + return ConvolutionMethod::GEMM; + } + + if (bool(CpuWinogradConv2d::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math))) + { + return ConvolutionMethod::WINOGRAD; + } + if (gemmDirectConv2d_validates) + { + return ConvolutionMethod::GEMM_CONV2D; + } + return ConvolutionMethod::GEMM; + } +} + +void CpuConv2d::run(ITensorPack &tensors) +{ + prepare(tensors); + _function->run(tensors); +} + +void CpuConv2d::prepare(ITensorPack &tensors) +{ + _function->prepare(tensors); +} + +experimental::MemoryRequirements CpuConv2d::workspace() const +{ + return _aux_mem; +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuConv2d.h b/src/cpu/operators/CpuConv2d.h new file mode 100644 index 0000000000..71b9e15dc1 --- /dev/null +++ b/src/cpu/operators/CpuConv2d.h @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2017-2021, 2023 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/function_info/ActivationLayerInfo.h" + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuOperator.h" + +namespace arm_compute +{ +namespace cpu +{ +/** Basic function to simulate a convolution layer. This function calls one of the following functions: + * -# @ref CpuGemm (executed only in case GEMM is required for the operation) + * -# @ref CpuWinogradConv2d (executed only in case Winograd is required for the operation) + * -# @ref CpuDirectConv2d (executed only in case Direct Convolution is required for the operation) + * + * + * The function selects one of the algorithms mentioned above based on: + * - The size of the kernel + * - Number of input/output feature maps + * - Amount of memory needed + * + * Generally GEMM-based convolution is executed when neither Winograd nor FFT nor Direct convolution can be performed. + * + * FP32 Algorithm| Filter Size | Input/Output feature maps | + * --------------|----------------------------------------------------|-------------------------------------------| + * Winograd | 3x3 1x3 3x1 5x1 1x5 5x5(fast maths) 7x1 1x7 | Input channels is greater than 3 | + * FFT | Squared kernels and greater than 9x9 | Input feature maps > Output feature maps | + * DirectConv | 9x9 | | + * GEMM | Any size | | + * + * Winograd 5x5 requires fast maths enabled. + * + * FP16 Algorithm| Filter Size | + * --------------|------------------| + * Winograd | Not supported | + * FFT | Not supported | + * DirectConv | 9x9 | + * GEMM | Any size | + * + * + */ +class CpuConv2d : public ICpuOperator +{ +public: + /** Constructor */ + CpuConv2d(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuConv2d); + /** Default destructor */ + ~CpuConv2d(); + /** Set the input and output tensors. + * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:--------------|:------------------|:------|:--------------| + * |F16 |F16 |F16 |F16 | + * |F32 |F32 |F32 |F32 | + * |QASYMM8 |QASYMM8 |S32 |QASYMM8 | + * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED | + * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED | + * + * @param[in] src Source tensor info. 3 lower dimensions represent a single input [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of inputs. 
+ * Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. + * Data type supported: Same as @p src, also could be QSYMM8_PER_CHANNEL if input is QASYMM8/QASYMM8_SIGNED. + * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. + * Data type supported: Same as @p src, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type. + * @param[out] dst Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. + * Data types supported: Same as @p src. + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. + * @param[in] weights_info Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel. If this is not part of the fully connected layer the weights + * tensor has also been transposed with cpu::kernels::CpuGemmTranspose1xWKernel. Data type supported: Same as @p input. + * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported. + * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation + * available which may introduce a drop of accuracy as well. Default is false + * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported + */ + void configure(ITensorInfo *src, + ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info = WeightsInfo(), + const Size2D &dilation = Size2D(1U, 1U), + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false, + unsigned int num_groups = 1); + /** Static function to check if given info will lead to a valid configuration of @ref CpuConv2d + * + * Similar to CpuConv2d::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info = WeightsInfo(), + const Size2D &dilation = Size2D(1U, 1U), + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false, + unsigned int num_groups = 1); + /** Static function to check if given info will return the convolution called by @ref CpuConv2d + * + * @param[in] src Source tensor info. 3 lower dimensions represent a single input [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of inputs. + * Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. + * Data type supported:Same as @p src, also could be QSYMM8_PER_CHANNEL if input is QASYMM8/QASYMM8_SIGNED. + * @param[in] dst Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. + * Data types supported: Same as @p src. + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. 
+ * @param[in] weights_info Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel. If this is not part of the fully connected layer the weights + * tensor has also been transposed with cpu::kernels::CpuGemmTranspose1xWKernel. Data type supported: Same as @p input. + * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation + * available which may introduce a drop of accuracy as well. Default is false + * + * @return the Convolution Method Hint + */ + static ConvolutionMethod get_convolution_method(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info = WeightsInfo(), + const Size2D &dilation = Size2D(1U, 1U), + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false); + // Inherited methods overridden: + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &constants) override; + experimental::MemoryRequirements workspace() const override; + +private: + std::unique_ptr<ICpuOperator> _function; + experimental::MemoryRequirements _aux_mem{}; +}; +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuConvertFullyConnectedWeights.cpp b/src/cpu/operators/CpuConvertFullyConnectedWeights.cpp new file mode 100644 index 0000000000..49e31926e3 --- /dev/null +++ b/src/cpu/operators/CpuConvertFullyConnectedWeights.cpp @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/cpu/operators/CpuConvertFullyConnectedWeights.h" + +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +void CpuConvertFullyConnectedWeights::configure(const ITensorInfo *src, + ITensorInfo *dst, + const TensorShape &original_src_shape, + DataLayout data_layout) +{ + ARM_COMPUTE_LOG_PARAMS(src, dst, original_src_shape, data_layout); + auto k = std::make_unique<kernels::CpuConvertFullyConnectedWeightsKernel>(); + k->configure(src, dst, original_src_shape, data_layout); + _kernel = std::move(k); +} + +Status CpuConvertFullyConnectedWeights::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const TensorShape &original_src_shape, + DataLayout data_layout) +{ + return kernels::CpuConvertFullyConnectedWeightsKernel::validate(src, dst, original_src_shape, data_layout); +} + +void CpuConvertFullyConnectedWeights::run(ITensorPack &tensors) +{ + NEScheduler::get().schedule_op(_kernel.get(), Window::DimZ, _kernel->window(), tensors); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuConvertFullyConnectedWeights.h b/src/cpu/operators/CpuConvertFullyConnectedWeights.h new file mode 100644 index 0000000000..e208cca3a0 --- /dev/null +++ b/src/cpu/operators/CpuConvertFullyConnectedWeights.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_CONVERT_FULLYCONNECTED_WEIGHTS_H +#define ARM_COMPUTE_CPU_CONVERT_FULLYCONNECTED_WEIGHTS_H + +#include "src/cpu/ICpuOperator.h" + +namespace arm_compute +{ +namespace cpu +{ +/** Basic function to run @ref kernels::CpuConvertFullyConnectedWeightsKernel */ +class CpuConvertFullyConnectedWeights : public ICpuOperator +{ +public: + /** Configure operator for a given list of arguments + * + * @param[in] src Source tensor to permute. Data types supported: All + * @param[out] dst Destintation tensor. Data types supported: Same as @p src + * @param[in] original_src_shape Shape of the original src tensor (the one entering fully connected layer). + * @param[in] data_layout The data layout the weights have been trained in. 
+ */ + void + configure(const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuConvertFullyConnectedWeights::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, + const ITensorInfo *dst, + const TensorShape &original_src_shape, + DataLayout data_layout); + // Inherited methods overridden: + void run(ITensorPack &tensors) override; +}; +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_CONVERT_FULLYCONNECTED_WEIGHTS_H */ diff --git a/src/cpu/operators/CpuCopy.cpp b/src/cpu/operators/CpuCopy.cpp new file mode 100644 index 0000000000..92c19d4df2 --- /dev/null +++ b/src/cpu/operators/CpuCopy.cpp @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/operators/CpuCopy.h" + +#include "src/common/utils/Log.h" +#include "src/cpu/kernels/CpuCopyKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +void CpuCopy::configure(const ITensorInfo *src, ITensorInfo *dst) +{ + ARM_COMPUTE_LOG_PARAMS(src, dst); + auto k = std::make_unique<kernels::CpuCopyKernel>(); + k->configure(src, dst); + _kernel = std::move(k); +} + +Status CpuCopy::validate(const ITensorInfo *src, const ITensorInfo *dst) +{ + return kernels::CpuCopyKernel::validate(src, dst); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuCopy.h b/src/cpu/operators/CpuCopy.h new file mode 100644 index 0000000000..9ffde4e781 --- /dev/null +++ b/src/cpu/operators/CpuCopy.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_COPY_H +#define ARM_COMPUTE_CPU_COPY_H + +#include "src/cpu/ICpuOperator.h" + +namespace arm_compute +{ +namespace cpu +{ +/** Basic function to run @ref kernels::CpuCopyKernel */ +class CpuCopy : public ICpuOperator +{ +public: + /** Configure operator for a given list of arguments + * + * @param[in] src Source tensor info. Data type supported: All + * @param[out] dst Destination info. Data type supported: Same as @p src + */ + void configure(const ITensorInfo *src, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuCopy::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst); +}; +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_COPY_H */ diff --git a/src/cpu/operators/CpuDepthwiseConv2d.cpp b/src/cpu/operators/CpuDepthwiseConv2d.cpp new file mode 100644 index 0000000000..54075f2afa --- /dev/null +++ b/src/cpu/operators/CpuDepthwiseConv2d.cpp @@ -0,0 +1,568 @@ +/* + * Copyright (c) 2021-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/cpu/operators/CpuDepthwiseConv2d.h" + +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/utils/misc/InfoHelpers.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace +{ +Status validate_arguments_optimized(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const ConvolutionInfo &info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); + if (!is_data_type_quantized_per_channel(weights->data_type())) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); + } + ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN); + ARM_COMPUTE_RETURN_ERROR_ON(info.dilation.x() < 1 || info.dilation.y() < 1); + const size_t idx_w = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH); + const size_t idx_h = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::HEIGHT); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (info.dilation.x() - 1) > + src->dimension(idx_w) + info.pad_stride_info.pad_left() + + info.pad_stride_info.pad_right()); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (info.dilation.y() - 1) > + src->dimension(idx_h) + info.pad_stride_info.pad_top() + + info.pad_stride_info.pad_bottom()); + + if (biases != nullptr) + { + const unsigned int channel_idx = + get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(channel_idx)); + } + + ARM_COMPUTE_RETURN_ON_ERROR(CpuDepthwiseConv2dAssemblyDispatch::validate(src, weights, biases, dst, info)); + + // Validate Activation Layer + if (info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info)) + { + ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, info.act_info)); + } + return Status{}; +} +} // namespace + +void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::configure(ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ConvolutionInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); + // Perform validation step + ARM_COMPUTE_ERROR_THROW_ON( + CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, (biases == nullptr) ? 
nullptr : biases, dst, info)); + + _is_quantized = is_data_type_quantized_asymmetric(src->data_type()); + _has_bias = biases != nullptr; + _is_nchw = src->data_layout() == DataLayout::NCHW; + _permute = _is_nchw; + _is_prepared = false; + _are_weights_const = weights->are_values_constant(); + + // Configure pipeline + _is_activationlayer_enabled = + info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info); + + _dwc_optimized_func = std::make_unique<CpuDepthwiseConv2dAssemblyDispatch>(); + if (_is_nchw) + { + _permute_input = std::make_unique<cpu::CpuPermute>(); + _permute_weights = std::make_unique<cpu::CpuPermute>(); + _permute_output = std::make_unique<cpu::CpuPermute>(); + + auto input_perm = std::make_unique<TensorInfo>(); + auto weights_perm = std::make_unique<TensorInfo>(); + auto output_perm = std::make_unique<TensorInfo>(); + + // Configure the function to transform the input tensor from NCHW -> NHWC + _permute_input->configure(src, input_perm.get(), PermutationVector(2U, 0U, 1U)); + input_perm->set_data_layout(DataLayout::NHWC); + + // Configure the function to transform the weights tensor from IHW -> HWI + _permute_weights->configure(weights, weights_perm.get(), PermutationVector(2U, 0U, 1U)); + weights_perm->set_data_layout(DataLayout::NHWC); + + output_perm->set_data_layout(DataLayout::NHWC); + output_perm->set_quantization_info(dst->quantization_info()); + + // Configure optimized depthwise + _dwc_optimized_func->configure(input_perm.get(), weights_perm.get(), biases, output_perm.get(), info); + + // Configure the function to transform the convoluted output to ACL's native ordering format NCHW + output_perm->set_data_layout(DataLayout::NHWC); + _permute_output->configure(output_perm.get(), dst, PermutationVector(1U, 2U, 0U)); + } + else + { + _dwc_optimized_func->configure(src, weights, biases, dst, info); + } + + // Configure activation + if (_is_activationlayer_enabled) + { + _activationlayer_function = std::make_unique<cpu::CpuActivation>(); + _activationlayer_function->configure(dst, nullptr, info.act_info); + } +} + +Status CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const ConvolutionInfo &info) +{ + return validate_arguments_optimized(src, weights, biases, dst, info); +} + +void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::run(ITensorPack &tensors) +{ + ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); + prepare(tensors); + + auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2); + auto dst = tensors.get_tensor(TensorType::ACL_DST_0); + auto workspace = tensors.get_tensor(TensorType::ACL_INT_3); + auto packed_weights = tensors.get_tensor(TensorType::ACL_INT_4); + + // Permute input + if (_permute) + { + ITensorPack pack; + auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0); + auto src_perm = tensors.get_tensor(TensorType::ACL_INT_0); + pack.add_tensor(TensorType::ACL_SRC, src); + pack.add_tensor(TensorType::ACL_DST, src_perm); + _permute_input->run(pack); + } + + // Run assembly function + if (_is_nchw) + { + auto src_perm = tensors.get_tensor(TensorType::ACL_INT_0); + auto weights_perm = tensors.get_tensor(TensorType::ACL_INT_1); + auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2); + + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC_0, src_perm); + pack.add_tensor(TensorType::ACL_SRC_1, weights_perm); + 
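+        // Note: only src/weights/dst use the permuted copies here; the bias, scratch workspace and packed weights are layout-independent and are forwarded unchanged.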
pack.add_tensor(TensorType::ACL_SRC_2, bias); + pack.add_tensor(TensorType::ACL_INT_0, workspace); + pack.add_tensor(TensorType::ACL_INT_1, packed_weights); + pack.add_tensor(TensorType::ACL_DST, dst_perm); + _dwc_optimized_func->run(pack); + } + else + { + auto src = tensors.get_tensor(TensorType::ACL_SRC_0); + auto weights = tensors.get_tensor(TensorType::ACL_SRC_1); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC_0, src); + pack.add_tensor(TensorType::ACL_SRC_1, weights); + pack.add_tensor(TensorType::ACL_SRC_2, bias); + pack.add_tensor(TensorType::ACL_INT_0, workspace); + pack.add_tensor(TensorType::ACL_INT_1, packed_weights); + pack.add_tensor(TensorType::ACL_DST, dst); + _dwc_optimized_func->run(pack); + } + + // Permute output + if (_is_nchw) + { + ITensorPack pack; + auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2); + pack.add_tensor(TensorType::ACL_SRC, dst_perm); + pack.add_tensor(TensorType::ACL_DST, dst); + _permute_output->run(pack); + } + + // Run activation + if (_is_activationlayer_enabled) + { + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, dst); + pack.add_tensor(TensorType::ACL_DST, dst); + _activationlayer_function->run(pack); + } +} + +void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::prepare(ITensorPack &tensors) +{ + // if weights are not constant then we need to repack so that weights + // can be updated in-place + if (!_are_weights_const) + { + auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); + auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2); + auto packed_weights = tensors.get_tensor(TensorType::ACL_INT_4); + + ITensorPack pack_opt; + pack_opt.add_tensor(TensorType::ACL_SRC_1, weights); + pack_opt.add_tensor(TensorType::ACL_SRC_2, bias); + pack_opt.add_tensor(TensorType::ACL_INT_1, packed_weights); + + // Prepare optimized function + _dwc_optimized_func->prepare(pack_opt); + + return; + } + + if (!_is_prepared) + { + auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); + auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2); + auto packed_weights = tensors.get_tensor(TensorType::ACL_INT_4); + + // Permute weights + if (_permute) + { + auto permuted_weights = tensors.get_tensor(TensorType::ACL_INT_1); + + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, weights); + pack.add_tensor(TensorType::ACL_DST, permuted_weights); + _permute_weights->run(pack); + + weights->mark_as_unused(); + + ITensorPack pack_opt; + pack_opt.add_const_tensor(TensorType::ACL_SRC_1, permuted_weights); + pack_opt.add_tensor(TensorType::ACL_SRC_2, bias); + pack_opt.add_tensor(TensorType::ACL_INT_1, packed_weights); + + // Prepare optimized function + _dwc_optimized_func->prepare(pack_opt); + } + else + { + ITensorPack pack_opt; + pack_opt.add_tensor(TensorType::ACL_SRC_1, weights); + pack_opt.add_tensor(TensorType::ACL_SRC_2, bias); + pack_opt.add_tensor(TensorType::ACL_INT_1, packed_weights); + + // Prepare optimized function + _dwc_optimized_func->prepare(pack_opt); + } + + _is_prepared = true; + } +} + +void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::configure(ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ConvolutionInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); + ARM_COMPUTE_ERROR_THROW_ON( + CpuDepthwiseConv2d::validate(src, weights, (biases == nullptr) ? 
nullptr : biases, dst, info)); + + _is_nchw = src->data_layout() == DataLayout::NCHW; + _is_prepared = !_is_nchw; + + ITensorInfo *input_to_use = src; + const ITensorInfo *weights_to_use = weights; + ITensorInfo *output_to_use = dst; + + auto input_perm = std::make_unique<TensorInfo>(); + auto weights_perm = std::make_unique<TensorInfo>(); + auto output_perm = std::make_unique<TensorInfo>( + dst->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape())); + + if (_is_nchw) + { + _permute_input = std::make_unique<cpu::CpuPermute>(); + _permute_weights = std::make_unique<cpu::CpuPermute>(); + + _permute_input->configure(src, input_perm.get(), PermutationVector(2U, 0U, 1U)); + input_perm->set_data_layout(DataLayout::NHWC); + input_to_use = input_perm.get(); + + _permute_weights->configure(weights, weights_perm.get(), PermutationVector(2U, 0U, 1U)); + weights_perm->set_data_layout(DataLayout::NHWC); + weights_to_use = weights_perm.get(); + + output_to_use = output_perm.get(); + } + + _depthwise_conv_kernel = std::make_unique<cpu::kernels::CpuDepthwiseConv2dNativeKernel>(); + _depthwise_conv_kernel->configure(input_to_use, weights_to_use, biases, output_to_use, info); + + if (_is_nchw) + { + _permute_output = std::make_unique<cpu::CpuPermute>(); + _permute_output->configure(output_perm.get(), dst, PermutationVector(1U, 2U, 0U)); + output_perm->set_data_layout(DataLayout::NHWC); + } + + //Configure Activation Layer + _is_activationlayer_enabled = info.act_info.enabled(); + if (_is_activationlayer_enabled) + { + _activationlayer_function = std::make_unique<cpu::CpuActivation>(); + _activationlayer_function->configure(dst, nullptr, info.act_info); + } +} + +Status CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const ConvolutionInfo &info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); + if (src->data_layout() == DataLayout::NCHW) + { + TensorShape permuted_input_shape = src->tensor_shape(); + TensorShape permuted_weights_shape = weights->tensor_shape(); + TensorShape permuted_output_shape = + misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info); + permute(permuted_input_shape, PermutationVector(2U, 0U, 1U)); + permute(permuted_weights_shape, PermutationVector(2U, 0U, 1U)); + permute(permuted_output_shape, PermutationVector(2U, 0U, 1U)); + + const TensorInfo permuted_input = TensorInfo(src->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(permuted_input_shape) + .set_data_layout(DataLayout::NHWC)); + const TensorInfo permuted_weights = TensorInfo(weights->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(permuted_weights_shape) + .set_data_layout(DataLayout::NHWC)); + const TensorInfo permuted_output = TensorInfo(dst->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(permuted_output_shape) + .set_data_layout(DataLayout::NCHW)); + + ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(src, &permuted_input, PermutationVector(2U, 0U, 1U))); + ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(weights, &permuted_weights, PermutationVector(2U, 0U, 1U))); + ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(&permuted_output, dst, PermutationVector(1U, 2U, 0U))); + + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuDepthwiseConv2dNativeKernel::validate( + &permuted_input, &permuted_weights, biases, &permuted_output, info)); + } + else + { + 
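+        // NHWC path: the native kernel can be validated directly on the unpermuted tensor infos.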
ARM_COMPUTE_RETURN_ON_ERROR( + cpu::kernels::CpuDepthwiseConv2dNativeKernel::validate(src, weights, biases, dst, info)); + } + + // Validate Activation Layer + if (info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info)) + { + ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, info.act_info)); + } + + return Status{}; +} + +void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::run(ITensorPack &tensors) +{ + auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0); + auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); + auto biases = tensors.get_const_tensor(TensorType::ACL_SRC_2); + auto dst = tensors.get_tensor(TensorType::ACL_DST_0); + + if (_is_nchw) + { + prepare(tensors); + auto src_perm = tensors.get_tensor(TensorType::ACL_INT_0); + auto weights_perm = tensors.get_tensor(TensorType::ACL_INT_1); + auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2); + + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, src); + pack.add_tensor(TensorType::ACL_DST, src_perm); + _permute_input->run(pack); + + ITensorPack pack_depth; + pack_depth.add_const_tensor(TensorType::ACL_SRC_0, src_perm); + pack_depth.add_const_tensor(TensorType::ACL_SRC_1, weights_perm); + pack_depth.add_tensor(TensorType::ACL_SRC_2, biases); + pack_depth.add_tensor(TensorType::ACL_DST, dst_perm); + NEScheduler::get().schedule_op(_depthwise_conv_kernel.get(), Window::DimY, _depthwise_conv_kernel->window(), + pack_depth); + } + else + { + ITensorPack pack_depth; + pack_depth.add_tensor(TensorType::ACL_SRC_0, src); + pack_depth.add_tensor(TensorType::ACL_SRC_1, weights); + pack_depth.add_tensor(TensorType::ACL_SRC_2, biases); + pack_depth.add_tensor(TensorType::ACL_DST, dst); + NEScheduler::get().schedule_op(_depthwise_conv_kernel.get(), Window::DimY, _depthwise_conv_kernel->window(), + pack_depth); + } + + if (_is_nchw) + { + ITensorPack pack; + auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2); + pack.add_tensor(TensorType::ACL_SRC, dst_perm); + pack.add_tensor(TensorType::ACL_DST, dst); + _permute_output->run(pack); + } + + if (_is_activationlayer_enabled) + { + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, dst); + pack.add_tensor(TensorType::ACL_DST, dst); + _activationlayer_function->run(pack); + } +} + +void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::prepare(ITensorPack &tensors) +{ + if (!_is_prepared) + { + auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); + auto weights_perm = tensors.get_tensor(TensorType::ACL_INT_1); + + ARM_COMPUTE_ERROR_ON(!weights->is_used()); + + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, weights); + pack.add_tensor(TensorType::ACL_DST, weights_perm); + + _permute_weights->run(pack); + weights->mark_as_unused(); + _is_prepared = true; + } +} + +void CpuDepthwiseConv2d::configure(ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ConvolutionInfo &info) +{ + ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, info); + + _depth_conv_func = + get_depthwiseconvolution_function(src, weights, (biases != nullptr) ? 
biases : nullptr, dst, info); + switch (_depth_conv_func) + { + case DepthwiseConvolutionFunction::OPTIMIZED: + _func_optimized.configure(src, weights, biases, dst, info); + break; + case DepthwiseConvolutionFunction::GENERIC: + _func_generic.configure(src, weights, biases, dst, info); + break; + default: + ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction"); + } +} + +Status CpuDepthwiseConv2d::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const ConvolutionInfo &info) +{ + DepthwiseConvolutionFunction depth_conv_func = get_depthwiseconvolution_function(src, weights, biases, dst, info); + switch (depth_conv_func) + { + case DepthwiseConvolutionFunction::OPTIMIZED: + return CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, biases, dst, info); + break; + case DepthwiseConvolutionFunction::GENERIC: + return CpuDepthwiseConv2dGeneric::validate(src, weights, biases, dst, info); + break; + default: + ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction"); + } +} + +DepthwiseConvolutionFunction CpuDepthwiseConv2d::get_depthwiseconvolution_function(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const ConvolutionInfo &info) +{ + if (bool(CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, biases, dst, info))) + { + return DepthwiseConvolutionFunction::OPTIMIZED; + } + else + { + return DepthwiseConvolutionFunction::GENERIC; + } +} + +void CpuDepthwiseConv2d::run(ITensorPack &tensors) +{ + switch (_depth_conv_func) + { + case DepthwiseConvolutionFunction::OPTIMIZED: + _func_optimized.run(tensors); + break; + case DepthwiseConvolutionFunction::GENERIC: + _func_generic.run(tensors); + break; + default: + ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured"); + } +} + +void CpuDepthwiseConv2d::prepare(ITensorPack &tensors) +{ + switch (_depth_conv_func) + { + case DepthwiseConvolutionFunction::OPTIMIZED: + _func_optimized.prepare(tensors); + break; + case DepthwiseConvolutionFunction::GENERIC: + _func_generic.prepare(tensors); + break; + default: + ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured"); + } +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuDepthwiseConv2d.h b/src/cpu/operators/CpuDepthwiseConv2d.h new file mode 100644 index 0000000000..7eaa0df857 --- /dev/null +++ b/src/cpu/operators/CpuDepthwiseConv2d.h @@ -0,0 +1,238 @@ +/* + * Copyright (c) 2021-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_H +#define ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_H + +#include "arm_compute/core/experimental/Types.h" +#include "arm_compute/core/ITensorInfo.h" + +#include "src/cpu/ICpuKernel.h" +#include "src/cpu/ICpuOperator.h" +#include "src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h" +#include "src/cpu/operators/CpuActivation.h" +#include "src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h" +#include "src/cpu/operators/CpuPermute.h" + +#include <memory> + +namespace arm_compute +{ +namespace cpu +{ +/** Function to execute a depthwise convolution. + */ +class CpuDepthwiseConv2d : public ICpuOperator +{ +public: + /** Default constructor */ + CpuDepthwiseConv2d() = default; + /** Initialize the function's source, destination, weights and convolution information. + * + * @param[in, out] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32 + * @param[out] dst Destination tensor info. Data type supported: same as @p src. + * @param[in] weights Weights tensor info. These are 3D tensor infos with shape [kernel_x, kernel_y, IFM]. + * Data type supported: Same as @p src or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p src is QASYMM8/QASYMM8_SIGNED. + * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed. + * Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED. + * @param[in] info Depthwise convolution meta-data. + */ + void configure(ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ConvolutionInfo &info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuDepthwiseConv2d::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const ConvolutionInfo &info); + /** Static function to choose the best depthwise convolution function for @ref CpuDepthwiseConv2d + * + * @param[in] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32 + * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. + * Data type supported: Same as @p src or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p src is QASYMM8/QASYMM8_SIGNED. + * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed. + * Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED. + * @param[in] dst Destination tensor. Data type supported: same as @p src. + * @param[in] info Depthwise convolution meta-data. + * + * @return a Depthwise Convolution Function + */ + static DepthwiseConvolutionFunction get_depthwiseconvolution_function(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const ConvolutionInfo &info); + + // Inherited methods overriden: + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &tensors) override; + +private: + /** Basic function to execute optimized depthwise convolution routines. 
This function calls the following kernels: + * + * @note At the moment 3x3 and 5x5 convolution of stride 1, 2 are supported + * + * -# @ref NEFillBorderKernel (if pad_x or pad_y > 0) and no assembly kernel implementation is present + * -# @ref CpuDepthwiseConv2d3x3Kernel if 3x3 and no assembly kernel implementation is present + * -# @ref CpuDepthwiseConv2dAssemblyDispatch if assembly kernel implementation is present + * -# @ref CpuActivation if fused activation is required + * + */ + class CpuDepthwiseConv2dOptimizedInternal : public ICpuOperator + { + public: + /** Default constructor */ + CpuDepthwiseConv2dOptimizedInternal() = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CpuDepthwiseConv2dOptimizedInternal(const CpuDepthwiseConv2dOptimizedInternal &) = delete; + /** Default move constructor */ + CpuDepthwiseConv2dOptimizedInternal(CpuDepthwiseConv2dOptimizedInternal &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CpuDepthwiseConv2dOptimizedInternal &operator=(const CpuDepthwiseConv2dOptimizedInternal &) = delete; + /** Default move assignment operator */ + CpuDepthwiseConv2dOptimizedInternal &operator=(CpuDepthwiseConv2dOptimizedInternal &&) = default; + /** Default destructor */ + ~CpuDepthwiseConv2dOptimizedInternal() = default; + /** Initialize the function's source, destination, kernels and border_size. + * + * @param[in, out] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. (Written to only for border filling). + * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. Data type supported: Same as @p src. + * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed. + * Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED. + * @param[out] dst Destination tensor info. Data type supported: same as @p src. + * @param[in] info Depthwise convolution meta-data. + */ + void configure(ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ConvolutionInfo &info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuDepthwiseConv2dOptimizedInternal::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const ConvolutionInfo &info); + + // Inherited methods overriden: + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &tensors) override; + + private: + std::unique_ptr<CpuDepthwiseConv2dAssemblyDispatch> _dwc_optimized_func{nullptr}; + std::unique_ptr<CpuPermute> _permute_input{nullptr}; + std::unique_ptr<CpuPermute> _permute_weights{nullptr}; + std::unique_ptr<CpuPermute> _permute_output{nullptr}; + std::unique_ptr<CpuActivation> _activationlayer_function{nullptr}; + bool _has_bias{false}; + bool _is_quantized{false}; + bool _is_nchw{true}; + bool _permute{false}; + bool _is_activationlayer_enabled{false}; + bool _is_prepared{false}; + bool _are_weights_const{true}; + }; + + /** Basic function to execute a generic depthwise convolution. 
This function calls the following kernel: + * + * -# @ref CpuDepthwiseConv2dNativeKernel + * + */ + class CpuDepthwiseConv2dGeneric : public ICpuOperator + { + public: + /** Default constructor */ + CpuDepthwiseConv2dGeneric() = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CpuDepthwiseConv2dGeneric(const CpuDepthwiseConv2dGeneric &) = delete; + /** Default move constructor */ + CpuDepthwiseConv2dGeneric(CpuDepthwiseConv2dGeneric &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CpuDepthwiseConv2dGeneric &operator=(const CpuDepthwiseConv2dGeneric &) = delete; + /** Default move assignment operator */ + CpuDepthwiseConv2dGeneric &operator=(CpuDepthwiseConv2dGeneric &&) = default; + /** Default destructor */ + ~CpuDepthwiseConv2dGeneric() = default; + /** Initialize the function's source, destination, weights and convolution information. + * + * @param[in, out] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. (Written to only for border filling). + * @param[out] dst Destination tensor info. Data type supported: same as @p src. + * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. + * Data type supported: Same as @p src or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p src is QASYMM8/QASYMM8_SIGNED. + * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed. + * Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED. + * @param[in] info Depthwise convolution meta-data. + */ + void configure(ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ConvolutionInfo &info); + + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuDepthwiseConv2dGeneric::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const ConvolutionInfo &info); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &tensors) override; + + private: + std::unique_ptr<kernels::CpuDepthwiseConv2dNativeKernel> _depthwise_conv_kernel{nullptr}; + std::unique_ptr<CpuPermute> _permute_input{nullptr}; + std::unique_ptr<CpuPermute> _permute_weights{nullptr}; + std::unique_ptr<CpuPermute> _permute_output{nullptr}; + std::unique_ptr<CpuActivation> _activationlayer_function{nullptr}; + bool _is_nchw{true}; + bool _is_prepared{false}; + bool _is_activationlayer_enabled{false}; + }; + + DepthwiseConvolutionFunction _depth_conv_func{DepthwiseConvolutionFunction::GENERIC}; + CpuDepthwiseConv2dOptimizedInternal _func_optimized{}; + CpuDepthwiseConv2dGeneric _func_generic{}; +}; +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_H */ diff --git a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp new file mode 100644 index 0000000000..7fe9011da1 --- /dev/null +++ b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2019-2024 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h" + +#include "arm_compute/core/ITensorInfo.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/core/CPP/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/utils/AssemblyUtils.h" +#include "src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +struct CpuDepthwiseConv2dAssemblyDispatch::LocalImpl +{ + std::unique_ptr<kernels::CpuDepthwiseConv2dAssemblyWrapperKernel> asm_kernel{nullptr}; + bool is_prepared{false}; + bool are_weights_const{true}; + experimental::MemoryRequirements mem_req{}; +}; + +#ifndef DOXYGEN_SKIP_THIS +CpuDepthwiseConv2dAssemblyDispatch::CpuDepthwiseConv2dAssemblyDispatch() : _pImpl(std::make_unique<LocalImpl>()) +{ +} +#endif /* DOXYGEN_SKIP_THIS */ + +CpuDepthwiseConv2dAssemblyDispatch::~CpuDepthwiseConv2dAssemblyDispatch() = default; + +void CpuDepthwiseConv2dAssemblyDispatch::configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *bias, + ITensorInfo *dst, + const ConvolutionInfo &info) +{ + ARM_COMPUTE_LOG_PARAMS(src, weights, bias, dst, info); + const CPUInfo &ci = NEScheduler::get().cpu_info(); + const unsigned int num_threads = NEScheduler::get().num_threads(); + _pImpl->is_prepared = false; + _pImpl->are_weights_const = weights->are_values_constant(); + + // If we don't support a combination of data types, silently return: it is the caller's responsibility to check if configure() was successful via is_configured() + if (!CpuDepthwiseConv2dAssemblyDispatch::validate(src, weights, bias, dst, info)) + { + return; + } + + auto dwc_wrapper = std::make_unique<kernels::CpuDepthwiseConv2dAssemblyWrapperKernel>(); + ARM_COMPUTE_ERROR_ON(dwc_wrapper == nullptr); + dwc_wrapper->configure(src, weights, bias, dst, info, ci); + + // Compute memory requirements for assembly kernels + constexpr size_t alignment = 4096; + _pImpl->mem_req.push_back({TensorType::ACL_INT_0, dwc_wrapper->get_working_size(num_threads), alignment}); + _pImpl->mem_req.push_back({TensorType::ACL_INT_1, dwc_wrapper->get_storage_size(), alignment}); + _pImpl->asm_kernel = std::move(dwc_wrapper); +} + +Status CpuDepthwiseConv2dAssemblyDispatch::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *bias, + const ITensorInfo 
*dst, + const ConvolutionInfo &info) +{ + return kernels::CpuDepthwiseConv2dAssemblyWrapperKernel::validate(src, weights, bias, dst, info); +} + +experimental::MemoryRequirements CpuDepthwiseConv2dAssemblyDispatch::workspace() const +{ + return _pImpl->mem_req; +} + +bool CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(const ActivationLayerInfo &activation) +{ + arm_gemm::Activation act = assembly_utils::map_to_arm_gemm_activation(activation); + return act.type != arm_gemm::Activation::Type::None; +} + +void CpuDepthwiseConv2dAssemblyDispatch::run(ITensorPack &tensors) +{ + ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); + + prepare(tensors); + + // Split over rows (z) if there's more than 1, otherwise batches (w). This logic + // corresponds to the threading strategy in DepthFirstDriver::execute_internal + auto split_dimension = _pImpl->asm_kernel->window().num_iterations(Window::DimZ) != 1 ? Window::DimZ : Window::DimW; + + NEScheduler::get().schedule_op(_pImpl->asm_kernel.get(), split_dimension, _pImpl->asm_kernel->window(), tensors); +} + +void CpuDepthwiseConv2dAssemblyDispatch::prepare(ITensorPack &tensors) +{ + const ITensor *weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); + + if ((!_pImpl->are_weights_const && weights != nullptr) || !_pImpl->is_prepared) + { + // Pack weights and bias + const ITensor *bias = tensors.get_const_tensor(TensorType::ACL_SRC_2); + ITensor *storage = tensors.get_tensor(TensorType::ACL_INT_1); + + const auto weights_ptr = weights->buffer() + weights->info()->offset_first_element_in_bytes(); + const auto bias_ptr = (bias) ? bias->buffer() + bias->info()->offset_first_element_in_bytes() : nullptr; + auto parameters_ptr = storage->buffer() + storage->info()->offset_first_element_in_bytes(); + + const auto weights_shape = weights->info()->tensor_shape(); + const auto weights_padding = weights->info()->padding(); + + const size_t ld_weights_col = weights_shape[0] + weights_padding.left + weights_padding.right; + const size_t ld_weights_row = + ld_weights_col * (weights_shape[1] + weights_padding.top + weights_padding.bottom); + _pImpl->asm_kernel->pack_parameters(parameters_ptr, bias_ptr, weights_ptr, ld_weights_col, ld_weights_row); + + weights->mark_as_unused(); + if (bias != nullptr) + { + bias->mark_as_unused(); + } + _pImpl->is_prepared = true; + } +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h new file mode 100644 index 0000000000..f1816625d2 --- /dev/null +++ b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2019-2021, 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_ASSEMBLY_DISPATCH_H +#define ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_ASSEMBLY_DISPATCH_H + +#include "arm_compute/function_info/ActivationLayerInfo.h" + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuOperator.h" + +namespace arm_compute +{ +struct ConvolutionInfo; + +namespace cpu +{ +/** Depthwise convolution assembly kernel glue */ +class CpuDepthwiseConv2dAssemblyDispatch : public ICpuOperator +{ +public: + CpuDepthwiseConv2dAssemblyDispatch(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDepthwiseConv2dAssemblyDispatch); + ~CpuDepthwiseConv2dAssemblyDispatch(); + /** Initialize the function's source, destination, kernels and border_size. + * + * @note Supports only NHWC format + * + * @param[in] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[in] weights Weights tensor info. These are 3D tensors with shape [W, H, IFM]. + * Data type supported: same as @p src or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p src is QASYMM8/QASYMM8_SIGNED. + * @param[in] bias (Optional) Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed. + * Data type supported: same as @p src or S32 if @p src is quantized. + * @param[out] dst Destination tensor info. Data type supported: same as @p src. + * @param[in] info Depthwise convolution meta-data. + */ + void configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *bias, + ITensorInfo *dst, + const ConvolutionInfo &info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuDepthwiseConv2dAssemblyDispatch::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *bias, + const ITensorInfo *dst, + const ConvolutionInfo &info); + /** Checks if activation is supported by the assembly kernels + * + * @param[in] activation Activation to check + * + * @return True if activation is supported else false + */ + static bool is_activation_supported(const ActivationLayerInfo &activation); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &tensors) override; + experimental::MemoryRequirements workspace() const override; + +private: + struct LocalImpl; + std::unique_ptr<LocalImpl> _pImpl; +}; +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_ASSEMBLY_DISPATCH_H */ diff --git a/src/cpu/operators/CpuDequantize.cpp b/src/cpu/operators/CpuDequantize.cpp new file mode 100644 index 0000000000..c05a23f3a7 --- /dev/null +++ b/src/cpu/operators/CpuDequantize.cpp @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/operators/CpuDequantize.h" + +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/cpu/kernels/CpuDequantizeKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +void CpuDequantize::configure(const ITensorInfo *src, ITensorInfo *dst) +{ + ARM_COMPUTE_LOG_PARAMS(src, dst); + auto k = std::make_unique<kernels::CpuDequantizeKernel>(); + k->configure(src, dst); + _kernel = std::move(k); +} + +Status CpuDequantize::validate(const ITensorInfo *src, const ITensorInfo *dst) +{ + return kernels::CpuDequantizeKernel::validate(src, dst); +} + +void CpuDequantize::run(ITensorPack &tensors) +{ + ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); + prepare(tensors); + NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuDequantize.h b/src/cpu/operators/CpuDequantize.h new file mode 100644 index 0000000000..dbfc0c612a --- /dev/null +++ b/src/cpu/operators/CpuDequantize.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ARM_COMPUTE_CPU_DEQUANTIZE_H +#define ARM_COMPUTE_CPU_DEQUANTIZE_H + +#include "src/cpu/ICpuOperator.h" + +namespace arm_compute +{ +namespace cpu +{ +/** Basic function to run @ref kernels::CpuDequantizeKernel that dequantizes an input tensor */ +class CpuDequantize : public ICpuOperator +{ +public: + /** Configure the kernel. + * + * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16. + * @param[out] dst Destination tensor info with the same dimensions of input. Data type supported: F16/F32. + */ + void configure(const ITensorInfo *src, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuDequantize::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; +}; +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_DEQUANTIZE_H */ diff --git a/src/cpu/operators/CpuDirectConv2d.cpp b/src/cpu/operators/CpuDirectConv2d.cpp new file mode 100644 index 0000000000..135a3bb2b9 --- /dev/null +++ b/src/cpu/operators/CpuDirectConv2d.cpp @@ -0,0 +1,170 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/cpu/operators/CpuDirectConv2d.h" + +#include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" + +namespace arm_compute +{ +namespace cpu +{ +CpuDirectConv2d::~CpuDirectConv2d() = default; + +CpuDirectConv2d::CpuDirectConv2d(std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(std::move(memory_manager)), + _output_stage_kernel(), + _conv_kernel(), + _input_border_handler(), + _activationlayer_function(), + _accumulator(), + _has_bias(false), + _is_activationlayer_enabled(false), + _dim_split(Window::DimZ), + _is_padding_required() +{ +} + +void CpuDirectConv2d::configure(ITensorInfo *src, + ITensorInfo *weights, + const ITensorInfo *bias, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN); + ARM_COMPUTE_LOG_PARAMS(src, weights, bias, dst, conv_info, act_info); + + _output_stage_kernel = std::make_unique<kernels::CpuDirectConv2dOutputStageKernel>(); + _conv_kernel = std::make_unique<kernels::CpuDirectConv2dKernel>(); + _input_border_handler = std::make_unique<NEFillBorderKernel>(); + + // Free accumulator + if (_accumulator.buffer() != nullptr) + { + _accumulator.allocator()->free(); + } + + _dim_split = src->data_layout() == DataLayout::NCHW ? Window::DimZ : Window::DimY; + + // Check if bias should be added in the convolution result + _has_bias = (bias != nullptr); + + _conv_kernel->configure(src, weights, dst, conv_info); + if (_has_bias) + { + _output_stage_kernel->configure(dst, bias); + } + _is_padding_required = !_conv_kernel->border_size().empty(); + + if (_is_padding_required) + { + // Add zero padding XY + _input_border_handler->configure(src, _conv_kernel->border_size(), BorderMode::CONSTANT, + PixelValue(static_cast<float>(0.f))); + } + + //Configure Activation Layer + _is_activationlayer_enabled = act_info.enabled(); + if (_is_activationlayer_enabled) + { + _activationlayer_function = std::make_unique<CpuActivation>(); + _activationlayer_function->configure(dst, dst, act_info); + } +} + +Status CpuDirectConv2d::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *bias, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); + + // output might not be initialized since it can be an intermediate tensor of another layer + DataType data_type = src->data_type(); + TensorInfo accumulator(dst->clone()->set_is_resizable(true).reset_padding().set_data_type(data_type)); + + // Validate Convolution kernel + ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuDirectConv2dKernel::validate(src, weights, &accumulator, conv_info)); + + if (bias != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, bias); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->dimension(0) != weights->dimension(3), + "Biases size and number of input feature maps should match"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->num_dimensions() > 1, "Biases should be one dimensional"); + } + + // Validate bias kernel + ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuDirectConv2dOutputStageKernel::validate(&accumulator, bias, dst)); + + if (act_info.enabled()) + { + ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, act_info)); + } + + return Status{}; +} + +void CpuDirectConv2d::run(ITensorPack 
&tensors) +{ + MemoryGroupResourceScope scope_mg(_memory_group); + + auto src = tensors.get_tensor(TensorType::ACL_SRC_0); + auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + if (_is_padding_required) + { + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC_DST, src); + NEScheduler::get().schedule_op(_input_border_handler.get(), Window::DimZ, _input_border_handler->window(), + pack); + } + NEScheduler::get().schedule_op(_conv_kernel.get(), _dim_split, _conv_kernel->window(), tensors); + if (_has_bias) + { + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC_0, dst); + pack.add_tensor(TensorType::ACL_SRC_1, bias); + pack.add_tensor(TensorType::ACL_DST, dst); + NEScheduler::get().schedule_op(_output_stage_kernel.get(), Window::DimY, _output_stage_kernel->window(), pack); + } + + if (_is_activationlayer_enabled) + { + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, dst); + pack.add_tensor(TensorType::ACL_DST, dst); + _activationlayer_function->run(pack); + } +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuDirectConv2d.h b/src/cpu/operators/CpuDirectConv2d.h new file mode 100644 index 0000000000..73c85f2dcd --- /dev/null +++ b/src/cpu/operators/CpuDirectConv2d.h @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_DIRECTCONV2D_H +#define ARM_COMPUTE_CPU_DIRECTCONV2D_H + +#include "arm_compute/core/experimental/Types.h" +#include "arm_compute/core/ITensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IMemoryManager.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" +#include "arm_compute/runtime/Tensor.h" + +#include "src/core/NEON/kernels/NEFillBorderKernel.h" +#include "src/cpu/ICpuKernel.h" +#include "src/cpu/ICpuOperator.h" +#include "src/cpu/kernels/CpuDirectConv2dKernel.h" +#include "src/cpu/kernels/CpuDirectConv2dOutputStageKernel.h" +#include "src/cpu/operators/CpuActivation.h" + +#include <memory> + +namespace arm_compute +{ +namespace cpu +{ +/** Function to run the direct convolution. 
+ * + * This function calls the following kernels: + * + * -# @ref NEFillBorderKernel for the input + * -# @ref kernels::CpuDirectConv2dOutputStageKernel + * -# @ref kernels::CpuDirectConv2dKernel + */ +class CpuDirectConv2d : public ICpuOperator +{ +public: + CpuDirectConv2d(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + ~CpuDirectConv2d(); + /** Set the input, weights, biases and output tensors. + * + * @note: DirectConvolution only works in the following configurations: + * 1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F16/F32 + * 3x3 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F16/F32 + * 5x5 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F32 + * + * @param[in, out] src Input tensor info. Data types supported: F16/F32. + * @param[in] weights Set of kernels to convolve the input volume. + * Supported sizes: 1x1, 3x3 and 5x5. + * The 3rd dimension must be the same as the input's volume 3rd dimension. + * Data type supported: Same as @p src. + * @param[in] bias Set of biases. Can be nullptr. Data type supported: Same as @p src. + * @param[out] dst Output tensor info. + * The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: Same as @p input. + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + */ + void configure(ITensorInfo *src, + ITensorInfo *weights, + const ITensorInfo *bias, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuDirectConv2d::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *bias, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; + +private: + MemoryGroup _memory_group; + std::unique_ptr<kernels::CpuDirectConv2dOutputStageKernel> _output_stage_kernel; + std::unique_ptr<kernels::CpuDirectConv2dKernel> _conv_kernel; + std::unique_ptr<NEFillBorderKernel> _input_border_handler; + std::unique_ptr<CpuActivation> _activationlayer_function; + Tensor _accumulator; + bool _has_bias{false}; + bool _is_activationlayer_enabled{false}; + unsigned int _dim_split{0}; + bool _is_padding_required{false}; +}; +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_DIRECTCONV2D_H */ diff --git a/src/cpu/operators/CpuDirectConv3d.cpp b/src/cpu/operators/CpuDirectConv3d.cpp new file mode 100644 index 0000000000..626f1c6775 --- /dev/null +++ b/src/cpu/operators/CpuDirectConv3d.cpp @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/operators/CpuDirectConv3d.h" + +#include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" + +namespace arm_compute +{ +namespace cpu +{ +CpuDirectConv3d::~CpuDirectConv3d() = default; + +CpuDirectConv3d::CpuDirectConv3d(std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(std::move(memory_manager)), + _conv_kernel(), + _activationlayer_function(), + _accumulator(), + _is_activationlayer_enabled(false), + _dim_split(Window::DimZ) +{ +} + +void CpuDirectConv3d::configure( + ITensorInfo *src0, ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo conv_info) +{ + ARM_COMPUTE_LOG_PARAMS(src0, src1, src2, dst, conv_info); + ARM_COMPUTE_ERROR_ON(src0->data_layout() != DataLayout::NDHWC); + + _conv_kernel = std::make_unique<kernels::CpuDirectConv3dKernel>(); + + // Free accumulator + if (_accumulator.buffer() != nullptr) + { + _accumulator.allocator()->free(); + } + + _dim_split = Window::DimY; + + _conv_kernel->configure(src0, src1, src2, dst, conv_info); + + //Configure Activation Layer + _is_activationlayer_enabled = conv_info.act_info.enabled(); + if (_is_activationlayer_enabled) + { + _activationlayer_function = std::make_unique<CpuActivation>(); + _activationlayer_function->configure(dst, dst, conv_info.act_info); + } +} + +Status CpuDirectConv3d::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const Conv3dInfo conv_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); + + // Validate Convolution kernel + ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuDirectConv3dKernel::validate(src0, src1, src2, dst, conv_info)); + + if (conv_info.act_info.enabled()) + { + ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, conv_info.act_info)); + } + + return Status{}; +} + +void CpuDirectConv3d::run(ITensorPack &tensors) +{ + MemoryGroupResourceScope scope_mg(_memory_group); + + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + NEScheduler::get().schedule_op(_conv_kernel.get(), _dim_split, _conv_kernel->window(), tensors); + + if (_is_activationlayer_enabled) + { + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, dst); + pack.add_tensor(TensorType::ACL_DST, dst); + 
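+        // The fused activation runs in-place: dst is both the source and the destination of the pack.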
_activationlayer_function->run(pack); + } +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuDirectConv3d.h b/src/cpu/operators/CpuDirectConv3d.h new file mode 100644 index 0000000000..3ad1e09a14 --- /dev/null +++ b/src/cpu/operators/CpuDirectConv3d.h @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_DIRECTCONV3D_H +#define ARM_COMPUTE_CPU_DIRECTCONV3D_H + +#include "arm_compute/core/experimental/Types.h" +#include "arm_compute/core/ITensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/FunctionDescriptors.h" +#include "arm_compute/runtime/IMemoryManager.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" +#include "arm_compute/runtime/Tensor.h" + +#include "src/core/NEON/kernels/NEFillBorderKernel.h" +#include "src/cpu/ICpuKernel.h" +#include "src/cpu/ICpuOperator.h" +#include "src/cpu/kernels/CpuDirectConv3dKernel.h" +#include "src/cpu/operators/CpuActivation.h" + +#include <memory> + +namespace arm_compute +{ +namespace cpu +{ +/** Function to run the direct convolution. + * + * This function calls the following kernels: + * + * -# @ref kernels::CpuDirectConv3dKernel + */ +class CpuDirectConv3d : public ICpuOperator +{ +public: + CpuDirectConv3d(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + ~CpuDirectConv3d(); + /** Set the input, weights, biases and output tensor info. + * + * Valid data layouts: + * - NDHWC + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:--------------|:------------------|:------|:--------------| + * |F16 |F16 |F16 |F16 | + * |F32 |F32 |F32 |F32 | + * |QASYMM8 |QASYMM8 |S32 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED | + * + * @param[in, out] src0 Input tensor info. + * @param[in] src1 Set of kernels to convolve the input volume. + * The 2nd dimension must be the same as the src0's volume 1st dimension. + * @param[in] src2 Set of biases. Can be nullptr. + * @param[out] dst Output tensor info. + * The 1st dimensions must be equal to the 1st dimension of the @p kernels tensor. + * @param[in] conv_info Contains padding, stride, acitvation information. 
+ */ + void configure( + ITensorInfo *src0, ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo conv_info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuDirectConv3d::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const Conv3dInfo conv_info); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; + +private: + MemoryGroup _memory_group; + std::unique_ptr<kernels::CpuDirectConv3dKernel> _conv_kernel; + std::unique_ptr<CpuActivation> _activationlayer_function; + Tensor _accumulator; + bool _is_activationlayer_enabled{false}; + unsigned int _dim_split{0}; +}; +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_DIRECTCONV3D_H */ diff --git a/src/cpu/operators/CpuElementwise.cpp b/src/cpu/operators/CpuElementwise.cpp new file mode 100644 index 0000000000..c2ae8773c6 --- /dev/null +++ b/src/cpu/operators/CpuElementwise.cpp @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/operators/CpuElementwise.h" + +#include "src/common/utils/Log.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/cpu/kernels/CpuElementwiseKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +void CpuElementwiseBase::run(ITensorPack &tensors) +{ + // If the kernel has been configured, use the window from the kernel. 
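+    // Otherwise the broadcast output shape and execution window are recomputed from the two inputs on every run (dynamic-shape case).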
+ if (_kernel->is_window_configured()) + { + ICpuOperator::run(tensors); + return; + } + + auto src0_info = tensors.get_const_tensor(TensorType::ACL_SRC_0)->info(); + auto src1_info = tensors.get_const_tensor(TensorType::ACL_SRC_1)->info(); + auto shape_and_window = compute_output_shape_and_window(src0_info->tensor_shape(), src1_info->tensor_shape()); + ICpuOperator::run(tensors, shape_and_window.second); +} + +template <ArithmeticOperation op> +void CpuElementwiseArithmetic<op>::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) +{ + ARM_COMPUTE_LOG_PARAMS(src0, src1, dst); + auto k = std::make_unique<kernels::CpuArithmeticKernel>(); + k->configure(op, src0, src1, dst); + _kernel = std::move(k); +} + +template <ArithmeticOperation op> +Status CpuElementwiseArithmetic<op>::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst) +{ + return kernels::CpuArithmeticKernel::validate(op, src0, src1, dst); +} + +template class CpuElementwiseArithmetic<ArithmeticOperation::MAX>; +template class CpuElementwiseArithmetic<ArithmeticOperation::MIN>; +template class CpuElementwiseArithmetic<ArithmeticOperation::SQUARED_DIFF>; +template class CpuElementwiseArithmetic<ArithmeticOperation::PRELU>; + +void CpuElementwiseDivision::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) +{ + ARM_COMPUTE_LOG_PARAMS(src0, src1, dst); + auto k = std::make_unique<kernels::CpuDivisionKernel>(); + k->configure(src0, src1, dst); + _kernel = std::move(k); +} + +Status CpuElementwiseDivision::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst) +{ + return kernels::CpuDivisionKernel::validate(src0, src1, dst); +} + +void CpuElementwisePower::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) +{ + ARM_COMPUTE_LOG_PARAMS(src0, src1, dst); + auto k = std::make_unique<kernels::CpuPowerKernel>(); + k->configure(src0, src1, dst); + _kernel = std::move(k); +} + +Status CpuElementwisePower::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst) +{ + return kernels::CpuPowerKernel::validate(src0, src1, dst); +} + +template <ComparisonOperation COP> +void CpuElementwiseComparisonStatic<COP>::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) +{ + ARM_COMPUTE_LOG_PARAMS(src0, src1, dst); + auto k = std::make_unique<kernels::CpuComparisonKernel>(); + k->configure(COP, src0, src1, dst); + _kernel = std::move(k); +} + +template <ComparisonOperation COP> +Status +CpuElementwiseComparisonStatic<COP>::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst) +{ + return kernels::CpuComparisonKernel::validate(COP, src0, src1, dst); +} + +void CpuElementwiseComparison::configure(const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst, + ComparisonOperation op) +{ + ARM_COMPUTE_LOG_PARAMS(src0, src1, dst); + auto k = std::make_unique<kernels::CpuComparisonKernel>(); + k->configure(op, src0, src1, dst); + _kernel = std::move(k); +} + +Status CpuElementwiseComparison::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + ComparisonOperation op) +{ + return kernels::CpuComparisonKernel::validate(op, src0, src1, dst); +} + +// Supported Specializations +template class CpuElementwiseComparisonStatic<ComparisonOperation::Equal>; +template class CpuElementwiseComparisonStatic<ComparisonOperation::NotEqual>; +template class CpuElementwiseComparisonStatic<ComparisonOperation::Greater>; 
+template class CpuElementwiseComparisonStatic<ComparisonOperation::GreaterEqual>; +template class CpuElementwiseComparisonStatic<ComparisonOperation::Less>; +template class CpuElementwiseComparisonStatic<ComparisonOperation::LessEqual>; +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuElementwise.h b/src/cpu/operators/CpuElementwise.h new file mode 100644 index 0000000000..5db53c8026 --- /dev/null +++ b/src/cpu/operators/CpuElementwise.h @@ -0,0 +1,186 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_ELEMENTWISE_H +#define ARM_COMPUTE_CPU_ELEMENTWISE_H + +#include "src/cpu/ICpuOperator.h" + +namespace arm_compute +{ +namespace cpu +{ +class CpuElementwiseBase : public ICpuOperator +{ +public: + // Inherited methods overridden: + void run(ITensorPack &tensors) override; +}; +/** Class to run @ref cpu::kernels::CpuArithmeticKernel except for division and power + * + * @note Max/Min/Squared difference supports input data type of QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32 + * @note PRelu supports inpute data type of QASYMM8/QASYMM8_SIGNED/F16/F32. + */ +template <ArithmeticOperation op> +class CpuElementwiseArithmetic : public CpuElementwiseBase +{ +public: + /** Configure the operator + * + * @param[in] src0 The first source tensor information. + * @param[in] src1 The second source tensor information. With PRelu, this is used as alpha tensor. + * @param[out] dst The output tensor information. 
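An illustrative sketch of driving one of these stateless arithmetic operators, here CpuElementwiseMax, using the ACL_SRC_0/ACL_SRC_1/ACL_DST pack slots that CpuElementwiseBase::run() expects (shapes are hypothetical and tensor allocation is omitted):

    TensorInfo a(TensorShape(32U, 4U), 1, DataType::F32);
    TensorInfo b(TensorShape(32U, 4U), 1, DataType::F32);
    TensorInfo out(TensorShape(32U, 4U), 1, DataType::F32);

    cpu::CpuElementwiseMax max_op;
    ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuElementwiseMax::validate(&a, &b, &out));
    max_op.configure(&a, &b, &out);

    // src0, src1 and dst are ITensor objects backed by the infos above (allocation omitted)
    ITensorPack pack{{ACL_SRC_0, &src0}, {ACL_SRC_1, &src1}, {ACL_DST, &dst}};
    max_op.run(pack);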
+ */ + void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuElementwiseArithmetic::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst); +}; + +/** Class to run @ref cpu::kernels::CpuArithmeticKernel except for maximum operation */ +using CpuElementwiseMax = CpuElementwiseArithmetic<ArithmeticOperation::MAX>; +/** Class to run @ref cpu::kernels::CpuArithmeticKernel except for minimum operation */ +using CpuElementwiseMin = CpuElementwiseArithmetic<ArithmeticOperation::MIN>; +/** Class to run @ref cpu::kernels::CpuArithmeticKernel except for squared difference operation */ +using CpuElementwiseSquaredDiff = CpuElementwiseArithmetic<ArithmeticOperation::SQUARED_DIFF>; + +/** Basic function to run @ref cpu::kernels::CpuArithmeticKernel for division + * + * @note The tensor data type for the inputs must be S32/F16/F32. + * @note The function performs a division operation between two tensors (i.e., out[i] = in1[i] / in2[i]) + */ +class CpuElementwiseDivision : public CpuElementwiseBase +{ +public: + /** Initialise the kernel's inputs, dst and conversion policy. + * + * @param[in, out] src0 First tensor input info. Data types supported: S32/F16/F32. + * @param[in, out] src1 Second tensor input info. Data types supported: Same as @p src0. + * @param[out] dst Output tensor info. Data types supported: Same as @p src0. + */ + void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuElementwiseDivision::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst); +}; + +/** Basic function to run @ref cpu::kernels::CpuArithmeticKernel for power + * + * @note The tensor data type for the inputs must be F16/F32. + * @note The function performs a elementwise power of in1 to in2 (i.e., out[i] = in1[i] ^ in2[i]) + * @note For an exponent that is a float, this function will only work with a positive base. + */ +class CpuElementwisePower : public CpuElementwiseBase +{ +public: + /** Initialise the kernel's inputs, dst and conversion policy. + * + * @param[in, out] src0 First tensor input info. Data types supported: F16/F32. + * @param[in, out] src1 Second tensor input info. Data types supported: Same as @p src0. + * @param[out] dst Output tensor info. Data types supported: Same as @p src0. + */ + void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuElementwisePower::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst); +}; + +/** Basic function to run @ref cpu::kernels::CpuComparisonKernel. + * + * @note The tensor data type for the inputs must be QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. + * @note The function performs a comparison operation between two tensors. + */ +class CpuElementwiseComparison : public CpuElementwiseBase +{ +public: + /** Initialise the kernel's inputs, dst and conversion policy. + * + * @param[in, out] src0 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. 
+ * @param[in, out] src1 Second tensor input info. Data types supported: Same as @p src0. + * @param[out] dst Output tensor info. Data types supported: U16/U32. + * @param[in] op Comparison Operation to be performed. + */ + void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ComparisonOperation op); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuElementwiseComparison::configure() + * + * @return a status + */ + static Status + validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ComparisonOperation op); +}; + +/** Basic function to run @ref cpu::kernels::CpuComparisonKernel + * + * @note The tensor data type for the inputs must be QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. + * @note The function performs a comparison operation between two tensors. + */ +template <ComparisonOperation op> +class CpuElementwiseComparisonStatic : public CpuElementwiseBase +{ +public: + /** Initialise the kernel's inputs, dst and conversion policy. + * + * @param[in, out] src0 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. + * @param[in, out] src1 Second tensor input info. Data types supported: Same as @p src0. + * @param[out] dst Output tensor info. Data types supported: U16/U32. + */ + void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuElementwiseComparisonStatic::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst); +}; + +/** Basic function to run equal comparison. */ +using NEEqual = CpuElementwiseComparisonStatic<ComparisonOperation::Equal>; +/** Basic function to run not equal comparison. */ +using NENotEqual = CpuElementwiseComparisonStatic<ComparisonOperation::NotEqual>; +/** Basic function to run greater comparison. */ +using NEGreater = CpuElementwiseComparisonStatic<ComparisonOperation::Greater>; +/** Basic function to run greater-equal comparison. */ +using NEGreaterEqual = CpuElementwiseComparisonStatic<ComparisonOperation::GreaterEqual>; +/** Basic function to run less comparison. */ +using NELess = CpuElementwiseComparisonStatic<ComparisonOperation::Less>; +/** Basic function to run less-equal comparison. */ +using NELessEqual = CpuElementwiseComparisonStatic<ComparisonOperation::LessEqual>; +} // namespace cpu +} // namespace arm_compute + +#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_H */ diff --git a/src/cpu/operators/CpuElementwiseUnary.cpp b/src/cpu/operators/CpuElementwiseUnary.cpp new file mode 100644 index 0000000000..04ab7bf8f5 --- /dev/null +++ b/src/cpu/operators/CpuElementwiseUnary.cpp @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/operators/CpuElementwiseUnary.h" + +#include "src/common/utils/Log.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/cpu/kernels/CpuElementwiseUnaryKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +using KernelType = kernels::CpuElementwiseUnaryKernel; + +void CpuElementwiseUnary::configure(ElementWiseUnary op, const ITensorInfo &src, ITensorInfo &dst) +{ + ARM_COMPUTE_LOG_PARAMS(op, src, dst); + auto k = std::make_unique<KernelType>(); + k->configure(op, src, dst); + _kernel = std::move(k); +} + +Status CpuElementwiseUnary::validate(ElementWiseUnary op, const ITensorInfo &src, const ITensorInfo &dst) +{ + return KernelType::validate(op, src, dst); +} + +void CpuElementwiseUnary::run(ITensorPack &tensors) +{ + if (_kernel->is_window_configured()) + { + ICpuOperator::run(tensors); + return; + } + + auto src_info = tensors.get_const_tensor(TensorType::ACL_SRC)->info(); + ICpuOperator::run(tensors, compute_output_shape_and_window(src_info->tensor_shape()).second); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuElementwiseUnary.h b/src/cpu/operators/CpuElementwiseUnary.h new file mode 100644 index 0000000000..1e51bfaa1c --- /dev/null +++ b/src/cpu/operators/CpuElementwiseUnary.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_H +#define ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_H + +#include "arm_compute/core/Types.h" + +#include "src/cpu/ICpuOperator.h" + +namespace arm_compute +{ +namespace cpu +{ +class CpuElementwiseUnary : public ICpuOperator +{ +public: + /** Initialize the function + * + * @param[in] op Unary operation to execute + * @param[in] src Input tensor information. Data types supported: F16/F32, F16/F32/S32 for NEG/ABS operations. + * @param[out] dst Output tensor information. Data types supported: Same as @p src. 
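A short sketch of the same configure/run pattern for the unary operator declared above, assuming src_info and dst_info are F32 tensor descriptors and src/dst the corresponding allocated tensors:

    cpu::CpuElementwiseUnary neg_op;
    ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuElementwiseUnary::validate(ElementWiseUnary::NEG, src_info, dst_info));
    neg_op.configure(ElementWiseUnary::NEG, src_info, dst_info);

    ITensorPack pack{{ACL_SRC, &src}, {ACL_DST, &dst}};
    neg_op.run(pack);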
+ */ + void configure(ElementWiseUnary op, const ITensorInfo &src, ITensorInfo &dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuElementwiseUnary::configure() + * + * @return a status + */ + static Status validate(ElementWiseUnary op, const ITensorInfo &src, const ITensorInfo &dst); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; +}; + +} // namespace cpu +} // namespace arm_compute + +#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_H */ diff --git a/src/cpu/operators/CpuFill.cpp b/src/cpu/operators/CpuFill.cpp new file mode 100644 index 0000000000..1890d0b916 --- /dev/null +++ b/src/cpu/operators/CpuFill.cpp @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/operators/CpuFill.h" + +#include "src/common/utils/Log.h" +#include "src/cpu/kernels/CpuFillKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +void CpuFill::configure(const ITensorInfo *tensor, PixelValue constant_value) +{ + ARM_COMPUTE_LOG_PARAMS(tensor, constant_value); + auto k = std::make_unique<kernels::CpuFillKernel>(); + k->configure(tensor, constant_value); + _kernel = std::move(k); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuFill.h b/src/cpu/operators/CpuFill.h new file mode 100644 index 0000000000..cb83745d29 --- /dev/null +++ b/src/cpu/operators/CpuFill.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2021,2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_FILL_H +#define ARM_COMPUTE_CPU_FILL_H + +#include "arm_compute/core/PixelValue.h" + +#include "src/cpu/ICpuOperator.h" + +namespace arm_compute +{ +namespace cpu +{ +/** Basic function to run @ref kernels::CpuFillKernel */ +class CpuFill : public ICpuOperator +{ +public: + /** Configure operator for a given list of arguments + * + * @param[in,out] tensor Tensor to fill. Supported data types: All + * @param[in] constant_value The value used to fill the planes of the tensor + */ + void configure(const ITensorInfo *tensor, PixelValue constant_value); +}; +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_FILL_H */ diff --git a/src/cpu/operators/CpuFlatten.cpp b/src/cpu/operators/CpuFlatten.cpp new file mode 100644 index 0000000000..2609d44590 --- /dev/null +++ b/src/cpu/operators/CpuFlatten.cpp @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2021, 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/operators/CpuFlatten.h" + +#include "src/common/utils/Log.h" +#include "src/cpu/operators/CpuReshape.h" + +namespace arm_compute +{ +namespace cpu +{ +CpuFlatten::CpuFlatten() : _reshape(nullptr) +{ +} + +CpuFlatten::~CpuFlatten() = default; + +void CpuFlatten::configure(const ITensorInfo *src, ITensorInfo *dst) +{ + ARM_COMPUTE_LOG_PARAMS(src, dst); + _reshape = std::make_unique<CpuReshape>(); + _reshape->configure(src, dst); +} + +Status CpuFlatten::validate(const ITensorInfo *src, const ITensorInfo *dst) +{ + return CpuReshape::validate(src, dst); +} + +void CpuFlatten::run(ITensorPack &tensors) +{ + _reshape->run(tensors); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuFlatten.h b/src/cpu/operators/CpuFlatten.h new file mode 100644 index 0000000000..911760dd95 --- /dev/null +++ b/src/cpu/operators/CpuFlatten.h @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2021, 2023 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_CPU_OPERATORS_CPUFLATTEN_H +#define ACL_SRC_CPU_OPERATORS_CPUFLATTEN_H + +#include "src/cpu/ICpuOperator.h" + +namespace arm_compute +{ +namespace cpu +{ +class CpuReshape; +/** Basic function to flatten a given input */ +class CpuFlatten : public ICpuOperator +{ +public: + /** Constructor */ + CpuFlatten(); + /** Destructor */ + ~CpuFlatten(); + /** Configure operator for a given list of arguments + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |All |All | + * + * @param[in] src Source tensor to flatten with at least 3 dimensions. + * The dimensions above the third will be interpreted as batches. Data types supported: All + * @param[in] dst Destination tensor with shape [w*h*d, input_batches] where: + * w = width input tensor, h = height input tensor and d = depth input tensor. + * Data type supported: same as @p src + */ + void configure(const ITensorInfo *src, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuFlatten::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; + +private: + std::unique_ptr<CpuReshape> _reshape; +}; +} // namespace cpu +} // namespace arm_compute +#endif // ACL_SRC_CPU_OPERATORS_CPUFLATTEN_H diff --git a/src/cpu/operators/CpuFloor.cpp b/src/cpu/operators/CpuFloor.cpp new file mode 100644 index 0000000000..a107393b01 --- /dev/null +++ b/src/cpu/operators/CpuFloor.cpp @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
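For reference, CpuFlatten above simply forwards to CpuReshape, so flattening a [w, h, d, batches] input amounts to reshaping it to [w*h*d, batches]; a sketch of that shape bookkeeping with hypothetical dimensions:

    TensorInfo src(TensorShape(8U, 8U, 16U, 4U), 1, DataType::F32);   // w=8, h=8, d=16, 4 batches
    TensorInfo dst(TensorShape(8U * 8U * 16U, 4U), 1, DataType::F32); // [w*h*d, batches] = [1024, 4]

    cpu::CpuFlatten flatten;
    ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuFlatten::validate(&src, &dst));
    flatten.configure(&src, &dst);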
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/operators/CpuFloor.h" + +#include "src/common/utils/Log.h" +#include "src/cpu/kernels/CpuFloorKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +void CpuFloor::configure(const ITensorInfo *src, ITensorInfo *dst) +{ + ARM_COMPUTE_LOG_PARAMS(src, dst); + auto k = std::make_unique<kernels::CpuFloorKernel>(); + k->configure(src, dst); + _kernel = std::move(k); +} + +Status CpuFloor::validate(const ITensorInfo *src, const ITensorInfo *dst) +{ + return kernels::CpuFloorKernel::validate(src, dst); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuFloor.h b/src/cpu/operators/CpuFloor.h new file mode 100644 index 0000000000..6082f98867 --- /dev/null +++ b/src/cpu/operators/CpuFloor.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_FLOOR_H +#define ARM_COMPUTE_CPU_FLOOR_H + +#include "src/cpu/ICpuOperator.h" + +namespace arm_compute +{ +namespace cpu +{ +/** Basic function to run @ref kernels::CpuFloorKernel */ +class CpuFloor : public ICpuOperator +{ +public: + /** Configure operator for a given list of arguments + * + * @param[in] src Source tensor info. Data types supported: F16/F32. + * @param[in] dst Destination tensor info. Data type supported: same as @p src + */ + void configure(const ITensorInfo *src, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuFloor::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst); +}; +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_FLOOR_H */ diff --git a/src/cpu/operators/CpuFullyConnected.cpp b/src/cpu/operators/CpuFullyConnected.cpp new file mode 100644 index 0000000000..85a0b0311b --- /dev/null +++ b/src/cpu/operators/CpuFullyConnected.cpp @@ -0,0 +1,590 @@ +/* + * Copyright (c) 2021-2023 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/operators/CpuFullyConnected.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensorPack.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/MemoryHelpers.h" +#include "src/core/utils/quantization/AsymmHelpers.h" +#include "src/cpu/kernels/CpuTransposeKernel.h" +#include "src/cpu/operators/CpuConvertFullyConnectedWeights.h" +#include "src/cpu/operators/CpuFlatten.h" +#include "src/cpu/operators/CpuGemm.h" +#include "src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h" +#include "src/cpu/utils/CpuAuxTensorHandler.h" + +namespace arm_compute +{ +namespace cpu +{ +using namespace arm_compute::experimental; +using namespace arm_compute::misc::shape_calculator; + +namespace +{ +Status get_gemmlowp_output_stage_info(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const ActivationLayerInfo &act, + GEMMLowpOutputStageInfo &gemmlowp_output_stage_info) +{ + const auto data_type = src->data_type(); + const QuantizationInfo oq_info = dst->quantization_info(); + const UniformQuantizationInfo iq_unif = src->quantization_info().uniform(); + const UniformQuantizationInfo wq_unif = weights->quantization_info().uniform(); + const UniformQuantizationInfo oq_unif = oq_info.uniform(); + + float multiplier = (iq_unif.scale * wq_unif.scale) / oq_unif.scale; + int32_t output_multiplier; + int32_t output_shift; + + ARM_COMPUTE_RETURN_ON_ERROR( + quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); + + int32_t type_min = 0; + int32_t type_max = 0; + std::tie(type_min, type_max) = quantization::get_quantized_asymmetric_output_min_max(oq_info, act, data_type); + + gemmlowp_output_stage_info.gemmlowp_multiplier = output_multiplier; + gemmlowp_output_stage_info.gemmlowp_shift = output_shift; + gemmlowp_output_stage_info.gemmlowp_offset = oq_unif.offset; + gemmlowp_output_stage_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; + gemmlowp_output_stage_info.gemmlowp_min_bound = type_min; + gemmlowp_output_stage_info.gemmlowp_max_bound = type_max; + + return Status{}; +} + +Status validate_mm(const ITensorInfo 
*src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const ActivationLayerInfo &act, + bool enable_fast_math, + WeightFormat weight_format) +{ + if (is_data_type_quantized_asymmetric(src->data_type())) + { + // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() + // Extract and negate src and weights offset + const QuantizationInfo src_quantization_info(src->quantization_info().uniform().scale, + -src->quantization_info().uniform().offset); + const QuantizationInfo weights_quantization_info(weights->quantization_info().uniform().scale, + -weights->quantization_info().uniform().offset); + + GEMMLowpOutputStageInfo gemmlowp_output_stage_info; + ARM_COMPUTE_RETURN_ON_ERROR(get_gemmlowp_output_stage_info(src, weights, dst, act, gemmlowp_output_stage_info)); + + GEMMInfo gemm_info; + gemm_info.set_gemmlowp_output_stage(gemmlowp_output_stage_info); + gemm_info.set_fast_math(enable_fast_math); + + // Validate gemmlowp function + TensorInfo src_info = src->clone()->set_quantization_info(src_quantization_info); + TensorInfo weights_info = weights->clone()->set_quantization_info(weights_quantization_info); + ARM_COMPUTE_RETURN_ON_ERROR( + CpuGemmLowpMatrixMultiplyCore::validate(&src_info, &weights_info, biases, dst, gemm_info)); + } + else + { + GEMMInfo gemm_info; + gemm_info.set_weight_format(weight_format); + gemm_info.set_fixed_format(weight_format != WeightFormat::UNSPECIFIED); + gemm_info.set_fast_math(enable_fast_math); + ARM_COMPUTE_RETURN_ON_ERROR(CpuGemm::validate(src, weights, biases, dst, 1.f, 1.0f, gemm_info)); + } + + return Status{}; +} +} // namespace + +CpuFullyConnected::CpuFullyConnected() + : _flatten(nullptr), + _convert_weights(nullptr), + _transpose_weights(nullptr), + _mm_gemm(nullptr), + _mm_gemmlowp(nullptr), + _flattened_src(), + _converted_weights(), + _reshaped_weights(), + _trans_weights(), + _trans_weights_idx(AuxTensorIdx::Count), + _aux_mem(Count), + _needs_weights_conversion(false), + _needs_weights_reshape(false), + _is_fc_after_conv(false), + _is_quantized_asymmetric(false), + _is_prepared(false), + _enable_fast_math(false), + _fixed_format(false), + _weight_format(arm_compute::WeightFormat::UNSPECIFIED), + _dynamic_weights(false) +{ +} + +CpuFullyConnected::~CpuFullyConnected() = default; + +void CpuFullyConnected::configure_mm(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ActivationLayerInfo &act) +{ + if (_is_quantized_asymmetric) + { + // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() + // Extract and negate src and weights offset + const QuantizationInfo src_quantization_info(src->quantization_info().uniform().scale, + -src->quantization_info().uniform().offset); + const QuantizationInfo weights_quantization_info(weights->quantization_info().uniform().scale, + -weights->quantization_info().uniform().offset); + + TensorInfo src_info = src->clone()->set_quantization_info(src_quantization_info); + TensorInfo weights_info = weights->clone()->set_quantization_info(weights_quantization_info); + + // Configure gemmlowp function and output stage for asymmetric quantized types + GEMMLowpOutputStageInfo gemmlowp_output_stage_info; + const Status status = + get_gemmlowp_output_stage_info(&src_info, &weights_info, dst, act, gemmlowp_output_stage_info); + ARM_COMPUTE_ERROR_ON(status.error_code() != ErrorCode::OK); + + GEMMInfo gemm_info; + 
gemm_info.set_gemmlowp_output_stage(gemmlowp_output_stage_info); + gemm_info.set_activation_info(act); + gemm_info.set_fast_math(_enable_fast_math); + _mm_gemmlowp = std::make_unique<CpuGemmLowpMatrixMultiplyCore>(); + _mm_gemmlowp->configure(&src_info, &weights_info, biases, dst, gemm_info); + } + else + { + // Configure matrix multiply kernel + GEMMInfo gemm_info; + gemm_info.set_activation_info(act); + gemm_info.set_fast_math(_enable_fast_math); + gemm_info.set_fixed_format(_fixed_format); + gemm_info.set_weight_format(_weight_format); + _mm_gemm = std::make_unique<CpuGemm>(); + _mm_gemm->configure(src, weights, biases, dst, 1.f, 1.0f, gemm_info); + } +} + +void CpuFullyConnected::configure_conv_fc(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ActivationLayerInfo &act) +{ + ARM_COMPUTE_ERROR_ON((weights->dimension(1) != (src->dimension(0) * src->dimension(1) * src->dimension(2)))); + + // If the fully connected layer is called after a convolution layer, the src tensor must be linearized + + // Initialize output tensor for flatten + auto_init_if_empty(_flattened_src, src->clone()->set_tensor_shape(compute_flatten_shape(src))); + + _flatten = std::make_unique<CpuFlatten>(); + _flatten->configure(src, &_flattened_src); + + // Configure matrix multiply kernel + configure_mm(&_flattened_src, weights, biases, dst, act); +} + +void CpuFullyConnected::configure_fc_fc(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ActivationLayerInfo &act) +{ + ARM_COMPUTE_ERROR_ON(src->dimension(0) != weights->dimension(1)); + + // Configure matrix multiply kernel + configure_mm(src, weights, biases, dst, act); +} + +void CpuFullyConnected::configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + FullyConnectedLayerInfo fc_info, + const WeightsInfo &weights_info) +{ + // Perform validate step + ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); + ARM_COMPUTE_ERROR_THROW_ON( + CpuFullyConnected::validate(src, weights, biases != nullptr ? biases : nullptr, dst, fc_info, weights_info)); + ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, fc_info); + + _needs_weights_conversion = false; + _needs_weights_reshape = fc_info.transpose_weights ? 
!fc_info.are_weights_reshaped : false; + _needs_weights_reshape = _needs_weights_reshape && !fc_info.retain_internal_weights; + _is_fc_after_conv = true; + _is_quantized_asymmetric = is_data_type_quantized_asymmetric(src->data_type()); + _is_prepared = false; + _trans_weights_idx = AuxTensorIdx::Count; + _enable_fast_math = fc_info.enable_fast_math; + _fixed_format = weights_info.weight_format() != WeightFormat::UNSPECIFIED; + _weight_format = weights_info.weight_format(); + _dynamic_weights = !weights->are_values_constant() && _needs_weights_reshape; + + // With the Fully Connected layer we can have 4 different cases: + // 1) Convolution layer -> Fully Connected layer without batches + // 2) Fully Connected layer -> Fully Connected layer without batches + // 3) Convolution layer -> Fully Connected layer with batches + // 4) Fully Connected layer -> Fully Connected layer with batches + + const ITensorInfo *weights_to_use = weights; + + // Check if we have a fully connected layer with batches + const bool is_batched_fc_layer = dst->dimension(1) > 1; + if (is_batched_fc_layer) + { + _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && + (std::equal(src->tensor_shape().cbegin() + 3, src->tensor_shape().cend(), + dst->tensor_shape().cbegin() + 1)); + } + else + { + _is_fc_after_conv = src->num_dimensions() > 1; + } + + // Reshape weights if needed + if (_needs_weights_reshape) + { + // Reshape the weights + _transpose_weights = std::make_unique<kernels::CpuTransposeKernel>(); + _transpose_weights->configure(weights, &_reshaped_weights); + _reshaped_weights.set_are_values_constant(weights->are_values_constant()); + + weights_to_use = &_reshaped_weights; + _trans_weights_idx = AuxTensorIdx::TransposedWeights; + } + + // Convert weights if needed + if (_is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout)) + { + // Convert weights + _convert_weights = std::make_unique<CpuConvertFullyConnectedWeights>(); + _convert_weights->configure(weights_to_use, &_converted_weights, src->tensor_shape(), + fc_info.weights_trained_layout); + _converted_weights.set_are_values_constant(weights_to_use->are_values_constant()); + + weights_to_use = &_converted_weights; + _needs_weights_conversion = true; + _trans_weights_idx = AuxTensorIdx::ConvertedWeights; + } + + if (_is_fc_after_conv) + { + // Fully Connected layer after a Convolution Layer without batches + configure_conv_fc(src, weights_to_use, biases, dst, fc_info.activation_info); + } + else + { + // Fully Connected layer after a Fully Connected Layer without batches + configure_fc_fc(src, weights_to_use, biases, dst, fc_info.activation_info); + } + + // Retain the tensorinfo with the weights to use + if (_needs_weights_reshape || _needs_weights_conversion) + { + _trans_weights = *weights_to_use; + } + + // Set auxiliary memory requirements + auto gemm_mem_req = (_is_quantized_asymmetric) ? _mm_gemmlowp->workspace() : _mm_gemm->workspace(); + for (unsigned int i = 0; i < gemm_mem_req.size(); ++i) + { + _aux_mem[i] = gemm_mem_req[i]; + } + + if (_aux_mem[Pretranspose].size > 0) + { + // Release permuted weights at the end of prepare as they are further transposed by the assembly dispatch + // Do not release them if biases are dynamic and data type is quantized, since the weights tensor will be used for biases offset calculation + // Keep all the auxiliary tensors in case of dynamic weights as they are recalculated every time. + _aux_mem[TransposedWeights] = MemoryInfo( + offset_int_vec(TransposedWeights), + _dynamic_weights ? 
MemoryLifetime::Temporary + : (_is_quantized_asymmetric && biases && !(biases->are_values_constant())) ? MemoryLifetime::Persistent + : MemoryLifetime::Prepare, + _reshaped_weights.total_size()); + + _aux_mem[ConvertedWeights] = MemoryInfo(offset_int_vec(ConvertedWeights), + _dynamic_weights ? MemoryLifetime::Temporary : MemoryLifetime::Prepare, + _converted_weights.total_size()); + } + else + { + _aux_mem[TransposedWeights] = MemoryInfo(offset_int_vec(TransposedWeights), + _dynamic_weights ? MemoryLifetime::Temporary + : _needs_weights_conversion ? MemoryLifetime::Prepare + : MemoryLifetime::Persistent, + _reshaped_weights.total_size()); + + _aux_mem[ConvertedWeights] = MemoryInfo( + offset_int_vec(ConvertedWeights), _dynamic_weights ? MemoryLifetime::Temporary : MemoryLifetime::Persistent, + _converted_weights.total_size()); + } + _aux_mem[FlattenedSrc] = + MemoryInfo(offset_int_vec(FlattenedSrc), MemoryLifetime::Temporary, _flattened_src.total_size()); +} + +Status CpuFullyConnected::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, + const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + FullyConnectedLayerInfo fc_info, + WeightsInfo weights_info) +{ + GEMMInfo gemm_info; + gemm_info.set_activation_info(fc_info.activation_info); + gemm_info.set_fast_math(fc_info.enable_fast_math); + gemm_info.set_fixed_format(weights_info.weight_format() != WeightFormat::UNSPECIFIED); + gemm_info.set_weight_format(weights_info.weight_format()); + + return CpuGemm::has_opt_impl(expected_weight_format, src, weights, biases, dst, gemm_info); +} + +Status CpuFullyConnected::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + FullyConnectedLayerInfo fc_info, + const WeightsInfo &weights_info) +{ + ARM_COMPUTE_UNUSED(fc_info.retain_internal_weights); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); + + if (is_fixed_format_fast_math(weights_info.weight_format())) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(src, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(weights, DataType::BFLOAT16); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(dst, DataType::F32); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights, dst); + } + + ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2); + ARM_COMPUTE_RETURN_ERROR_ON( + fc_info.activation_info.enabled() && is_data_type_quantized(src->data_type()) && + fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU && + fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU && + fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU); + + bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true; + bool is_fc_after_conv = true; + + const ITensorInfo &flatten_src = + TensorInfo(src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(src))); + const ITensorInfo &reshaped_weights = TensorInfo( + weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights))); + const ITensorInfo &converted_weights = weights_reshaped + ? 
TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) + : TensorInfo(*reshaped_weights.clone()); + + // With the Fully Connected layer we can have 4 different cases: + // 1) Convolution layer -> Fully Connected layer without batches + // 2) Fully Connected layer -> Fully Connected layer without batches + // 3) Convolution layer -> Fully Connected layer with batches + // 4) Fully Connected layer -> Fully Connected layer with batches + + const ITensorInfo *src_to_use = src; + const ITensorInfo *weights_to_use = weights; + + // Check if we have a fully connected layer with batches + const bool is_batched_fc_layer = dst->dimension(1) > 1; + + if (biases != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); + if (is_data_type_quantized(src->data_type())) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases); + } + } + + if (is_batched_fc_layer) + { + is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && + (std::equal(src->tensor_shape().cbegin() + 3, src->tensor_shape().cend(), + dst->tensor_shape().cbegin() + 1)); + } + else + { + is_fc_after_conv = src->num_dimensions() > 1; + } + + if (!weights_reshaped) + { + // Validate reshape weights kernel + ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuTransposeKernel::validate(weights, &reshaped_weights)); + weights_to_use = &reshaped_weights; + } + + if (is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout)) + { + // Validate convert weights kernel + ARM_COMPUTE_RETURN_ON_ERROR(CpuConvertFullyConnectedWeights::validate( + weights_to_use, &converted_weights, src->tensor_shape(), fc_info.weights_trained_layout)); + weights_to_use = &converted_weights; + } + + if (is_fc_after_conv) + { + // Fully Connected layer after a Convolution Layer without batches + ARM_COMPUTE_RETURN_ERROR_ON( + (weights_to_use->dimension(1) != (src->dimension(0) * src->dimension(1) * src->dimension(2)))); + + // Validate flatten kernel + ARM_COMPUTE_RETURN_ON_ERROR(CpuFlatten::validate(src, &flatten_src)); + src_to_use = &flatten_src; + } + else + { + // Fully Connected layer after a Fully Connected Layer without batches + ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != weights_to_use->dimension(1)); + } + // Validate matrix multiply kernel + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(src_to_use, weights_to_use, biases, dst, fc_info.activation_info, + fc_info.enable_fast_math, weights_info.weight_format())); + + return Status{}; +} + +void CpuFullyConnected::run(ITensorPack &tensors) +{ + prepare(tensors); + +#ifdef ARM_COMPUTE_ASSERTS_ENABLED + ++_asrt_run_count; + ARM_COMPUTE_ERROR_ON(_dynamic_weights && _asrt_prepare_count != _asrt_run_count); +#endif // ARM_COMPUTE_ASSERTS_ENABLED + + auto src = tensors.get_const_tensor(ACL_SRC_0); + + CpuAuxTensorHandler flattened_src(offset_int_vec(FlattenedSrc), _flattened_src, tensors, false); + CpuAuxTensorHandler transformed_wei(offset_int_vec(_trans_weights_idx), _trans_weights, tensors, false); + + // Linearize src if it comes from a convolutional layer + if (_is_fc_after_conv) + { + ITensorPack flatten_pack{{ACL_SRC, src}, {ACL_DST, flattened_src.get()}}; + _flatten->run(flatten_pack); + } + + ITensorPack gemm_pack = tensors; + gemm_pack.add_const_tensor(ACL_SRC_0, (_is_fc_after_conv) ? 
flattened_src.get() : src); + if (_needs_weights_reshape || _needs_weights_conversion) + { + gemm_pack.add_const_tensor(ACL_SRC_1, transformed_wei.get()); + } + + // Run matrix multiply + if (_is_quantized_asymmetric) + { + _mm_gemmlowp->run(gemm_pack); + } + else + { + _mm_gemm->run(gemm_pack); + } +} + +void CpuFullyConnected::prepare(ITensorPack &tensors) +{ + if (!_is_prepared || _dynamic_weights) + { +#ifdef ARM_COMPUTE_ASSERTS_ENABLED + ++_asrt_prepare_count; + ARM_COMPUTE_ERROR_ON(!_dynamic_weights && _asrt_prepare_count > 1); +#endif // ARM_COMPUTE_ASSERTS_ENABLED + + auto weights = tensors.get_const_tensor(ACL_SRC_1); + + CpuAuxTensorHandler reshaped_weights(offset_int_vec(TransposedWeights), _reshaped_weights, tensors, false); + CpuAuxTensorHandler converted_weights(offset_int_vec(ConvertedWeights), _converted_weights, tensors, false); + + // Pointer to current weights + const ITensor *cur_weights = weights; + + // Reshape of the weights (happens only once) + if (_needs_weights_reshape) + { + // Run reshape weights kernel and mark weights as unused + ITensorPack transpose_pack{{ACL_SRC, weights}, {ACL_DST, reshaped_weights.get()}}; + NEScheduler::get().schedule_op(_transpose_weights.get(), Window::DimY, _transpose_weights->window(), + transpose_pack); + + cur_weights->mark_as_unused(); + cur_weights = reshaped_weights.get(); + } + + // Convert weights if needed (happens only once) + if (_needs_weights_conversion) + { + ITensorPack convert_pack{{ACL_SRC, cur_weights}, {ACL_DST, converted_weights.get()}}; + _convert_weights->run(convert_pack); + + cur_weights->mark_as_unused(); + cur_weights = converted_weights.get(); + } + + ITensorPack gemm_pack = tensors; + gemm_pack.add_const_tensor(ACL_SRC_1, cur_weights); + + // Prepare GEMM prepare and release unused weights + if (!_is_quantized_asymmetric) + { + _mm_gemm->prepare(gemm_pack); + } + else + { + _mm_gemmlowp->prepare(gemm_pack); + } + + _is_prepared = true; + } +} + +experimental::MemoryRequirements CpuFullyConnected::workspace() const +{ + return _aux_mem; +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuFullyConnected.h b/src/cpu/operators/CpuFullyConnected.h new file mode 100644 index 0000000000..b72f77e5c4 --- /dev/null +++ b/src/cpu/operators/CpuFullyConnected.h @@ -0,0 +1,194 @@ +/* + * Copyright (c) 2021-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
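An illustrative configuration sketch for the fully connected operator implemented above, assuming the caller owns the tensor descriptors (src_info, wei_info, bia_info, dst_info) and handles the auxiliary-buffer plumbing reported by workspace():

    cpu::CpuFullyConnected fc;

    FullyConnectedLayerInfo fc_info;
    fc_info.transpose_weights = true; // weights arrive untransposed and are reshaped once during prepare()
    fc_info.activation_info   = ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU);

    ARM_COMPUTE_ERROR_THROW_ON(
        cpu::CpuFullyConnected::validate(&src_info, &wei_info, &bia_info, &dst_info, fc_info));
    fc.configure(&src_info, &wei_info, &bia_info, &dst_info, fc_info);

    // fc.workspace() describes the auxiliary tensors (transposed/converted weights, flattened src, GEMM
    // scratch) that must be bound into the ITensorPack used for prepare() and run().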
+ */ +#ifndef ACL_SRC_CPU_OPERATORS_CPUFULLYCONNECTED_H +#define ACL_SRC_CPU_OPERATORS_CPUFULLYCONNECTED_H + +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/function_info/FullyConnectedLayerInfo.h" + +#include "src/cpu/ICpuOperator.h" + +#include <memory> + +namespace arm_compute +{ +namespace cpu +{ +// Forward declarations +class CpuConvertFullyConnectedWeights; +class CpuFlatten; +class CpuGemm; +class CpuGemmLowpMatrixMultiplyCore; +namespace kernels +{ +class CpuTransposeKernel; +} // namespace kernels +/** Basic function to compute a Fully Connected layer. This function calls the following kernels: + * -# @ref kernels::CpuIm2ColKernel (called when the input comes from a convolutional layer) + * -# @ref kernels::CpuTransposeKernel (if @p are_weights_reshaped is set to false and transpose_weights is set to true ) (called once) + * -# @ref CpuGemm or @ref CpuGemmLowpMatrixMultiplyCore (if quantized asymmetric) + * -# @ref kernels::CpuGemmMatrixAdditionKernel or @ref CpuGemmLowpOutputStage (if quantized asymmetric) (if @p biases is not equal to nullptr) + * + * @note The fully connected layer accepts "weights" tensors only with 2 dimensions. + */ +class CpuFullyConnected : public ICpuOperator +{ +public: + /** Constructor */ + CpuFullyConnected(); + /** Destructor */ + ~CpuFullyConnected(); + /** Set the input and output tensors. + * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:--------------|:------------------|:------|:--------------| + * |F16 |F16 |F16 |F16 | + * |F32 |F32 |F32 |F32 | + * |QASYMM8 |QASYMM8 |S32 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED | + * + * @param[in] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[in] weights Weights tensor info. The weights must be 2 dimensional. + * If this function is called after a Convolution Layer, the (transposed) weights will have as many rows as the product of the first 3 input's dimensions. + * If it is called after another FullyConnected Layer, the (transposed) weights will have as many rows as the input's first dimension. + * Data type supported: Same as @p src. + * @param[in] biases Bias tensor info. Can be nullptr. Data type supported: Same as @p weights, S32 if @p weights is QASYMM8/QASYMM8_SIGNED. + * @param[out] dst Destination tensor info. Its shape should be equal to the output of a matrix multiplication between: + * - The output of im2col on the input and the (transposed) 2D weights, if the function is called after a Convolution Layer + * - The input tensor and the (transposed) 2D weights, if the function is called after another FullyConnected Layer. + * Data type supported: Same as @p src. 
+ * @param[in] fc_info (Optional) Fully connected layer additional info + * @param[in] weights_info (Optional) Stores neccessary compute information when weights are already reshaped + */ + void configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(), + const WeightsInfo &weights_info = WeightsInfo()); + /** Static function to check if given info will lead to a valid configuration of @ref CpuFullyConnected + * + * Similar to @ref CpuFullyConnected::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(), + const WeightsInfo &weights_info = WeightsInfo()); + + /** Static function that queries whether there exists fixed-format kernel and if it exists it will return in the first argument in what format + * weights are expected to be reshaped as defined by WeightFormat class. Apart from the first argument the rest of the arguments are the same + * as in @ref CpuFullyConnectedLayer::validate() except that all arguments are required. + * + * @return a status + */ + static Status has_opt_impl(arm_compute::WeightFormat &expected_weight_format, + const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + FullyConnectedLayerInfo fc_info, + WeightsInfo weights_info); + + //Inherited methods override + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &tensors) override; + experimental::MemoryRequirements workspace() const override; + +private: + void configure_fc_fc(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ActivationLayerInfo &act); + void configure_conv_fc(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ActivationLayerInfo &act); + void configure_mm(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ActivationLayerInfo &act); + + enum AuxTensorIdx + { + AsmGemmWorkspace = 0, + Pretranspose, + GemmTemp1, + GemmTemp2, + GemmTemp3, + GemmTemp4, + GemmTemp5, + GemmTemp6, + GemmTemp7, + GemmTemp8, + // Slots above (0-9) reserved for either CpuGemm or CpuGemmLowpMatrixMultiplyCore + TransposedWeights, + ConvertedWeights, + FlattenedSrc, + Count + }; + + std::unique_ptr<CpuFlatten> _flatten; + std::unique_ptr<CpuConvertFullyConnectedWeights> _convert_weights; + std::unique_ptr<kernels::CpuTransposeKernel> _transpose_weights; + std::unique_ptr<CpuGemm> _mm_gemm; + std::unique_ptr<CpuGemmLowpMatrixMultiplyCore> _mm_gemmlowp; + + TensorInfo _flattened_src; + TensorInfo _converted_weights; + TensorInfo _reshaped_weights; + TensorInfo _trans_weights; + AuxTensorIdx _trans_weights_idx; + + experimental::MemoryRequirements _aux_mem; + + bool _needs_weights_conversion; + bool _needs_weights_reshape; + bool _is_fc_after_conv; + bool _is_quantized_asymmetric; + bool _is_prepared; + bool _enable_fast_math; + bool _fixed_format; + arm_compute::WeightFormat _weight_format; + bool _dynamic_weights; + +#ifdef ARM_COMPUTE_ASSERTS_ENABLED + int _asrt_run_count{}; + int _asrt_prepare_count{}; +#endif // ARM_COMPUTE_ASSERTS_ENABLED +}; +} // namespace cpu +} // namespace arm_compute +#endif // ACL_SRC_CPU_OPERATORS_CPUFULLYCONNECTED_H diff --git 
a/src/cpu/operators/CpuGemm.cpp b/src/cpu/operators/CpuGemm.cpp new file mode 100644 index 0000000000..905e86c185 --- /dev/null +++ b/src/cpu/operators/CpuGemm.cpp @@ -0,0 +1,567 @@ +/* + * Copyright (c) 2021-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/operators/CpuGemm.h" + +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/core/CPP/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/MemoryHelpers.h" +#include "src/cpu/utils/CpuAuxTensorHandler.h" + +using namespace arm_compute::experimental; +using namespace arm_compute::misc::shape_calculator; + +namespace arm_compute +{ +namespace cpu +{ +namespace +{ +cpu::AsmGemmInfo init_assembly_metadata(const GEMMInfo &info) +{ + cpu::AsmGemmInfo asm_info; + asm_info.method = cpu::AsmConvMethod::Im2Col; + asm_info.reinterpret_input_as_3d = info.reinterpret_input_as_3d(); + asm_info.depth_output_gemm3d = info.depth_output_gemm3d(); + asm_info.activation_info = info.activation_info(); + asm_info.fast_mode = info.fast_math(); + asm_info.fixed_format = info.fixed_format(); + asm_info.weight_format = info.weight_format(); + asm_info.accumulate = info.accumulate(); + asm_info.transpose_b = + info.pretranspose_B(); // The "pretranspose_B" flag here is not the same as the pretranspose_B_array method. The flag here signals to pretranspose_B_array method if we want to perform additional transpose on B before the pretranspose_B_array method + + return asm_info; +} +} // namespace + +void CpuGemm::configure(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + ITensorInfo *d, + float alpha, + float beta, + const GEMMInfo &gemm_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d); + ARM_COMPUTE_ERROR_THROW_ON(CpuGemm::validate(a, b, c, d, alpha, beta, gemm_info)); + ARM_COMPUTE_LOG_PARAMS(a, b, c, d, alpha, beta, gemm_info); + + const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info); + const bool is_c_bias = beta == 1 && c != nullptr; + const bool run_optimised = + bool(cpu::CpuGemmAssemblyDispatch::validate(a, b, (is_c_bias) ? c : nullptr, d, asm_info)) && + (c == nullptr || beta == 0.f || beta == 1.f) && // Optimized GeMM doesn't support beta coefficient. 
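+ // Note: a non-null c with beta == 1 is treated as a bias (is_c_bias) and is added either by the assembly kernel or by CpuAdd; any other non-zero beta is handled by a separate CpuGemmMatrixAdditionKernel pass on the fallback path.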
+ !(!b->are_values_constant() && + b->tensor_shape().z() > 1); // Disable batch matmul as optimized GeMM handles batching differently. + + // Check if we need to reshape the matrix B only on the first run + _is_prepared = false; + _reshape_b_only_on_first_run = b->are_values_constant(); + _run_vector_matrix_multiplication = a->dimension(1) < 2; + _run_alpha_scale = alpha != 1.f; + _run_bias_addition = is_c_bias; + _run_addition = beta != 0 && beta != 1 && c != nullptr; + _run_activation = + gemm_info.activation_info().enabled() && + (!run_optimised || + (run_optimised && !cpu::CpuGemmAssemblyDispatch::is_activation_supported(gemm_info.activation_info()))); + + if (run_optimised) + { + _run_interleave_transpose = false; + const ITensorInfo *c_to_use = is_c_bias ? c : nullptr; + _asm_glue = std::make_unique<cpu::CpuGemmAssemblyDispatch>(); + _asm_glue->configure(a, b, c_to_use, d, asm_info); + ARM_COMPUTE_ERROR_ON(!_asm_glue->is_configured()); + + const auto asm_mem_req = _asm_glue->workspace(); + for (unsigned int slot = 0; slot < asm_mem_req.size(); ++slot) + { + _aux_mem[slot] = asm_mem_req[slot]; + } + + // Scale product by alpha + if (_run_alpha_scale) + { + _alpha_scale_func = std::make_unique<cpu::CpuActivation>(); + _alpha_scale_func->configure( + d, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, alpha, 0.f)); + } + } + else + { + _run_interleave_transpose = !_run_vector_matrix_multiplication; + // Pick output tensor in case bias addition should be performed + ITensorInfo *gemm_output_to_use = (_run_bias_addition) ? &_tmp_d : d; + // Pick b tensor in case pretranspose should be performed + const ITensorInfo *b_to_use = b; + + _mm_kernel = std::make_unique<cpu::kernels::CpuGemmMatrixMultiplyKernel>(); + + // Configure rhs pretranspose + if (gemm_info.pretranspose_B()) + { + _pretranspose_b_func = std::make_unique<CpuTranspose>(); + _pretranspose_b_func->configure(b_to_use, &_pretransposed_b); + MemoryLifetime lifetime; + if (_reshape_b_only_on_first_run) + { + if (_run_interleave_transpose) + { + // PreTransposedRHS tensor is only used in prepare(), but is then succeeded by Transposed1xWRHS + // So PreTransposedRHS can be freed inside prepare() + lifetime = MemoryLifetime::Prepare; + } + else + { + // PreTransposedRHS tensor is only used in prepare(), but is the final transformation of rhs + // So PreTransposedRHS needs to persist beyond prepare() + lifetime = MemoryLifetime::Persistent; + } + } + else + { + // PreTransposedRHS tensor is always used in run() and doesn't need to persist + lifetime = MemoryLifetime::Temporary; + } + _aux_mem[PreTransposedRHS] = + MemoryInfo(offset_int_vec(PreTransposedRHS), lifetime, _pretransposed_b.total_size()); + b_to_use = &_pretransposed_b; + } + + // Select between GEMV and GEMM + if (_run_vector_matrix_multiplication) + { + // Configure the matrix multiply kernel + _mm_kernel->configure(a, b_to_use, gemm_output_to_use, alpha, false); + } + else + { + ARM_COMPUTE_ERROR_ON(!_run_interleave_transpose); + // Configure interleave kernel + _interleave_kernel = std::make_unique<cpu::kernels::CpuGemmInterleave4x4Kernel>(); + _interleave_kernel->configure(a, &_tmp_a); + _aux_mem[InterleavedLHS] = + MemoryInfo(offset_int_vec(InterleavedLHS), MemoryLifetime::Temporary, _tmp_a.total_size()); + + // Configure rhs transpose1xw kernel + _transpose1xW_b_kernel = std::make_unique<cpu::kernels::CpuGemmTranspose1xWKernel>(); + _transpose1xW_b_kernel->configure(b_to_use, &_tmp_b); + _aux_mem[Transposed1xWRHS] = + 
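+ // The transposed1xW copy of B must survive beyond prepare() when B is reshaped only on the first run, hence the Persistent lifetime (the interleaved copy of A is rebuilt on every run and stays Temporary).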
MemoryInfo(offset_int_vec(Transposed1xWRHS), MemoryLifetime::Persistent, _tmp_b.total_size()); + + // Use a and b here instead of _tmp_a and _tmp_b because CpuGemmMatrixMultiplyKernel requires the original m,n,k in case of interleaved a and transposed1xw b + const int m = a->dimension(1); + const int n = b_to_use->dimension(0); + const int k = a->dimension(0); + + // Configure matrix multiplication kernel + _mm_kernel->configure(&_tmp_a, &_tmp_b, gemm_output_to_use, alpha, _run_interleave_transpose, + GEMMReshapeInfo(m, n, k)); + } + + if (_run_bias_addition) + { + _add_bias = std::make_unique<cpu::CpuAdd>(); + _add_bias->configure(gemm_output_to_use, c, d, ConvertPolicy::SATURATE); + _aux_mem[TempResult] = + MemoryInfo(offset_int_vec(TempResult), MemoryLifetime::Temporary, _tmp_d.total_size()); + } + } + + // Configure matrix addition kernel + if (_run_addition) + { + _ma_kernel = std::make_unique<cpu::kernels::CpuGemmMatrixAdditionKernel>(); + _ma_kernel->configure(c, d, beta); + } + + // Configure activation + if (_run_activation) + { + _activation_func = std::make_unique<cpu::CpuActivation>(); + _activation_func->configure(d, nullptr, gemm_info.activation_info()); + } +} + +Status CpuGemm::validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *d, + float alpha, + float beta, + const GEMMInfo &gemm_info) +{ + ARM_COMPUTE_UNUSED(alpha); + // When using accumulation(in place summation), for now, the only supported values for alpha and beta are 1 respectively 0. + // Do the appropriate checks before proceeding. + if (gemm_info.accumulate()) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(alpha != 1, "Accumulation is not supported when alpha is different from 1"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + (beta != 0 && c != nullptr), + "Accumulation is not supported when beta is different from 0 with a non-null bias matrix c"); + } + + const bool is_c_bias = beta == 1 && c != nullptr; + const bool run_addition = c != nullptr && beta != 0 && beta != 1; + // Check if we should use the pretransposed_b or original b + // TODO: COMPMID-6597 + // Note that this check should only apply to the non-optimized path. The reason we brought this at the beginning + // instead of only for the fallback path is because of the checks performed below, between here and the run_optimised decision + // We should simplify this by + // 1. Moving the checks between "fix-start" and "fix-end" into their corresponding ops / kernels (e.g. the weights format checks can and should be moved into CpuGemmAssemblyDispatch) + // 2. Moving this b_to_use check back into the non-optimized path + TensorInfo pretransposed_b = b->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*b)); + const ITensorInfo *b_to_use = gemm_info.pretranspose_B() ? 
&pretransposed_b : b; + // TODO: COMPMID-6597 fix-start + + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(a); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(a); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::BFLOAT16, DataType::F16, DataType::F32); + + if (is_fixed_format_fast_math(gemm_info.weight_format())) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(a, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(b_to_use, DataType::BFLOAT16); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b_to_use); + } + + const int block_by = arm_compute::block_by(gemm_info.weight_format()); + // test if im2col has changed the dimensions that are needed for padding + if (a->dimension(0) != b_to_use->dimension(1) && block_by > 1) + { + // have to verify bias + const size_t dim0_sz = a->dimension(0); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + (dim0_sz % block_by) != 0, + ("The matrix A number of columns must be a multiple of block_by=" + std::to_string(block_by)).c_str()); + // a->dimension(0) = kernel_area * input_channel + kernel_area * input_pad_right + // b_to_use->dimension(1) = kernel_area * input_channel + // a->dimension(0) = b_to_use->dimension(1) + kernel_area * input_pad_right + const size_t input_pad_right = (dim0_sz - b_to_use->dimension(1)) % block_by; + const size_t kernel_area = (dim0_sz - b_to_use->dimension(1)) / input_pad_right; + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + (dim0_sz - kernel_area * input_pad_right) != b_to_use->dimension(1), + "The product AB is defined only if A number of columns and B number of rows are related"); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + a->dimension(0) != b_to_use->dimension(1), + "The product AB is defined only if the number of columns in A is equal to the number of rows in B"); + } + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported"); + if (a->data_type() != DataType::BFLOAT16) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, d); + } + + if (run_addition) + { + ARM_COMPUTE_RETURN_ERROR_ON(gemm_info.depth_output_gemm3d() != 0); + ARM_COMPUTE_RETURN_ERROR_ON(gemm_info.reinterpret_input_as_3d()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(c, d); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != c->dimension(1), + "The C matrix must have the same number of rows as the matrix A"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(b_to_use->dimension(0) != c->dimension(0), + "The C matrix must have the same number of columns as the matrix B"); + } + + if (d->total_size() != 0) + { + // For fixed format we are expecting some kind of blocked format for B/RHS so the dimension won't necessarily match the result matrix any more. 
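+ // The checks below relate the product's M/N to d's shape: with depth_output_gemm3d != 0 the 2D GEMM result is reinterpreted as 3D, so a's rows are compared against d's height and depth separately (if the input is also reinterpreted as 3D) or against their product otherwise.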
+ ARM_COMPUTE_RETURN_ERROR_ON(!gemm_info.fixed_format() && b_to_use->dimension(0) != d->dimension(0)); + if (gemm_info.depth_output_gemm3d() != 0) + { + if (gemm_info.reinterpret_input_as_3d()) + { + ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != d->dimension(1)); + ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != d->dimension(2)); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != d->dimension(1) * d->dimension(2)); + } + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != d->dimension(1)); + } + } + // TODO: COMPMID-6597 fix-end + + // Check if we need to run the optimized assembly kernel + cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info); + + // Note we use b instead of b_to_use here because asm_info also captures the pretranspose_b() flag + // so we pass the original b to CpuGemmAssemblyDispatch + const bool run_optimised = + bool(cpu::CpuGemmAssemblyDispatch::validate(a, b, is_c_bias ? c : nullptr, d, asm_info)) && + (c == nullptr || beta == 0.f || beta == 1.f) && // Optimized GeMM doesn't support beta coefficient. + !(!b->are_values_constant() && + b->tensor_shape().z() > 1); // Disable batch matmul as optimized GeMM handles batching differently. + + if (!run_optimised) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.reinterpret_input_as_3d(), + "CpuGemm cannot reinterpret the input tensor as 3D"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.depth_output_gemm3d() != 0, + "CpuGemm cannot reinterpret the output tensor as 3D"); + + // Check if the first input tensor is a vector. + const bool run_vector_matrix_multiplication = a->dimension(1) < 2; + // Check if we need to reshape the matrix A and matrix B + const bool run_interleave_transpose = !run_vector_matrix_multiplication; + + // Arguments used by GEMMReshapeInfo + // If we pass the matrix A and matrix B reshaped to CpuGemmMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to GEMMReshapeInfo + // in order to know how the matrices have been reshaped + const int m = a->dimension(1); + const int n = b_to_use->dimension(0); + const int k = a->dimension(0); + int mult_transpose1xW_width = 1; + int mult_interleave4x4_height = 1; + + const GEMMReshapeInfo reshape_info = GEMMReshapeInfo( + m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, gemm_info.depth_output_gemm3d()); + + const ITensorInfo *matrix_a_info = a; + const ITensorInfo *matrix_b_info = b_to_use; + + TensorInfo tmp_a_info{}; + TensorInfo tmp_b_info{}; + TensorInfo tmp_output_info = *d->clone(); + + if (run_interleave_transpose) + { + matrix_a_info = &tmp_a_info; + matrix_b_info = &tmp_b_info; + + // Validate interleave kernel + auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_interleaved_shape( + *a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d()))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmInterleave4x4Kernel::validate(a, &tmp_a_info)); + + // Validate transpose kernel + auto_init_if_empty(tmp_b_info, + b_to_use->clone()->set_tensor_shape( + compute_transpose1xW_with_element_size_shape(*b_to_use, mult_transpose1xW_width))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmTranspose1xWKernel::validate(b_to_use, &tmp_b_info)); + } + + // Validate matrix multiply + auto_init_if_empty(tmp_output_info, + matrix_a_info->clone()->set_tensor_shape(compute_mm_shape( + *matrix_a_info, *matrix_b_info, run_interleave_transpose, reshape_info))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmMatrixMultiplyKernel::validate( + matrix_a_info, 
matrix_b_info, &tmp_output_info, alpha, run_interleave_transpose, reshape_info)); + + if (is_c_bias) + { + ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuAdd::validate(&tmp_output_info, c, d, ConvertPolicy::SATURATE)); + } + } + + // Validate matrix addition kernel + if (run_addition) + { + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmMatrixAdditionKernel::validate(c, d, beta)); + } + + // Validate activation + const ActivationLayerInfo &activation = gemm_info.activation_info(); + if (activation.enabled()) + { + ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuActivation::validate(d, nullptr, activation)); + } + + return Status{}; +} + +void CpuGemm::run(ITensorPack &tensors) +{ + prepare(tensors); + + auto a = tensors.get_const_tensor(ACL_SRC_0); + auto b = tensors.get_const_tensor(ACL_SRC_1); + auto c = tensors.get_const_tensor(ACL_SRC_2); + auto d = tensors.get_tensor(ACL_DST); + + if (_asm_glue && _asm_glue->is_configured()) + { + // Pass c to asm dispatch only if it's the bias tensor + ITensorPack asm_pack = tensors; + asm_pack.add_const_tensor(ACL_SRC_2, _run_bias_addition ? c : nullptr); + _asm_glue->run(asm_pack); + if (_run_alpha_scale) + { + ITensorPack pack{{ACL_SRC, d}, {ACL_DST, d}}; + _alpha_scale_func->run(pack); + } + } + else + { + CpuAuxTensorHandler interleaved_a(offset_int_vec(InterleavedLHS), _tmp_a, tensors, true); + CpuAuxTensorHandler pretransposed_b(offset_int_vec(PreTransposedRHS), _pretransposed_b, tensors); + CpuAuxTensorHandler transposed1xw_b(offset_int_vec(Transposed1xWRHS), _tmp_b, tensors, true); + CpuAuxTensorHandler temp_d(offset_int_vec(TempResult), _tmp_d, tensors, true); + + ITensorPack mm_pack{{ACL_SRC_0, a}, {ACL_SRC_1, b}, {ACL_DST, (_run_bias_addition) ? temp_d.get() : d}}; + + if (_run_interleave_transpose) + { + // Run interleave kernel + ITensorPack interleave_pack{{ACL_SRC, a}, {ACL_DST, interleaved_a.get()}}; + NEScheduler::get().schedule_op(_interleave_kernel.get(), Window::DimY, _interleave_kernel->window(), + interleave_pack); + // Use reshaped matrices + mm_pack.add_const_tensor(ACL_SRC_0, interleaved_a.get()); + } + + const ITensor *b_to_use = b; + if (_pretranspose_b_func) + { + if (!_reshape_b_only_on_first_run) + { + // Run pretranspose kernel + ITensorPack pretranspose_pack{{ACL_SRC, b_to_use}, {ACL_DST, pretransposed_b.get()}}; + _pretranspose_b_func->run(pretranspose_pack); + } + b_to_use = pretransposed_b.get(); + } + if (_run_interleave_transpose) + { + if (!_reshape_b_only_on_first_run) + { + // Run transpose1xw kernel + ITensorPack transpose_pack{{ACL_SRC, b_to_use}, {ACL_DST, transposed1xw_b.get()}}; + NEScheduler::get().schedule_op(_transpose1xW_b_kernel.get(), Window::DimY, + _transpose1xW_b_kernel->window(), transpose_pack); + } + b_to_use = transposed1xw_b.get(); + } + // Use reshaped matrices + mm_pack.add_const_tensor(ACL_SRC_1, b_to_use); + + NEScheduler::get().schedule_op(_mm_kernel.get(), + _run_vector_matrix_multiplication ? 
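+ // The multiply kernel is split along the X dimension for the vector-matrix (GEMV) case and along Y for the general matrix-matrix case.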
Window::DimX : Window::DimY, + _mm_kernel->window(), mm_pack); + + // Run bias addition kernel + if (_run_bias_addition) + { + ITensorPack pack{{ACL_SRC_0, temp_d.get()}, {ACL_SRC_1, c}, {ACL_DST, d}}; + _add_bias->run(pack); + } + } + + // Run matrix addition kernel + if (_run_addition) + { + ITensorPack c_add_pack{{ACL_SRC, c}, {ACL_DST, d}}; + NEScheduler::get().schedule_op(_ma_kernel.get(), Window::DimY, _ma_kernel->window(), c_add_pack); + } + + // Run activation function + if (_run_activation) + { + ITensorPack pack{{ACL_SRC, d}, {ACL_DST, d}}; + _activation_func->run(pack); + } +} + +void CpuGemm::prepare(ITensorPack &tensors) +{ + if (!_is_prepared) + { + if (_asm_glue && _asm_glue->is_configured()) + { + _asm_glue->prepare(tensors); + } + else if (_reshape_b_only_on_first_run) + { + const ITensor *b = tensors.get_const_tensor(ACL_SRC_1); + const ITensor *b_to_use = b; + CpuAuxTensorHandler pretransposed_b( + offset_int_vec(PreTransposedRHS), _pretransposed_b, tensors, + false /*pack_inject: no need to inject into tensors*/, + _pretranspose_b_func == + nullptr /*bypass_alloc: no need to allocate if _pretranspose_b_func is not run*/); + CpuAuxTensorHandler transposed1xw_b(offset_int_vec(Transposed1xWRHS), _tmp_b, tensors, + false /*pack_inject*/, !_run_interleave_transpose /*bypass_alloc*/); + + if (_pretranspose_b_func) + { + // Run pretranspose kernel + ITensorPack pretranspose_pack{{ACL_SRC, b_to_use}, {ACL_DST, pretransposed_b.get()}}; + _pretranspose_b_func->run(pretranspose_pack); + b_to_use = pretransposed_b.get(); + } + if (_run_interleave_transpose) + { + // Run transpose kernel + ITensorPack transpose_pack{{ACL_SRC, b_to_use}, {ACL_DST, transposed1xw_b.get()}}; + NEScheduler::get().schedule_op(_transpose1xW_b_kernel.get(), Window::DimY, + _transpose1xW_b_kernel->window(), transpose_pack); + } + } + _is_prepared = true; + } +} + +experimental::MemoryRequirements CpuGemm::workspace() const +{ + return _aux_mem; +} + +Status CpuGemm::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *d, + const GEMMInfo &gemm_info) +{ + const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info); + + return CpuGemmAssemblyDispatch::has_opt_impl(expected_weight_format, a, b, c, d, asm_info); +} + +bool CpuGemm::isVarWeightsKernel() const +{ + return _asm_glue && _asm_glue->isVarWeightsKernel(); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuGemm.h b/src/cpu/operators/CpuGemm.h new file mode 100644 index 0000000000..a05258d206 --- /dev/null +++ b/src/cpu/operators/CpuGemm.h @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2021-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_CPU_OPERATORS_CPUGEMM_H +#define ACL_SRC_CPU_OPERATORS_CPUGEMM_H + +#include "arm_compute/core/ITensorPack.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/function_info/GEMMInfo.h" + +#include "src/cpu/ICpuOperator.h" +#include "src/cpu/kernels/CpuGemmInterleave4x4Kernel.h" +#include "src/cpu/kernels/CpuGemmMatrixAdditionKernel.h" +#include "src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h" +#include "src/cpu/kernels/CpuGemmTranspose1xWKernel.h" +#include "src/cpu/operators/CpuActivation.h" +#include "src/cpu/operators/CpuAdd.h" +#include "src/cpu/operators/CpuTranspose.h" +#include "src/cpu/operators/internal/CpuGemmAssemblyDispatch.h" + +#include <memory> + +namespace arm_compute +{ +namespace cpu +{ +/** Basic function to execute GEMM. This function calls the following kernels: + * + * If optimized assembly is available: + * -# @ref cpu::CpuGemmAssemblyDispatch + * -# @ref cpu::CpuActivation (if alpha != 1.0) + * Else: + * -# @ref cpu::kernels::CpuGemmInterleave4x4Kernel (if the output tensor is a matrix) + * -# @ref cpu::kernels::CpuGemmTranspose1xWKernel (if the output tensor is a matrix) + * -# @ref cpu::kernels::CpuGemmMatrixMultiplyKernel + * In both cases: + * -# @ref cpu::kernels::CpuGemmMatrixAdditionKernel (if c != nullptr and beta != 0.0 and is not reshaped once) + * Else: + * -# @ref cpu::CpuAdd (if c != nullptr and is reshaped once and not optimized assembly in place) + * + * -# @ref cpu::CpuActivation (if activation is specified in GEMMInfo) + */ +class CpuGemm : public ICpuOperator +{ +public: + /** Default constructor */ + CpuGemm() = default; + /** Default destructor */ + ~CpuGemm() = default; + /** Configure operator for a given list of arguments + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:------------|:-----------|:---------|:--------------| + * |F32 |F32 |F32 |F32 | + * |F16 |F16 |F16 |F16 | + * |BFLOAT16 |BFLOAT16 |BFLOAT16 |FP32 | + * + * @note GEMM: General Matrix Multiply - [alpha * A * B + beta * C]. + * @note GEMM: The tensors a, b, c, d must have the same data type. You should not mix data types when calling this function. + * + * @note Batched GEMM only supports broadcasting cases where RHS rank < LHS rank but not the other way around + * + * @param[in] a First input tensor info (Matrix A or Vector A). Data type supported: BFLOAT16/F16/F32 + * @param[in] b Second input tensor info (Matrix B). Data type supported: same as @p a + * @param[in] c Third input tensor info (Matrix C). It can be a nullptr if just the multiplication between @p a and @p b is needed. Data type supported: same as @p a + * @param[out] d Output tensor info. 
Data type supported: same as @p a + * @param[in] alpha Weight of the matrix product + * @param[in] beta Weight of matrix C + * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and + * if the reshape of matrix B should happen only for the first run + */ + void configure(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + ITensorInfo *d, + float alpha, + float beta, + const GEMMInfo &gemm_info = GEMMInfo()); + /** Static function to check if given info will lead to a valid configuration of @ref CpuGemm. + * + * Similar to @ref CpuGemm::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *d, + float alpha, + float beta, + const GEMMInfo &gemm_info = GEMMInfo()); + + /** Indicates whether or not there is an optimal assembly implementation that can be used to process the given parameters. + * + * This method has the same use of @ref + * NEGEMMConvolutionLayer::has_opt_impl, with the only caveat that + * the value of arm_compute::WeightFormat need to be passed via the + * parameter gemm_info. + */ + static Status has_opt_impl(arm_compute::WeightFormat &weight_format, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *d, + const GEMMInfo &gemm_info = GEMMInfo()); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &constants) override; + experimental::MemoryRequirements workspace() const override; + + /** Indicates if the convolution executes in variable weights mode. + * + * When ACL executes convolution in variable weights mode, it does + * not perform any processing of the weights tensor. Instead, it + * utilizes the data as it is given by the user. 
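+ * (Typically this is used together with fixed-format kernels, where @ref has_opt_impl reports the arm_compute::WeightFormat in which the caller is expected to provide the weights.)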
+ */ + bool isVarWeightsKernel() const; + +private: + enum AuxTensorIdx + { + /* Slots 0 - 2 reserved for CpuGemmAssemblyDispatch */ + InterleavedLHS = 3, + PreTransposedRHS, + Transposed1xWRHS, + TempResult, + Count + }; + + std::unique_ptr<kernels::CpuGemmInterleave4x4Kernel> _interleave_kernel{nullptr}; + std::unique_ptr<CpuTranspose> _pretranspose_b_func{nullptr}; + std::unique_ptr<kernels::CpuGemmTranspose1xWKernel> _transpose1xW_b_kernel{nullptr}; + std::unique_ptr<kernels::CpuGemmMatrixMultiplyKernel> _mm_kernel{nullptr}; + std::unique_ptr<CpuGemmAssemblyDispatch> _asm_glue{nullptr}; + std::unique_ptr<kernels::CpuGemmMatrixAdditionKernel> _ma_kernel{nullptr}; + std::unique_ptr<CpuActivation> _alpha_scale_func{nullptr}; + std::unique_ptr<CpuAdd> _add_bias{nullptr}; + std::unique_ptr<CpuActivation> _activation_func{nullptr}; + + TensorInfo _tmp_a{}; + TensorInfo _pretransposed_b{}; + TensorInfo _tmp_b{}; + TensorInfo _tmp_d{}; + + bool _run_vector_matrix_multiplication{false}; + bool _run_interleave_transpose{ + true}; /**< If we run CpuGemmInterleave4x4Kernel on lhs and CpuGemmTranspose1xWKernel on rhs */ + bool _run_alpha_scale{false}; + bool _run_addition{false}; + bool _run_bias_addition{false}; + bool _run_activation{false}; + bool _reshape_b_only_on_first_run{false}; + bool _is_prepared{false}; + + experimental::MemoryRequirements _aux_mem{Count}; +}; +} // namespace cpu +} // namespace arm_compute +#endif // ACL_SRC_CPU_OPERATORS_CPUGEMM_H diff --git a/src/cpu/operators/CpuGemmConv2d.cpp b/src/cpu/operators/CpuGemmConv2d.cpp new file mode 100644 index 0000000000..55d950ff4a --- /dev/null +++ b/src/cpu/operators/CpuGemmConv2d.cpp @@ -0,0 +1,992 @@ +/* + * Copyright (c) 2021-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/cpu/operators/CpuGemmConv2d.h" + +#include "arm_compute/core/Size2D.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/MemoryHelpers.h" +#include "src/core/helpers/Utils.h" +#include "src/cpu/kernels/CpuCol2ImKernel.h" +#include "src/cpu/kernels/CpuIm2ColKernel.h" +#include "src/cpu/kernels/CpuWeightsReshapeKernel.h" +#include "src/cpu/operators/CpuGemm.h" +#include "src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h" +#include "src/cpu/operators/CpuGemmLowpOutputStage.h" +#include "src/cpu/operators/CpuReshape.h" +#include "src/cpu/utils/CpuAuxTensorHandler.h" + +#include <set> +#include <tuple> + +using namespace arm_compute::misc::shape_calculator; +using namespace arm_compute::experimental; + +namespace arm_compute +{ +namespace cpu +{ + +/** @section note_CpuGemmConv2d_weight_transformation Weight Transformations in CpuGemmConv2d + * + * A. Terminology + * Throughout CpuGemmConv2d, we use the following terms in ways that may differ from other operators / kernels: + * - "Transform" or "Reshape" of the weights: they both mean all the operations that we perform on the weight + * tensor up until they are consumed by gemm (CpuGemm or CpuGemmLowpMatrixMultiplyCore) + * Note that the specific gemm operator may perform further transformations on the weights, but the + * transformations here only mean those performed in CpuGemmConv2d + * - "Transpose" of weights: The @ref CpuTranspose operation. I.e. transpose of the weights' lowest two + * dimensions + * + * B. Gemm-based conv2d + * We want to convert the 2d convolution op (ignoring bias): + * dst = conv2d(src, weight) + * into a matrix multiplication op: + * gemm_dst = gemm(lhs, rhs) + * + * E.g.: For data layout NHWC + * 3 (hi) <----------> (lo) 0 + * src.shape = [batch, in_h , in_w, in_c] + * weight.shape = [out_c, k_h , k_w, in_c] + * dst.shape = [batch, out_h, out_w, out_c] + * + * This requires three transformations: + * * src -> lhs, transform conv input to gemm lhs; gemm_lhs is a 2d matrix where each row (or column, + * depending on the convention) is a linearized "patch" of the conv_input that corresponds to + * the receptive field of the corresponding output element. + * The convention is to use "column", but to disambiguate from the column vector of a matrix, + * in this documentation we shall use "patch". + * This transform is called im2col (for details see @ref CpuIm2ColKernel) + * * weight -> rhs, transform conv weight to gemm rhs, known as weight transform/reshape (wt) + * * gemm_dst -> dst, transform gemm output back to conv output, known as col2im (for details see + * @ref CpuCol2ImKernel) + * + * This section focuses on the weight transformation and assumes the im2col is already performed + * + * C. Weight Transformation + * After im2col, assume: lhs.shape = [num_patch, patch_size], + * where patch_size is the number of elements in a "patch": patch_size = k_h * k_w * in_c + * num_patch is the number of patches; we can ignore it here (for details see @ref CpuIm2ColKernel) + * + * After wt, rhs should have the shape: rhs = [patch_size, out_c] + * + * Therefore, the weight transformation consists of two steps: + * 1. 
Collapsing all 3 spatial dimensions: [out_c, k_h, k_w, in_c] -> [out_c, patch_size] + * 2. Transpose the collapsed shape: [out_c, patch_size] -> [patch_size, out_c] + * + * D. Implementation + * There are 4 paths for weight transformation + * + * 1. Path 1: Fixed weight format - no transformation + * The underlying gemm kernel may adopt fixed weight format (isVarWeightsKernel() == true), which requires + * that no weight transformation shall be performed + * Note that this no-transform requirement applies both to this op (CpuGemmConv2d) and the constituent ops, up + * until the fixed format kernels themselves + * + * 2. Path 2: Reinterpret then transpose later + * If the weight tensor has no "holes" (see @ref has_holes), there are two optimizations we can apply: + * - We can ignore the first step (collapsing of spatial dimensions) by simply re-interpreting the shape + * in TensorInfo + * - Instead of performing transpose here, we can pass the transpose flag to the underlying gemm. The gemm + * may then decide to fuse the transpose with any further transformations + * + * 3. Path 3: Reshape then transpose later + * If the weight tensor has holes, then we use a dedicated @ref CpuReshape, followed by transpose later + * + * 4. Path 4: Fused reshape and transpose + * This is only for quantized types for now (TODO: Remove (COMPMID-6596)). We fall back to a legacy + * non-optimized kernel @ref CpuWeightsReshapeKernel to perform a fused reshape + transpose + * + * Path 1 is the long term solution that we shall migrate to once (if) we adopt fixed weight format for all gemm + * kernels. + * In the short term, Path 2 is the favored, more performant path. + */ + +namespace +{ +/** Initialize reshaped / transformed weight info + * + * @param[in] weights Input weights + * @param[out] reshaped_weights Transformed weights + */ +void initialize_reshaped_weight_info(const ITensorInfo &weights, ITensorInfo &reshaped_weights) +{ + auto_init_if_empty(reshaped_weights, weights); + if (is_data_type_quantized(weights.data_type())) + { + // WT method: FusedReshapeAndTranspose + reshaped_weights.set_tensor_shape(compute_weights_reshaped_shape(weights, /* has_bias */ false)); + } + else + { + TensorShape collapsed_weights = weights.tensor_shape(); + collapsed_weights.collapse(3); + reshaped_weights.set_tensor_shape(collapsed_weights); + } +} +} // namespace + +CpuGemmConv2d::WeightTransformMethod CpuGemmConv2d::get_wt_method(const ITensorInfo &weights) +{ + // TODO: Extend ReinterpretThenTranspose support for quantized data types COMPMID-6596 + if (is_data_type_quantized(weights.data_type())) + { + return WeightTransformMethod::FusedReshapeAndTranspose; + } + return has_holes(weights) ? 
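+ // Weights with holes (non-contiguous strides, e.g. from padding) cannot simply be reinterpreted in place, so an explicit CpuReshape is required before the transpose (path 3 above); otherwise the cheaper reinterpret path (path 2) is taken.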
WeightTransformMethod::ReshapeThenTranspose + : WeightTransformMethod::ReinterpretThenTranspose; +} + +CpuGemmConv2d::SkipInfo CpuGemmConv2d::skip_im_col_info(const ITensorInfo *src, + const ITensorInfo *weights, + const PadStrideInfo &conv_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info) +{ + const DataLayout data_layout = src->data_layout(); + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const unsigned int kernel_width = weights->dimension(idx_width); + const unsigned int kernel_height = weights->dimension(idx_height); + unsigned int conv_w = 0; + unsigned int conv_h = 0; + std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), src->dimension(idx_height), kernel_width, + kernel_height, conv_info, dilation); + const bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && + conv_info.stride().first == 1 && conv_info.stride().second == 1); + + if (skip_im2col) + { + const bool skip_col2im = + (data_layout == DataLayout::NHWC && + (bool(CpuGemmConv2d::validate_gemm3d(src, weights, act_info, conv_h, /*skip_im2col*/ true)))); + if (skip_col2im) + { + return {true, true}; + } + } + else + { + const bool skip_col2im = + (data_layout == DataLayout::NHWC && + (bool(CpuGemmConv2d::validate_gemm3d(src, weights, act_info, conv_h, /*skip_im2col*/ false)))); + if (skip_col2im) + { + return {false, true}; + } + } + + // Default case when we cannot reinterpret the input and output as 3D. + return {false, false}; +} + +CpuGemmConv2d::CpuGemmConv2d() + : _weights_reshape(nullptr), + _weights_reshape_and_transpose_kernel(nullptr), + _im2col_kernel(), + _mm_gemm(), + _mm_gemmlowp(), + _col2im_kernel(), + _reshape(), + _im2col_output(), + _weights_reshaped(), + _gemm_output(), + _gemm_output_3d(), + _data_layout(DataLayout::NCHW), + _skip_im2col(false), + _skip_col2im(false), + _is_quantized(false), + _is_prepared(false), + _wt_method(WeightTransformMethod::ReshapeThenTranspose), + _run_wt(true), + _aux_mem(AuxTensorIdx::Count) +{ +} +CpuGemmConv2d::~CpuGemmConv2d() = default; + +void CpuGemmConv2d::configure_mm(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + int gemm_3d_depth, + bool fixed_format, + arm_compute::WeightFormat weight_format) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights); + ARM_COMPUTE_ERROR_THROW_ON(validate_mm(src, weights, biases, dst, act_info, enable_fast_math, gemm_3d_depth, + _skip_im2col, fixed_format, weight_format)); + + // Supported activations in GEMM + const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { + ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, + ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU}; + + if (_is_quantized) + { + TensorInfo tmp_src{*src}; + TensorInfo tmp_weights{*weights}; + // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() + // Extract and negate input and weights offset + const QuantizationInfo iqinfo = src->quantization_info(); + const QuantizationInfo wqinfo = weights->quantization_info(); + const QuantizationInfo oqinfo = (dst->total_size() == 0) ? 
iqinfo : dst->quantization_info(); + const UniformQuantizationInfo uiqinfo = iqinfo.uniform(); + const UniformQuantizationInfo uoqinfo = oqinfo.uniform(); + const DataType data_type = src->data_type(); + + tmp_src.set_quantization_info(QuantizationInfo(uiqinfo.scale, -uiqinfo.offset)); + if (!is_data_type_quantized_per_channel(tmp_weights.data_type())) + { + const UniformQuantizationInfo uwqinfo = wqinfo.uniform(); + tmp_weights.set_quantization_info(QuantizationInfo(uwqinfo.scale, -uwqinfo.offset)); + } + + // Merge activation with output stage + PixelValue type_min{}; + PixelValue type_max{}; + std::tie(type_min, type_max) = get_min_max(data_type); + int32_t min_activation = type_min.get<int32_t>(); + int32_t max_activation = type_max.get<int32_t>(); + + if (supported_acts.count(act_info.activation()) != 0) + { + std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act_info, data_type, uoqinfo); + } + + GEMMLowpOutputStageInfo output_info; + output_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; + output_info.gemmlowp_offset = uoqinfo.offset; + output_info.gemmlowp_min_bound = min_activation; + output_info.gemmlowp_max_bound = max_activation; + output_info.is_quantized_per_channel = (tmp_weights.data_type() == DataType::QSYMM8_PER_CHANNEL); + quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, output_info); + + _mm_gemmlowp = std::make_unique<CpuGemmLowpMatrixMultiplyCore>(); + _mm_gemmlowp->configure(&tmp_src, &tmp_weights, biases, dst, + GEMMInfo(false, false, true, gemm_3d_depth, _skip_im2col, false, output_info, false, + enable_fast_math, false, act_info, fixed_format, weight_format, + false /* pretranspose_B. TODO: COMPMID-6596 */)); + + auto mm_mem_req = _mm_gemmlowp->workspace(); + for (unsigned int cont = 0; cont < mm_mem_req.size(); ++cont) + { + _aux_mem[cont] = mm_mem_req[cont]; + } + } + else + { + // Create GEMMInfo structure + const GEMMInfo &gemm_info = + GEMMInfo(false, false, true /* Reshape weights only for the first run */, gemm_3d_depth, + _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */, false, + GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info, fixed_format, weight_format, + true /*pretranspose_B. For fp gemm (wt path 1 - 3), We always pretranspose B (for wt path 1 this + flag is ignored)*/); + // Configure matrix multiply function + _mm_gemm = std::make_unique<CpuGemm>(); + _mm_gemm->configure(src, weights, biases, dst, 1.0f, 1.0f, gemm_info); + auto mm_mem_req = _mm_gemm->workspace(); + for (unsigned int cont = 0; cont < mm_mem_req.size(); ++cont) + { + _aux_mem[cont] = mm_mem_req[cont]; + } + } +} + +Status CpuGemmConv2d::validate_mm(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + int gemm_3d_depth, + bool skip_im2col, + bool fixed_format, + arm_compute::WeightFormat weight_format) +{ + const DataType data_type = src->data_type(); + const bool is_quantized = is_data_type_quantized_asymmetric(data_type); + const bool is_activation_enabled = act_info.enabled(); + + if (is_quantized) + { + // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() + // Extract and negate input and weights offset + const QuantizationInfo &iqinfo = src->quantization_info(); + const QuantizationInfo &wqinfo = weights->quantization_info(); + const QuantizationInfo &oqinfo = (dst->total_size() == 0) ? 
iqinfo : dst->quantization_info(); + const UniformQuantizationInfo uoqinfo = oqinfo.uniform(); + + // Merge activation with output stage + PixelValue type_min{}; + PixelValue type_max{}; + std::tie(type_min, type_max) = get_min_max(data_type); + int32_t min_activation = type_min.get<int32_t>(); + int32_t max_activation = type_max.get<int32_t>(); + + const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { + ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, + ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU}; + if (is_activation_enabled && supported_acts.count(act_info.activation()) != 0) + { + std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act_info, data_type, uoqinfo); + } + + GEMMLowpOutputStageInfo output_info; + output_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; + output_info.gemmlowp_offset = uoqinfo.offset; + output_info.gemmlowp_min_bound = min_activation; + output_info.gemmlowp_max_bound = max_activation; + output_info.is_quantized_per_channel = (weights->data_type() == DataType::QSYMM8_PER_CHANNEL); + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, output_info)); + + // Perform validation step on GEMMLowp + std::unique_ptr<ITensorInfo> input_qa = src->clone(); + std::unique_ptr<ITensorInfo> weights_qa = weights->clone(); + input_qa->set_quantization_info(QuantizationInfo(iqinfo.uniform().scale, -iqinfo.uniform().offset)); + weights_qa->set_quantization_info(QuantizationInfo(wqinfo.uniform().scale, -wqinfo.uniform().offset)); + + return CpuGemmLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), biases, dst, + GEMMInfo(false, false, true, gemm_3d_depth, skip_im2col, false, + output_info, false, enable_fast_math, false, act_info, + false /* pretranspose_B. TODO: COMPMID-6596 */)); + } + else + { + // Create GEMMInfo structure + const GEMMInfo gemm_info = + GEMMInfo(false, false, true /* Reshape weights only for the first run */, gemm_3d_depth, + skip_im2col /* Reinterpret the input as 3D if im2col is skipped */, false, + GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info, fixed_format, weight_format, + true /*pretranspose_B. For fp gemm (wt path 1 - 3), We always pretranspose B (for wt path 1 this + flag is ignored)*/); + + // Perform validation step on Matrix multiply function + return CpuGemm::validate(src, weights, biases, dst, 1.0f, 1.0f, gemm_info); + } +} + +Status CpuGemmConv2d::validate_gemm3d(const ITensorInfo *input_info, + const ITensorInfo *weights_info, + const ActivationLayerInfo &act_info, + int gemm_3d_depth, + bool skip_im2col) +{ + const DataType data_type = input_info->data_type(); + const unsigned int mult_y = skip_im2col ? 1U : gemm_3d_depth; + const unsigned int mult_z = skip_im2col ? 
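+ // The dummy shapes mimic the two GEMM3D modes: when im2col is skipped the input itself is made 3D (depth = gemm_3d_depth); otherwise the input stays 2D and its rows carry the gemm_3d_depth factor so that only the output is reinterpreted as 3D.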
gemm_3d_depth : 1U; + + // Set dummy tensor shapes for the validation + const TensorInfo dummy_input_info(TensorShape(4U, 4U * mult_y, 1U * mult_z), 1, data_type, + input_info->quantization_info()); + const TensorInfo dummy_weights_info(TensorShape(4U, 4U), 1, data_type, weights_info->quantization_info()); + const TensorInfo dummy_output_info(TensorShape(4U, 4U, gemm_3d_depth), 1, data_type, + input_info->quantization_info()); + + return validate_mm(&dummy_input_info, &dummy_weights_info, nullptr, &dummy_output_info, act_info, false, + gemm_3d_depth, skip_im2col); +} + +void CpuGemmConv2d::configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); + ARM_COMPUTE_UNUSED(num_groups, weights_info); + ARM_COMPUTE_ERROR_THROW_ON(CpuGemmConv2d::validate(src, weights, biases, dst, conv_info, weights_info, dilation, + act_info, enable_fast_math, num_groups)); + ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv_info, weights_info, dilation, act_info, enable_fast_math, + num_groups); + + const DataType data_type = src->data_type(); + const DataLayout data_layout = src->data_layout(); + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + const int idx_kernels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES); + + const unsigned int kernel_width = weights->dimension(idx_width); + const unsigned int kernel_height = weights->dimension(idx_height); + + _is_prepared = weights_info.retain_internal_weights(); + _is_quantized = is_data_type_quantized_asymmetric(src->data_type()); + _data_layout = data_layout; + _skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && + conv_info.stride().first == 1 && conv_info.stride().second == 1); + + const ITensorInfo *gemm_input_to_use = src; + ITensorInfo *gemm_output_to_use = dst; + + // Get convolved dimensions + unsigned int conv_w = 0; + unsigned int conv_h = 0; + std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), src->dimension(idx_height), kernel_width, + kernel_height, conv_info, dilation); + + ARM_COMPUTE_ERROR_ON_MSG((dst->dimension(idx_width) != conv_w) || (dst->dimension(idx_height) != conv_h), + "Output shape does not match the expected one"); + + // Check if GEMM3D is supported + const CpuGemmConv2d::SkipInfo skip_info = + CpuGemmConv2d::skip_im_col_info(src, weights, conv_info, dilation, act_info); + _skip_im2col = skip_info.skip_im2col; + _skip_col2im = skip_info.skip_col2im; + + // Get parameters from conv_info + unsigned int stride_x = 0; + unsigned int stride_y = 0; + std::tie(stride_x, stride_y) = conv_info.stride(); + + // Initialize reshaped weights + initialize_reshaped_weight_info(*weights, _weights_reshaped); + + // Create tensor to store im2col reshaped inputs + if (!_skip_im2col) + { + const int block_by = arm_compute::block_by(weights_info.weight_format()); + unsigned int input_pad_right = 0; + if (block_by > 1) + { + input_pad_right = + (src->dimension(idx_channel) % block_by) == 0 ? 
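+ // Right-pad the input channel dimension up to a multiple of block_by, as required by blocked (fixed-format) weight layouts.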
0 : block_by - (src->dimension(idx_channel) % block_by); + } + // Configure + _im2col_kernel = std::make_unique<kernels::CpuIm2ColKernel>(); + _im2col_kernel->configure(src, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, false, dilation, + num_groups, input_pad_right); + + // Update GEMM input + gemm_input_to_use = &_im2col_output; + } + + const unsigned int mat_weights_cols = weights->dimension(idx_kernels); + + // Create temporary GEMM output tensor in case we cannot skip col2im + const DataType output_data_type = data_type == DataType::BFLOAT16 ? DataType::F32 : data_type; + if (!_skip_col2im) + { + TensorShape shape_gemm; + + // Calculate GEMM output shape + shape_gemm = _im2col_output.tensor_shape(); + shape_gemm.set(0, mat_weights_cols); + shape_gemm.set(1, conv_w * conv_h); + + _gemm_output = TensorInfo(shape_gemm, 1, output_data_type); + _gemm_output.set_quantization_info(dst->quantization_info()).set_data_layout(src->data_layout()); + _gemm_output_3d = TensorInfo(_gemm_output); + + // Update GEMM output + gemm_output_to_use = &_gemm_output; + } + else + { + _gemm_output_3d = TensorInfo(*dst); + _gemm_output_3d.set_data_type(output_data_type).set_data_layout(src->data_layout()).set_is_resizable(true); + _gemm_output = TensorInfo(_gemm_output_3d); + + // Update GEMM output + gemm_output_to_use = &_gemm_output_3d; + } + + // Configure GEMM + // In case we need to skip col2im, GEMM3D (gemm_3d_depth != 0) must be called in order to avoid reshaping the output matrix + const unsigned int gemm_3d_depth = _skip_col2im ? conv_h : 0; + const bool fixed_format = weights_info.weight_format() != arm_compute::WeightFormat::UNSPECIFIED; + /** @section note_CpuGemmConv2d_weight_use_in_configure Which weights tensor should we use to configure gemm + * + * A. The problem: + * In principle, we should use the weights tensor corresponding to the weights transformation path. I.e.: + * - If no weight transformation (_run_wt == false): Use original weights + * - else: Use transformed weights + * However in practice we have a dilemma: + * - We need to know _run_wt before we can configure gemm with the corresponding weights, but + * - _run_wt depends on isVarWeightsKernel(), which is only known after gemm is configured + * + * B. The decision: + * To simplify the matter, we decide to always use the transformed weights, regardless of _run_wt + * + * This decision requires the following conditions: + * 1. The underlying gemm where isVarWeightsKernel() == true, must guarantee that: + * A. Ignore the flag to transpose weights (GEMMInfo::pretranspose_B) + * B. Use weights/B tensor passed to it at prepare() or run() instead of that passed at configure() + * 2. CpuGemmConv2d where isVarWeightsKernel() == true, must guarantee that: + * A. Pass original weights instead of reshaped or reinterpreted weights + * + * C. Future actions: + * Condition 2 is a given, based on our implementation. + * If condition 1 cannot hold, we must make changes to the underlying gemm to: + * 1. Either expose isVarWeightsKernel() before gemm is configured somehow, or + * 2. 
Take in an additional "original_weights" tensor info at configure + */ + configure_mm(gemm_input_to_use, &_weights_reshaped, biases, gemm_output_to_use, act_info, enable_fast_math, + gemm_3d_depth, fixed_format, weights_info.weight_format()); + + // Can only decide isVarWeightsKernel after gemm is configured + _run_wt = !isVarWeightsKernel(); + + if (!_skip_col2im && _data_layout == DataLayout::NCHW) + { + // Configure col2im + _col2im_kernel = std::make_unique<kernels::CpuCol2ImKernel>(); + _col2im_kernel->configure(gemm_output_to_use, dst, Size2D(conv_w, conv_h)); + } + else + { + // Configure reshape layer + _reshape = std::make_unique<CpuReshape>(); + _reshape->configure(gemm_output_to_use, dst); + } + + // Check lifetime + _aux_mem[Im2ColOutput] = + MemoryInfo(offset_int_vec(Im2ColOutput), MemoryLifetime::Temporary, _im2col_output.total_size()); + // Add WeightsReshaped memory requirement to workspace + // Note that in case of WeightTransformMethod::ReinterpretThenTranspose, we do not need to allocate this memory + // However since we cannot determine weight transformation method until prepare (see prepare()), we will have to + // settle with allocating more + if (_run_wt) + { + // Check if GEMM transforms weights + // If weight is further transformed by underlying gemm after ReshapeThenTranspose then we can free + // WeightsReshaped in prepare + // Otherwise WeightsReshaped is the final transformation of weights and needs to persist + bool gemm_trans_wei = _aux_mem[GemmAsmPretransposedRHS].size > 0; + gemm_trans_wei = _mm_gemm != nullptr ? _aux_mem[GemmTransposed1xWRHS].size > 0 : gemm_trans_wei; + gemm_trans_wei = _mm_gemmlowp != nullptr ? _aux_mem[GemmLowpTransposed1xWRHS].size > 0 : gemm_trans_wei; + + _aux_mem[WeightsReshaped] = MemoryInfo(offset_int_vec(WeightsReshaped), + gemm_trans_wei ? MemoryLifetime::Prepare : MemoryLifetime::Persistent, + _weights_reshaped.total_size()); + } + _aux_mem[GemmOutput] = MemoryInfo(offset_int_vec(GemmOutput), MemoryLifetime::Temporary, _gemm_output.total_size()); +} + +Status CpuGemmConv2d::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, + const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + const bool enable_fast_math) +{ + const DataLayout data_layout = src->data_layout(); + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const unsigned int kernel_width = weights->dimension(idx_width); + const unsigned int kernel_height = weights->dimension(idx_height); + unsigned int conv_w = 0; + unsigned int conv_h = 0; + std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), src->dimension(idx_height), kernel_width, + kernel_height, conv_info, dilation); + + const CpuGemmConv2d::SkipInfo skip_info = + CpuGemmConv2d::skip_im_col_info(src, weights, conv_info, dilation, act_info); + + const bool skip_im2col = skip_info.skip_im2col; + const bool skip_col2im = skip_info.skip_col2im; + const unsigned int gemm_3d_depth = skip_col2im ? 
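+ // Skipping col2im means the GEMM writes a 3D output directly, with depth equal to the convolved height; otherwise a 2D GEMM output is produced and col2im restores the spatial layout.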
conv_h : 0; + const bool fixed_format = weights_info.weight_format() != arm_compute::WeightFormat::UNSPECIFIED; + + /** @section note_CpuGemmConv2d_weight_use_in_has_opt_impl Which weights tensor should we use for has_opt_impl + * + * For the pretranspose_B flag, this shares a similar problem and thus the same decision as that of + * @ref note_CpuGemmConv2d_weight_use_in_configure + * + * But for the weights, we shall always use the original instead of reshaped weights here + */ + const GEMMInfo gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */, gemm_3d_depth, + skip_im2col /* Reinterpret the input as 3D if im2col is skipped */, false, + GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info, + fixed_format, weights_info.weight_format(), true /* pretranspose_B */); + + return CpuGemm::has_opt_impl(expected_weight_format, src, weights, biases, dst, gemm_info); +} + +Status CpuGemmConv2d::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!"); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::BFLOAT16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::QSYMM8_PER_CHANNEL, DataType::BFLOAT16, + DataType::F16, DataType::F32); + + if (!is_fixed_format(weights_info.weight_format())) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, weights); + } + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups > 1, "Grouping (num_groups != 1) is not supported"); + + const DataLayout data_layout = src->data_layout(); + const DataType data_type = src->data_type(); + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + const int idx_kernels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES); + + const unsigned int kernel_width = weights->dimension(idx_width); + const unsigned int kernel_height = weights->dimension(idx_height); + + TensorInfo im2col_reshaped_info{}; + TensorInfo info_gemm{}; + TensorInfo tmp_info{}; + TensorInfo weights_reshaped_info{}; + const ITensorInfo *gemm_input_to_use = src; + const ITensorInfo *gemm_output_to_use = dst; + const ITensorInfo *weights_to_use = weights; + + const bool append_bias = false; + const bool is_quantized = is_data_type_quantized_asymmetric(data_type); + const bool is_bf16 = data_type == DataType::BFLOAT16; + + // Get convolved dimensions + unsigned int conv_w = 0; + unsigned int conv_h = 0; + + std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), src->dimension(idx_height), kernel_width, + kernel_height, conv_info, dilation); + + // Check if GEMM3D is supported + const CpuGemmConv2d::SkipInfo skip_info = + CpuGemmConv2d::skip_im_col_info(src, weights, conv_info, dilation, act_info); + const bool skip_im2col = skip_info.skip_im2col, 
skip_col2im = skip_info.skip_col2im; + + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_channel) != src->dimension(idx_channel)); + ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4); + + // Validate biases + if (biases != nullptr) + { + if (is_quantized) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); + } + else if (is_bf16) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases); + } + ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != dst->dimension(idx_channel)); + ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); + } + + unsigned int mat_weights_cols = weights->dimension(idx_kernels); + unsigned int mat_weights_rows = + weights->dimension(idx_width) * weights->dimension(idx_height) * weights->dimension(idx_channel); + + // Initialize reshaped weights + initialize_reshaped_weight_info(*weights, weights_reshaped_info); + // No need to call CpuReshape::validate() or CpuTranspose::validate() as the dst info is auto-configured from the + // src + weights_to_use = &weights_reshaped_info; + + if (!skip_im2col) + { + const int block_by = arm_compute::block_by(weights_info.weight_format()); + int input_pad_right = 0; + if (block_by > 1) + { + input_pad_right = + (src->dimension(idx_channel) % block_by) == 0 ? 0 : block_by - (src->dimension(idx_channel) % block_by); + mat_weights_rows = weights->dimension(idx_width) * weights->dimension(idx_height) * + (weights->dimension(idx_channel) + input_pad_right); + } + + // Create tensor info for im2col reshaped inputs + // For CPU, the batch size is on the fourth dimension + TensorShape shape_im2col = src->tensor_shape(); + shape_im2col.set(0, mat_weights_rows); + shape_im2col.set(1, conv_w * conv_h); + shape_im2col.set(2, 1); + + im2col_reshaped_info = TensorInfo(shape_im2col, 1, data_type); + im2col_reshaped_info.set_quantization_info(src->quantization_info()); + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::CpuIm2ColKernel::validate(src, &im2col_reshaped_info, Size2D(kernel_width, kernel_height), + conv_info, append_bias, dilation, num_groups, input_pad_right)); + gemm_input_to_use = &im2col_reshaped_info; + } + + // Create temporary GEMM output tensor in case we cannot skip col2im + const DataType output_data_type = data_type == DataType::BFLOAT16 ? DataType::F32 : data_type; + if (!skip_col2im) + { + TensorShape shape_gemm = gemm_input_to_use->tensor_shape(); + shape_gemm.set(0, mat_weights_cols); + shape_gemm.set(1, conv_w * conv_h); + info_gemm = TensorInfo(shape_gemm, 1, output_data_type); + } + else + { + info_gemm = TensorInfo(dst->tensor_shape(), 1, output_data_type); + } + info_gemm.set_quantization_info(dst->quantization_info()).set_data_layout(src->data_layout()); + gemm_output_to_use = &info_gemm; + const bool fixed_format = weights_info.weight_format() != arm_compute::WeightFormat::UNSPECIFIED; + + // See note_CpuGemmConv2d_weight_use_in_configure regarding the choice of the weights + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, act_info, + enable_fast_math, skip_col2im ? 
conv_h : 0, skip_im2col, fixed_format, + weights_info.weight_format())); + + // Validate Col2Im/ReshapeLayer + if (!skip_col2im && (data_layout == DataLayout::NCHW)) + { + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::CpuCol2ImKernel::validate(gemm_output_to_use, dst, Size2D(conv_w, conv_h))); + } + + return Status{}; +} + +void CpuGemmConv2d::run(ITensorPack &tensors) +{ + prepare(tensors); + + auto src = tensors.get_const_tensor(ACL_SRC_0); + auto dst = tensors.get_tensor(ACL_DST); + auto gemm_input_to_use = src; + + CpuAuxTensorHandler im2col_output(offset_int_vec(Im2ColOutput), _im2col_output, tensors, false); + CpuAuxTensorHandler gemm_output(offset_int_vec(GemmOutput), _gemm_output, tensors, false); + + bool out_has_padding = _skip_col2im && (dst->info()->padding().bottom != 0 || dst->info()->padding().top != 0); + if (!_skip_im2col) + { + // Run input reshaping + unsigned int hint_dim = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); + unsigned int x_dim = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); + unsigned int hint_dim_iterations = _im2col_kernel->window().num_iterations(hint_dim); + unsigned int x_dim_iterations = _im2col_kernel->window().num_iterations(x_dim); + if (hint_dim_iterations < NEScheduler::get().num_threads() && x_dim_iterations > hint_dim_iterations) + { + hint_dim = x_dim; + } + ITensorPack pack = {{TensorType::ACL_SRC, src}, {TensorType::ACL_DST, im2col_output.get()}}; + NEScheduler::get().schedule_op(_im2col_kernel.get(), hint_dim, _im2col_kernel->window(), pack); + gemm_input_to_use = im2col_output.get(); + } + + // Handle the case where output has top/bottom padding + const ITensor *out_to_use = out_has_padding ? gemm_output.get() : dst; + Tensor gemm3d; + _gemm_output_3d.extend_padding(out_to_use->info()->padding()); + gemm3d.allocator()->soft_init(_gemm_output_3d); + gemm3d.allocator()->import_memory(out_to_use->buffer()); + auto gemm_output_to_use = gemm_output.get(); + + if (_skip_im2col) + { + gemm_output_to_use = &gemm3d; + } + if (_skip_col2im && !out_has_padding) + { + gemm_output_to_use = dst; + } + + ITensorPack gemm_pack = tensors; + gemm_pack.add_const_tensor(TensorType::ACL_SRC_0, gemm_input_to_use); + gemm_pack.add_tensor(TensorType::ACL_DST, gemm_output_to_use); + // Allocate reshaped weights if required + auto weights = gemm_pack.get_const_tensor(TensorType::ACL_SRC_1); + ARM_COMPUTE_ERROR_ON_NULLPTR(weights); + // Re-interpreted weights. Only tensor shape is changed. 
Only memory import, no allocation + const bool use_reinterpreted_wei = (_run_wt && _wt_method == WeightTransformMethod::ReinterpretThenTranspose); + CpuAuxTensorHandler reinterpreted_wei( + _weights_reshaped, *weights, + /* import only if we chose the ReinterpretThenTranspose path, because otherwise the weight may have been freed */ + !use_reinterpreted_wei); + + const bool use_reshaped_wei = (_run_wt && (_wt_method == WeightTransformMethod::ReshapeThenTranspose || + _wt_method == WeightTransformMethod::FusedReshapeAndTranspose)); + CpuAuxTensorHandler reshaped_wei(offset_int_vec(WeightsReshaped), _weights_reshaped, tensors, + false /* pack_inject */, !use_reshaped_wei /* bypass_alloc */, + !use_reshaped_wei /* bypass_import */ + ); + // Update the weights to use if it has been reshaped + if (use_reinterpreted_wei) + { + gemm_pack.add_const_tensor(TensorType::ACL_SRC_1, reinterpreted_wei.get()); + } + else if (use_reshaped_wei) + { + gemm_pack.add_const_tensor(TensorType::ACL_SRC_1, reshaped_wei.get()); + } + + // Runs CpuGemm or CpuGemmLowpMatrixMultiplyCore functions + _is_quantized ? _mm_gemmlowp->run(gemm_pack) : _mm_gemm->run(gemm_pack); + + // Reshape output matrix + if (!_skip_col2im) + { + if (_data_layout == DataLayout::NCHW) + { + ITensorPack pack = {{TensorType::ACL_SRC, gemm_output.get()}, {TensorType::ACL_DST, dst}}; + NEScheduler::get().schedule_op(_col2im_kernel.get(), Window::DimY, _col2im_kernel->window(), pack); + } + else + { + ITensorPack pack = {{TensorType::ACL_SRC, gemm_output_to_use}, {TensorType::ACL_DST, dst}}; + _reshape->run(pack); + } + } + else if (out_has_padding) + { + ITensorPack pack = {{TensorType::ACL_SRC, gemm_output_to_use}, {TensorType::ACL_DST, dst}}; + _reshape->run(pack); + } +} + +void CpuGemmConv2d::prepare(ITensorPack &tensors) +{ + if (!_is_prepared) + { + auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); + // Determine which weights reshape path to take + // Note that this decision can only occur at prepare instead of configure because it relies on the presence of + // any holes in the weight tensor, which may change after configure (e.g. from extending padding) + if (_run_wt) + { + _wt_method = get_wt_method(*(weights->info())); + switch (_wt_method) + { + case (WeightTransformMethod::FusedReshapeAndTranspose): + { + ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Perform weight transformation: FusedReshapeAndTranspose"); + _weights_reshape_and_transpose_kernel = std::make_unique<kernels::CpuWeightsReshapeKernel>(); + _weights_reshape_and_transpose_kernel->configure(weights->info(), nullptr, &_weights_reshaped); + break; + } + case (WeightTransformMethod::ReshapeThenTranspose): + { + ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Perform weight transformation: ReshapeThenTranspose"); + _weights_reshape = std::make_unique<CpuReshape>(); + _weights_reshape->configure(weights->info(), &_weights_reshaped); + break; + } + case (WeightTransformMethod::ReinterpretThenTranspose): + { + ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Perform weight transformation: ReinterpretThenTranspose"); + // Nothing to configure + break; + } + default: + { + ARM_COMPUTE_ERROR("Unsupported weight transform method"); + } + } + } + else + { + ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("No weight transformation is performed"); + } + ITensorPack gemm_pack = tensors; + // Allocate reshaped weights if required + CpuAuxTensorHandler reinterpreted_wei( + _weights_reshaped, + *weights); // Re-interpreted weights. Only tensor shape is changed. 
No allocation + CpuAuxTensorHandler reshaped_wei(offset_int_vec(WeightsReshaped), _weights_reshaped, tensors); + // Run weights reshape if required + if (_run_wt) + { + switch (_wt_method) + { + case (WeightTransformMethod::FusedReshapeAndTranspose): + { + ITensorPack pack = {{TensorType::ACL_SRC, weights}, {TensorType::ACL_DST, reshaped_wei.get()}}; + NEScheduler::get().schedule_op(_weights_reshape_and_transpose_kernel.get(), Window::DimW, + _weights_reshape_and_transpose_kernel->window(), pack); + weights->mark_as_unused(); + gemm_pack.add_const_tensor(TensorType::ACL_SRC_1, reshaped_wei.get()); + break; + } + case (WeightTransformMethod::ReshapeThenTranspose): + { + ITensorPack pack = {{TensorType::ACL_SRC, weights}, {TensorType::ACL_DST, reshaped_wei.get()}}; + _weights_reshape->run(pack); + weights->mark_as_unused(); + gemm_pack.add_const_tensor(TensorType::ACL_SRC_1, reshaped_wei.get()); + break; + } + case (WeightTransformMethod::ReinterpretThenTranspose): + { + gemm_pack.add_const_tensor(TensorType::ACL_SRC_1, reinterpreted_wei.get()); + // Nothing to run + break; + } + default: + { + ARM_COMPUTE_ERROR("Unsupported weight transform method"); + } + } + } + _is_quantized ? _mm_gemmlowp->prepare(gemm_pack) : _mm_gemm->prepare(gemm_pack); + + _is_prepared = true; + } +} +experimental::MemoryRequirements CpuGemmConv2d::workspace() const +{ + return _aux_mem; +} +bool CpuGemmConv2d::isVarWeightsKernel() const +{ + return _mm_gemm && _mm_gemm->isVarWeightsKernel(); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuGemmConv2d.h b/src/cpu/operators/CpuGemmConv2d.h new file mode 100644 index 0000000000..48a0d11107 --- /dev/null +++ b/src/cpu/operators/CpuGemmConv2d.h @@ -0,0 +1,300 @@ +/* + * Copyright (c) 2021-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_CPU_OPERATORS_CPUGEMMCONV2D_H +#define ACL_SRC_CPU_OPERATORS_CPUGEMMCONV2D_H + +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/function_info/ActivationLayerInfo.h" + +#include "src/cpu/ICpuOperator.h" + +#include <memory> + +namespace arm_compute +{ +namespace cpu +{ +class CpuGemm; +class CpuGemmLowpMatrixMultiplyCore; +class CpuGemmLowpOutputStage; +class CpuReshape; +namespace kernels +{ +class CpuIm2ColKernel; +class CpuCol2ImKernel; +class CpuWeightsReshapeKernel; +} // namespace kernels + +/** Basic function to compute the convolution layer. 
@ref note_CpuGemmConv2d_weight_transformation */ +class CpuGemmConv2d : public ICpuOperator +{ +public: + /** Constructor */ + CpuGemmConv2d(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CpuGemmConv2d(const CpuGemmConv2d &) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + CpuGemmConv2d(CpuGemmConv2d &&) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CpuGemmConv2d &operator=(const CpuGemmConv2d &) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + CpuGemmConv2d &operator=(CpuGemmConv2d &&) = delete; + /** Destructor */ + ~CpuGemmConv2d(); + /** Set the input and output tensors. + * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:--------------|:------------------|:--------|:--------------| + * |F16 |F16 |F16 |F16 | + * |F32 |F32 |F32 |F32 | + * |BFLOAT16 |BFLOAT16 |BFLOAT16 |BFLOAT16 | + * |QASYMM8 |QASYMM8 |S32 |QASYMM8 | + * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED | + * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED | + * + * @param[in] src Source tensor info. 3 lower dimensions represent a single input [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of inputs. + * Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32. + * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. + * Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32. + * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. + * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type. + * @param[out] dst Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. + * Data types supported: Same as @p input. + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. + * @param[in] weights_info Specifies if the weights tensor has been reshaped with CpuWeightsReshapeKernel. If this is not part of the fully connected layer the weights + * tensor has also been transposed with cpu::kernels::CpuGemmTranspose1xWKernel. Data type supported: Same as @p input. + * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported. + * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation + * available which may introduce a drop of accuracy as well. Default is false + * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. 
num_groups != 1 is not supported + */ + void configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info = WeightsInfo(), + const Size2D &dilation = Size2D(1U, 1U), + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false, + unsigned int num_groups = 1); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuGemmConvolution::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info = WeightsInfo(), + const Size2D &dilation = Size2D(1U, 1U), + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false, + unsigned int num_groups = 1); + + /** Indicates whether or not there is an optimal assembly implementation that can be used to process the given parameters. + * + * The parameter list is the same as @ref NEGEMMConvolutionLayer::has_opt_impl + * + * @return a status. + */ + static Status has_opt_impl(arm_compute::WeightFormat &expected_weight_format, + const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info = WeightsInfo(), + const Size2D &dilation = Size2D(1U, 1U), + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + const bool enable_fast_math = false); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &tensors) override; + experimental::MemoryRequirements workspace() const override; + +private: + /** Configures the appropriate matrix multiply routine + * + * @param[in] src Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32. + * @param[in] weights Weights tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32. + * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. + * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type. + * @param[out] dst Output tensor info. Data types supported: Same as @p input, + * except for input of QASYMM8/QASYMM8_SIGNED type where output should be of S32 type. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported. + * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation + * available which may introduce a drop of accuracy as well. Default is false + * @param[in] gemm_3d_depth (Optional) Depth of GEMM 3D (Defaults to 1) + * @param[in] fixed_format (Optional) Select GEMM execution with variable weights. + * @param[in] weight_format (Optional) The layout to be used for the weights tensor when running GEMM with variable weights. 
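 * @note (Editorial illustration, not part of the original patch.) Assuming a fixed-format kernel was
 *       reported by has_opt_impl() with weight format expected_wf (a placeholder name), the call made
 *       from configure() above boils down to:
 *         configure_mm(gemm_input_to_use, &_weights_reshaped, biases, gemm_output_to_use,
 *                      act_info, enable_fast_math, _skip_col2im ? conv_h : 0, true, expected_wf);
 *       gemm_3d_depth is non-zero only when col2im is skipped, so the GEMM writes its output directly
 *       in 3D and no reshape of the output matrix is needed.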
+ */ + void configure_mm(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *output, + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false, + int gemm_3d_depth = 1, + bool fixed_format = false, + arm_compute::WeightFormat weight_format = arm_compute::WeightFormat::UNSPECIFIED); + /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMConvolutionLayer matrix multiply routines + * + * @param[in] src Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32. + * @param[in] weights Weights tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32. + * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. + * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type. + * @param[in] dst Output tensor info. Data types supported: Same as @p input, + * except for input of QASYMM8/QASYMM8_SIGNED type where output should be of S32 type. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported. + * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation + * available which may introduce a drop of accuracy as well. Default is false + * @param[in] gemm_3d_depth (Optional) Depth of GEMM 3D (Defaults to 1) + * @param[in] skip_im2col (Optional) Flag which specifies if im2col has to be skipped. i.e. 1x1 convolution with NHWC data layout. (Default to false) + * @param[in] fixed_format (Optional) Select GEMM execution with variable weights. + * @param[in] weight_format (Optional) The layout to be used for the weights tensor when running GEMM with variable weights. + * + * @return a status + */ + static Status validate_mm(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false, + int gemm_3d_depth = 1, + bool skip_im2col = false, + bool fixed_format = false, + arm_compute::WeightFormat weight_format = arm_compute::WeightFormat::UNSPECIFIED); + /** Static function to check if GEMM3D is supported in @ref NEGEMM or in @ref CpuGemmMLowpMatrixMultiplyCore + * + * @param[in] src Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32. + * @param[in] weights Weights tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32. + * @param[in] act_info Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported. + * @param[in] gemm_3d_depth Depth of GEMM 3D + * @param[in] skip_im2col Flag which specifies if im2col has to be skipped. i.e. 1x1 convolution with NHWC data layout + * + * @return a status + */ + static Status validate_gemm3d(const ITensorInfo *src, + const ITensorInfo *weights, + const ActivationLayerInfo &act_info, + int gemm_3d_depth, + bool skip_im2col); + + struct SkipInfo + { + bool skip_im2col; + bool skip_col2im; + }; + + /** Static function to provide skip_im2col and skip_col2im information. + * + * @param[in] src Input tensor info. + * @param[in] weights Weights tensor info. + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. 
+ * @param[in] dilation Dilation, in elements, across x and y. + * @param[in] act_info Activation layer information in case of a fused activation. + * + * @return a SkipInfo instance. + */ + static SkipInfo skip_im_col_info(const ITensorInfo *src, + const ITensorInfo *weights, + const PadStrideInfo &conv_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info); + + /** Indicates if the convolution executes in variable weights mode. + * + * Similar to @ref CpuGemm::isVarWeightsKernel + */ + bool isVarWeightsKernel() const; + enum AuxTensorIdx + { + GemmAsmPretransposedRHS = 2, // CpuGemmAssemblyDispatch::Pretranspose + GemmTransposed1xWRHS = 5, // CpuGemm::Transposed1xWRHS + GemmLowpTransposed1xWRHS = 6, // CpuGemmLowpMatrixMultiplyCore::TmpB + /* Slots 0 - 9 reserved and shared by CpuGemmLowpMatrixMultiplyCore and CpuGemm */ + Im2ColOutput = 10, + WeightsReshaped, + GemmOutput, + Count + }; + + /** Weight transformation method. See @ref note_CpuGemmConv2d_weight_transformation */ + enum class WeightTransformMethod + { + ReinterpretThenTranspose, + ReshapeThenTranspose, + FusedReshapeAndTranspose, + }; + + /** Select weight transformation method + * + * @param[in] weights Input weights + * + * @return WeightTransformMethod + */ + static WeightTransformMethod get_wt_method(const ITensorInfo &weights); + + std::unique_ptr<CpuReshape> _weights_reshape; + std::unique_ptr<kernels::CpuWeightsReshapeKernel> _weights_reshape_and_transpose_kernel; + std::unique_ptr<kernels::CpuIm2ColKernel> _im2col_kernel; + std::unique_ptr<CpuGemm> _mm_gemm; + std::unique_ptr<CpuGemmLowpMatrixMultiplyCore> _mm_gemmlowp; + std::unique_ptr<kernels::CpuCol2ImKernel> _col2im_kernel; + std::unique_ptr<CpuReshape> _reshape; + + TensorInfo _im2col_output; + TensorInfo _weights_reshaped; + TensorInfo _gemm_output; + TensorInfo _gemm_output_3d; + + DataLayout _data_layout; + + bool _skip_im2col; + bool _skip_col2im; + bool _is_quantized; + bool _is_prepared; + WeightTransformMethod _wt_method; + bool _run_wt; + + experimental::MemoryRequirements _aux_mem{Count}; +}; +} // namespace cpu +} // namespace arm_compute +#endif // ACL_SRC_CPU_OPERATORS_CPUGEMMCONV2D_H diff --git a/src/cpu/operators/CpuGemmDirectConv2d.cpp b/src/cpu/operators/CpuGemmDirectConv2d.cpp new file mode 100644 index 0000000000..9187927541 --- /dev/null +++ b/src/cpu/operators/CpuGemmDirectConv2d.cpp @@ -0,0 +1,256 @@ +/* + * Copyright (c) 2021-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/operators/CpuGemmDirectConv2d.h" + +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/runtime/FunctionDescriptors.h" + +#include "src/common/utils/Log.h" +#include "src/core/helpers/MemoryHelpers.h" +#include "src/cpu/utils/CpuAuxTensorHandler.h" +#include "support/Cast.h" + +#include <set> + +namespace arm_compute +{ +namespace cpu +{ +using namespace arm_compute::experimental; +using namespace arm_compute::utils::cast; + +namespace +{ +GEMMLowpOutputStageInfo calculate_output_stage_metadata(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const ActivationLayerInfo &act) +{ + // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() + // Extract and negate input and weights offset + const QuantizationInfo iqinfo = src->quantization_info(); + const QuantizationInfo wqinfo = weights->quantization_info(); + const QuantizationInfo oqinfo = (dst->total_size() == 0) ? iqinfo : dst->quantization_info(); + const UniformQuantizationInfo uoqinfo = oqinfo.uniform(); + const DataType data_type = src->data_type(); + // Merge activation with output stage + const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { + ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, + ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU}; + PixelValue type_min{}; + PixelValue type_max{}; + std::tie(type_min, type_max) = get_min_max(data_type); + int32_t min_activation = type_min.get<int32_t>(); + int32_t max_activation = type_max.get<int32_t>(); + if (supported_acts.count(act.activation()) != 0) + { + std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act, data_type, uoqinfo); + } + GEMMLowpOutputStageInfo os_info; + os_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; + os_info.gemmlowp_offset = uoqinfo.offset; + os_info.gemmlowp_min_bound = min_activation; + os_info.gemmlowp_max_bound = max_activation; + os_info.is_quantized_per_channel = (weights->data_type() == DataType::QSYMM8_PER_CHANNEL); + quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, os_info); + return os_info; +} +cpu::AsmGemmInfo init_assembly_metadata(const Conv2dInfo &info, bool is_indirect) +{ + cpu::AsmGemmInfo asm_info; + asm_info.method = is_indirect ? 
cpu::AsmConvMethod::Indirect : cpu::AsmConvMethod::Conv; + asm_info.ps_info = info.conv_info; + asm_info.activation_info = info.act_info; + asm_info.depth_output_gemm3d = true; + asm_info.reinterpret_input_as_3d = true; + asm_info.padding_top = info.conv_info.pad_top(); + asm_info.padding_left = info.conv_info.pad_left(); + asm_info.padding_value = 0.f; + asm_info.negated_offsets = false; + asm_info.fast_mode = info.enable_fast_math; + asm_info.fixed_format = info.weights_info.weight_format() != WeightFormat::UNSPECIFIED; + asm_info.weight_format = info.weights_info.weight_format(); + return asm_info; +} +} // namespace + +CpuGemmDirectConv2d::CpuGemmDirectConv2d() + : _gemm_asm_func(std::make_unique<CpuGemmAssemblyDispatch>()), + _activation_func(std::make_unique<CpuActivation>()), + _weights_permute_func(std::make_unique<CpuPermute>()), + _aux_mem(AuxTensorIdx::Count), + _perm_weights(), + _run_activation(false), + _is_prepared(false) +{ +} + +CpuGemmDirectConv2d::~CpuGemmDirectConv2d() = default; + +void CpuGemmDirectConv2d::configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const Conv2dInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); + ARM_COMPUTE_ERROR_THROW_ON( + CpuGemmDirectConv2d::validate(src, weights, biases != nullptr ? biases : nullptr, dst, info)); + ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, info); + + _run_activation = info.act_info.enabled() && !_gemm_asm_func->is_activation_supported(info.act_info); + _is_prepared = false; + + _weights_permute_func->configure(weights, &_perm_weights, PermutationVector{3, 0, 1, 2}); + + // Configure assembly dispatch + cpu::AsmGemmInfo asm_info = init_assembly_metadata(info, false); + if (is_data_type_quantized(src->data_type())) + { + asm_info.output_stage = calculate_output_stage_metadata(src, weights, dst, info.act_info); + } + _gemm_asm_func->configure(src, &_perm_weights, biases, dst, asm_info); + + // Configure activation + if (_run_activation) + { + _activation_func->configure(dst, nullptr, info.act_info); + } + + // Add auxiliary memory requirements of the assembly dispatch + const auto asm_mem_req = _gemm_asm_func->workspace(); + for (unsigned int slot = 0; slot < asm_mem_req.size(); ++slot) + { + _aux_mem[slot] = asm_mem_req[slot]; + } + + if (_aux_mem[Pretranspose].size > 0) + { + // Release permuted weights at the of prepare as they are further transposed by the assembly dispatch + _aux_mem[PermutedWeights] = + MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Prepare, weights->total_size()); + } + else + { + // We must permute weights if they are WeightFormat::UNSPECIFIED + if (info.weights_info.weight_format() == WeightFormat::UNSPECIFIED) + _aux_mem[PermutedWeights] = + MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Persistent, weights->total_size()); + } +} +Status CpuGemmDirectConv2d::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const Conv2dInfo &info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::BFLOAT16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::QSYMM8_PER_CHANNEL, DataType::BFLOAT16, + DataType::F16, DataType::F32); + if (!is_fixed_format(info.weights_info.weight_format())) + { + 
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, weights); + } + ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.num_groups > 1, "Grouping (num_groups != 1) is not supported on Neon"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_layout() != DataLayout::NHWC, "Data layout supported is NHWC"); + const DataType data_type = src->data_type(); + const TensorShape i_shape = src->tensor_shape(); + const TensorShape w_shape = weights->tensor_shape(); + ARM_COMPUTE_RETURN_ERROR_ON(w_shape[0] != i_shape[0]); + ARM_COMPUTE_RETURN_ERROR_ON(info.dilation != Size2D(1U, 1U)); + ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4); + // Validate biases + if (biases != nullptr) + { + if (is_data_type_quantized_asymmetric(data_type)) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); + } + else if (data_type == DataType::BFLOAT16) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases); + } + ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(3)); + ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); + } + + cpu::AsmGemmInfo asm_info = init_assembly_metadata(info, false); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuGemmAssemblyDispatch::validate(src, weights, biases, dst, asm_info)); + return Status{}; +} +void CpuGemmDirectConv2d::run(ITensorPack &tensors) +{ + prepare(tensors); + + _gemm_asm_func->run(tensors); + if (_run_activation) + { + ITensor *io = tensors.get_tensor(ACL_DST); + ITensorPack pack{{ACL_SRC, io}, {ACL_DST, io}}; + _activation_func->run(pack); + } +} + +void CpuGemmDirectConv2d::prepare(ITensorPack &tensors) +{ + if (!_is_prepared) + { + // If we are using fixed-format kernel the weights are already reshaped + if (_gemm_asm_func && _gemm_asm_func->isVarWeightsKernel()) + { + _gemm_asm_func->prepare(tensors); + _is_prepared = true; + return; + } + const ITensor *weights = tensors.get_const_tensor(ACL_SRC_1); + ITensor *weights_aux = + utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(PermutedWeights))); + ARM_COMPUTE_ERROR_ON_NULLPTR(weights, weights_aux); + + CpuAuxTensorHandler permuted_weights(_perm_weights, *weights_aux); + ITensorPack permute_tensors{{ACL_SRC, weights}, {ACL_DST, permuted_weights.get()}}; + _weights_permute_func->run(permute_tensors); + + tensors.add_const_tensor(ACL_SRC_1, permuted_weights.get()); + // Call prepare of assembly dispatch + _gemm_asm_func->prepare(tensors); + + _is_prepared = true; + } +} + +experimental::MemoryRequirements CpuGemmDirectConv2d::workspace() const +{ + return _aux_mem; +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuGemmDirectConv2d.h b/src/cpu/operators/CpuGemmDirectConv2d.h new file mode 100644 index 0000000000..a7365615b9 --- /dev/null +++ b/src/cpu/operators/CpuGemmDirectConv2d.h @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2021, 2023 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_CPU_OPERATORS_CPUGEMMDIRECTCONV2D_H +#define ACL_SRC_CPU_OPERATORS_CPUGEMMDIRECTCONV2D_H + +#include "arm_compute/core/TensorInfo.h" + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuOperator.h" +#include "src/cpu/operators/CpuActivation.h" +#include "src/cpu/operators/CpuPermute.h" +#include "src/cpu/operators/internal/CpuGemmAssemblyDispatch.h" + +namespace arm_compute +{ +// Forward declarations +class ITensor; +struct Conv2dInfo; +namespace cpu +{ +class CpuGemmDirectConv2d : public ICpuOperator +{ +public: + CpuGemmDirectConv2d(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmDirectConv2d); + ~CpuGemmDirectConv2d(); + /** Set the input and output tensors. + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:--------------|:--------------|:--------------|:--------------| + * |QASYMM8 |QASYMM8 |S32 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED | + * |F16 |F16 |F16 |F16 | + * |F32 |F32 |F32 |F32 | + * |BFLOAT16 |BFLOAT16 |BFLOAT16 |BFLOAT16 | + * + * @param[in] src Source tensor info. 3 lower dimensions represent a single input [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of inputs. + * Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32. + * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. + * Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32. + * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. + * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type. + * @param[in] dst Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. + * Data types supported: Same as @p input. + * @param[in] info Contains padding and stride information described in @ref PadStrideInfo. 
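 * @note (Editorial note, not part of the original patch.) As shown in CpuGemmDirectConv2d.cpp above,
 *       configure() permutes the weights with PermutationVector{3, 0, 1, 2} into the layout expected by
 *       the assembly GEMM. When the assembly dispatch exposes its own Pretranspose workspace, the permuted
 *       weights are only kept until prepare() completes (MemoryLifetime::Prepare); otherwise, for
 *       WeightFormat::UNSPECIFIED weights, they stay allocated for the lifetime of the operator
 *       (MemoryLifetime::Persistent).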
+ */ + void configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const Conv2dInfo &info); + /** Static function to check if given info will lead to a valid configuration of @ref CpuGemmDirectConv2d + * + * Similar to CpuGemmDirectConv2d::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const Conv2dInfo &info); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &constants) override; + experimental::MemoryRequirements workspace() const override; + +private: + enum AuxTensorIdx + { + GemmTemp0 = 0, + GemmTemp1, + Pretranspose, + /* Slots above (0-2) are reserved for CpuGemmAssemblyDispatch */ + PermutedWeights, + Count + }; + + std::unique_ptr<CpuGemmAssemblyDispatch> _gemm_asm_func; + std::unique_ptr<CpuActivation> _activation_func; + std::unique_ptr<CpuPermute> _weights_permute_func; + experimental::MemoryRequirements _aux_mem; + TensorInfo _perm_weights; + bool _run_activation; + bool _is_prepared; +}; +} // namespace cpu +} // namespace arm_compute + +#endif // ACL_SRC_CPU_OPERATORS_CPUGEMMDIRECTCONV2D_H diff --git a/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp b/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp new file mode 100644 index 0000000000..f3396fbb5c --- /dev/null +++ b/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp @@ -0,0 +1,779 @@ +/* + * Copyright (c) 2021-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/KernelDescriptors.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" +#include "arm_compute/runtime/TensorAllocator.h" + +#include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/MemoryHelpers.h" +#include "src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h" +#include "src/cpu/kernels/CpuGemmInterleave4x4Kernel.h" +#include "src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h" +#include "src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h" +#include "src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h" +#include "src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h" +#include "src/cpu/kernels/CpuGemmTranspose1xWKernel.h" +#include "src/cpu/operators/CpuActivation.h" +#include "src/cpu/operators/internal/CpuGemmAssemblyDispatch.h" +#include "src/cpu/utils/CpuAuxTensorHandler.h" + +using namespace arm_compute::misc::shape_calculator; +using namespace arm_compute::experimental; + +namespace arm_compute +{ +namespace cpu +{ +namespace +{ +cpu::AsmGemmInfo init_assembly_metadata(const GEMMInfo &info) +{ + cpu::AsmGemmInfo asm_info; + asm_info.method = cpu::AsmConvMethod::Im2Col; + asm_info.reinterpret_input_as_3d = info.reinterpret_input_as_3d(); + asm_info.depth_output_gemm3d = info.depth_output_gemm3d(); + asm_info.activation_info = info.activation_info(); + asm_info.output_stage = info.gemmlowp_output_stage(); + asm_info.fast_mode = info.fast_math(); + asm_info.accumulate = info.accumulate(); + + return asm_info; +} +} // namespace + +CpuGemmLowpMatrixMultiplyCore::CpuGemmLowpMatrixMultiplyCore() + : _asm_glue(std::make_unique<CpuGemmAssemblyDispatch>()), + _mm_kernel(), + _mtx_a_reshape_kernel(), + _mtx_b_reshape_kernel(), + _mtx_a_reduction_kernel(), + _mtx_b_reduction_kernel(), + _offset_contribution_kernel(), + _offset_contribution_output_stage_kernel(), + _activation_func(), + _convert_to_signed_asymm(), + _convert_from_signed_asymm(), + _vector_sum_col(), + _vector_sum_row(), + _tmp_a(), + _tmp_b(), + _mm_result_s32(), + _signed_a(), + _signed_output(), + _a_offset(0), + _b_offset(0), + _run_vector_matrix_multiplication(false), + _assembly_path(false), + _fused_assembly_path(false), + _reshape_b_only_on_first_run(false), + _is_prepared(false), + _fuse_output_stage(false), + _run_activation(false), + _flip_signedness(false), + _gemm_info(), + _aux_mem(Count) +{ +} +CpuGemmLowpMatrixMultiplyCore::~CpuGemmLowpMatrixMultiplyCore() = default; + +void CpuGemmLowpMatrixMultiplyCore::configure( + const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *dst, const GEMMInfo &gemm_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, dst); + ARM_COMPUTE_ERROR_THROW_ON(CpuGemmLowpMatrixMultiplyCore::validate(a, b, c, dst, gemm_info)); + ARM_COMPUTE_LOG_PARAMS(a, b, c, dst, gemm_info); + + const ITensorInfo *matrix_a = a; + const ITensorInfo *matrix_b = b; + GEMMInfo info = gemm_info; + + // Set internal variables + _a_offset = a->quantization_info().uniform().offset; + _b_offset = b->quantization_info().uniform().offset; + _run_vector_matrix_multiplication = a->dimension(1) < 2; + _reshape_b_only_on_first_run = b->are_values_constant(); + _is_prepared = false; + 
_fused_assembly_path = false; + _flip_signedness = is_data_type_quantized_per_channel(b->data_type()) && (a->data_type() == DataType::QASYMM8) && + _reshape_b_only_on_first_run; + _gemm_info = gemm_info; + + // Offset kernel is need if offset is non-zero or it may change (i.e. dynamic). + // It is not needed if the datatype is symmetric, because there is no offset + bool a_offset_kernel_needed = _a_offset != 0 || a->quantization_info().is_dynamic(); + bool b_offset_kernel_needed = _b_offset != 0 || b->quantization_info().is_dynamic(); + + _asm_glue = std::make_unique<cpu::CpuGemmAssemblyDispatch>(); + + const ITensorInfo *a_to_use = a; + + // Convert to QASYMM8 -> QASYMM8_SIGNED and back + if (_flip_signedness) + { + const int32_t offset_correction = 128; + const DataType dt = DataType::QASYMM8_SIGNED; + const UniformQuantizationInfo iqinfo = a_to_use->quantization_info().uniform(); + + _signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info( + QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction)); + _convert_to_signed_asymm = std::make_unique<kernels::CpuConvertQuantizedSignednessKernel>(); + _convert_to_signed_asymm->configure(a_to_use, &_signed_a); + a_to_use = &_signed_a; + _a_offset = _signed_a.quantization_info().uniform().offset; + + const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform(); + _signed_output = dst->clone()->set_data_type(dt).set_quantization_info( + QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction)); + + // Output stage correction + GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage(); + output_stage_corr.gemmlowp_offset = _signed_output.quantization_info().uniform().offset; + output_stage_corr.gemmlowp_min_bound -= offset_correction; + output_stage_corr.gemmlowp_max_bound -= offset_correction; + info.set_gemmlowp_output_stage(output_stage_corr); + + // Update matrix a + matrix_a = &_signed_a; + } + + // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage + if (info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE) + { + _fuse_output_stage = true; + _mm_result_s32 = TensorInfo(dst->tensor_shape(), 1, DataType::S32); + } + + // Initialize assembly kernel meta-data + const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info); +#ifdef __aarch64__ + if (!(!b->are_values_constant() && + b->tensor_shape().z() > 1)) // Disable batch matmul as optimized GeMM handles batching differently. + { + switch (a->data_type()) + { + case DataType::QASYMM8: + case DataType::QASYMM8_SIGNED: + case DataType::U8: + case DataType::S8: + { + if (is_data_type_quantized_asymmetric(a_to_use->data_type()) && + info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + { + auto c_info_to_use = c == nullptr ? nullptr : c; + _asm_glue->configure(a_to_use, b, c_info_to_use, dst, asm_info); + _fused_assembly_path = _asm_glue->is_configured(); + } + else + { + auto output_to_use = (_fuse_output_stage ? 
&_mm_result_s32 : dst); + _asm_glue->configure(a_to_use, b, nullptr, output_to_use, asm_info); + } + _assembly_path = _asm_glue->is_configured(); + break; + } + default: + { + ARM_COMPUTE_ERROR("Datatype not supported"); + break; + } + } + } +#endif /* __aarch64__ */ + if (!(_assembly_path || _run_vector_matrix_multiplication)) + { + matrix_a = &_tmp_a; + matrix_b = &_tmp_b; + + // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ] + _tmp_a = + TensorInfo(compute_interleaved_shape(*a_to_use), 1, a_to_use->data_type(), a_to_use->quantization_info()); + // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ] + _tmp_b = TensorInfo(compute_transpose1xW_shape(*b), 1, b->data_type(), b->quantization_info()); + + // Configure interleave kernel + _mtx_a_reshape_kernel = std::make_unique<kernels::CpuGemmInterleave4x4Kernel>(); + _mtx_a_reshape_kernel->configure(a_to_use, &_tmp_a); + + // Configure transpose kernel + _mtx_b_reshape_kernel = std::make_unique<kernels::CpuGemmTranspose1xWKernel>(); + _mtx_b_reshape_kernel->configure(b, &_tmp_b); + } + + if (!_fused_assembly_path) + { + // Build reduction info + const GEMMLowpReductionKernelInfo reduction_info(a_to_use->dimension(0), false, 0, false); + + if (a_offset_kernel_needed) + { + _vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32); + + // Configure Matrix B reduction kernel + _mtx_b_reduction_kernel = std::make_unique<kernels::CpuGemmLowpMatrixBReductionKernel>(); + _mtx_b_reduction_kernel->configure(b, &_vector_sum_col, reduction_info); + } + + if (b_offset_kernel_needed) + { + _vector_sum_row = TensorInfo(compute_reductionB_shape(*a_to_use), 1, DataType::S32); + + // Configure matrix A reduction kernel + _mtx_a_reduction_kernel = std::make_unique<kernels::CpuGemmLowpMatrixAReductionKernel>(); + _mtx_a_reduction_kernel->configure(a_to_use, &_vector_sum_row, reduction_info); + } + + if (_fuse_output_stage) + { + // Configure matrix multiply kernel + if (!_assembly_path) + { + _mm_kernel = std::make_unique<kernels::CpuGemmLowpMatrixMultiplyKernel>(); + _mm_kernel->configure(matrix_a, matrix_b, &_mm_result_s32); + } + + _offset_contribution_output_stage_kernel = + std::make_unique<kernels::CpuGemmLowpOffsetContributionOutputStageKernel>(); + _offset_contribution_output_stage_kernel->configure( + &_mm_result_s32, a_offset_kernel_needed ? &_vector_sum_col : nullptr, + b_offset_kernel_needed ? &_vector_sum_row : nullptr, c, _flip_signedness ? &_signed_output : dst, + a->dimension(0), _a_offset, _b_offset, info.gemmlowp_output_stage()); + + if (_flip_signedness) + { + _convert_from_signed_asymm = std::make_unique<kernels::CpuConvertQuantizedSignednessKernel>(); + _convert_from_signed_asymm->configure(&_signed_output, dst); + } + } + else + { + // This scale is needed for the s8_f32 kernel where the multiplication output is dequantized to F32. + const float dequantize_scale = + (dst->data_type() == DataType::F32) + ? a->quantization_info().uniform().scale * b->quantization_info().uniform().scale + : 1.0f; + // Configure matrix multiply kernel + if (!_assembly_path) + { + _mm_kernel = std::make_unique<kernels::CpuGemmLowpMatrixMultiplyKernel>(); + _mm_kernel->configure(matrix_a, matrix_b, dst); + } + // Configure offset contribution kernel + _offset_contribution_kernel = std::make_unique<kernels::CpuGemmLowpOffsetContributionKernel>(); + _offset_contribution_kernel->configure(dst, a_offset_kernel_needed ? 
&_vector_sum_col : nullptr, + b_offset_kernel_needed ? &_vector_sum_row : nullptr, + a_to_use->dimension(0), _a_offset, _b_offset, dequantize_scale); + } + } + // Configure activation + const ActivationLayerInfo &activation = gemm_info.activation_info(); + _run_activation = + activation.enabled() && (!_assembly_path || !cpu::CpuGemmAssemblyDispatch::is_activation_supported(activation)); + if (_run_activation) + { + _activation_func = std::make_unique<CpuActivation>(); + _activation_func->configure(dst, nullptr, activation); + } + + if (_assembly_path) + { + const auto asm_mem_req = _asm_glue->workspace(); + for (unsigned int slot = 0; slot < asm_mem_req.size(); ++slot) + { + _aux_mem[slot] = asm_mem_req[slot]; + } + } + + // Request memory for LHS and RHS reshape matrix + _aux_mem[VectorSumCol] = MemoryInfo(offset_int_vec(VectorSumCol), + !_fused_assembly_path && a_offset_kernel_needed && _reshape_b_only_on_first_run + ? MemoryLifetime::Persistent + : MemoryLifetime::Temporary, + _vector_sum_col.total_size()); + _aux_mem[VectorSumRow] = + MemoryInfo(offset_int_vec(VectorSumRow), MemoryLifetime::Temporary, _vector_sum_row.total_size()); + _aux_mem[TmpA] = MemoryInfo(offset_int_vec(TmpA), MemoryLifetime::Temporary, _tmp_a.total_size()); + _aux_mem[TmpB] = MemoryInfo(offset_int_vec(TmpB), + _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, + _tmp_b.total_size()); + _aux_mem[MMResultS32] = + MemoryInfo(offset_int_vec(MMResultS32), MemoryLifetime::Temporary, _mm_result_s32.total_size()); + _aux_mem[SignedA] = MemoryInfo(offset_int_vec(SignedA), MemoryLifetime::Temporary, _signed_a.total_size()); + _aux_mem[SignedOutput] = + MemoryInfo(offset_int_vec(SignedOutput), MemoryLifetime::Temporary, _signed_output.total_size()); +} + +Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + const GEMMInfo &gemm_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::QASYMM8, + DataType::QASYMM8_SIGNED, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(c != nullptr && output->data_type() != DataType::F32 && + gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE, + "Bias addition not supported in NEGEMMLowpMatrixMultiplyCore for output S32"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + (a)->dimension(0) != (b)->dimension(1), + "The product AB is defined only if the number of columns in A is equal to the number of rows in B"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported"); + + // When using accumulation(in place summation), for now, the only supported DataType for output is S32. 
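(Editorial note, not part of the patch: "accumulation" means the destination keeps its previous S32 contents and the new product is added onto them, which is why the check below insists on GEMMLowpOutputStageType::NONE, i.e. a raw S32 output. A minimal standalone sketch of that semantic in plain C++, independent of the ACL API:)

    #include <cstdint>
    #include <vector>

    // dst (M x N, int32) is updated in place: dst += a (M x K, int8) * b (K x N, int8), all row-major.
    // Requantizing dst to 8 bit between calls would destroy the running S32 sum.
    void gemmlowp_accumulate(const std::vector<int8_t> &a,
                             const std::vector<int8_t> &b,
                             std::vector<int32_t>      &dst,
                             int M, int N, int K)
    {
        for (int m = 0; m < M; ++m)
        {
            for (int n = 0; n < N; ++n)
            {
                int32_t acc = dst[m * N + n]; // previous S32 result
                for (int k = 0; k < K; ++k)
                {
                    acc += static_cast<int32_t>(a[m * K + k]) * static_cast<int32_t>(b[k * N + n]);
                }
                dst[m * N + n] = acc;
            }
        }
    }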
+ if (gemm_info.accumulate()) + { +#ifdef __arm__ + ARM_COMPUTE_RETURN_ERROR_MSG("Accumulation is not supported for armv7"); +#endif /* __arm__ */ + ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE, + "Accumulation is not supported for output QASYMM8/QASYMM8_SIGNED"); + } + + GEMMInfo info = gemm_info; + const ITensorInfo *matrix_a_info = a; + const ITensorInfo *matrix_b_info = b; + + const ITensorInfo *a_to_use = a; + + TensorInfo tmp_a_info{}; + TensorInfo tmp_b_info{}; + TensorInfo mm_result_s32_info{}; + + int32_t a_offset = a->quantization_info().uniform().offset; + int32_t b_offset = b->quantization_info().uniform().offset; + + // Offset kernel is need if offset is non-zero or it may change (i.e. dynamic). + bool a_offset_kernel_needed = a_offset != 0 || a->quantization_info().is_dynamic(); + bool b_offset_kernel_needed = b_offset != 0 || b->quantization_info().is_dynamic(); + + bool fuse_output_stage = info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE; + if (fuse_output_stage) + { + auto_init_if_empty(mm_result_s32_info, + a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32)); + } + + // Convert QASYMM8->QASYMM8_SIGNED + TensorInfo signed_a{}; + TensorInfo signed_output{}; + bool flip_signedness = is_data_type_quantized_per_channel(b->data_type()) && + (a->data_type() == DataType::QASYMM8) && info.reshape_b_only_on_first_run(); + if (flip_signedness) + { + const int32_t offset_correction = 128; + const DataType dt = DataType::QASYMM8_SIGNED; + const UniformQuantizationInfo iqinfo = a_to_use->quantization_info().uniform(); + + signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info( + QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction)); + ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuConvertQuantizedSignednessKernel::validate(a_to_use, &signed_a)); + a_to_use = &signed_a; + a_offset = signed_a.quantization_info().uniform().offset; + + const UniformQuantizationInfo oqinfo = output->quantization_info().uniform(); + signed_output = output->clone()->set_data_type(dt).set_quantization_info( + QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction)); + + // Output stage correction + GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage(); + output_stage_corr.gemmlowp_offset = signed_output.quantization_info().uniform().offset; + output_stage_corr.gemmlowp_min_bound -= offset_correction; + output_stage_corr.gemmlowp_max_bound -= offset_correction; + info.set_gemmlowp_output_stage(output_stage_corr); + + // Update matrix a + matrix_a_info = &signed_a; + } + + // Initialize assembly kernel meta-data + const AsmGemmInfo asm_info = init_assembly_metadata(info); + + // Check if we need to run the optimized assembly kernel + bool run_optimised = false; + bool run_optimised_requantized = false; + + if (!(!b->are_values_constant() && + b->tensor_shape().z() > 1)) // Disable batch matmul as optimized GeMM handles batching differently. + { + if (is_data_type_quantized_asymmetric(a_to_use->data_type()) && + info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + { + run_optimised = bool(CpuGemmAssemblyDispatch::validate(a_to_use, b, c, output, asm_info)); + run_optimised_requantized = run_optimised; + } + else + { + run_optimised = bool(CpuGemmAssemblyDispatch::validate( + a_to_use, b, nullptr, fuse_output_stage ? 
&mm_result_s32_info : output, asm_info)); + } + } + + if (run_optimised) + { + ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0)); + if (info.depth_output_gemm3d() != 0) + { + if (info.reinterpret_input_as_3d()) + { + ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1)); + ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2)); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1) * output->dimension(2)); + } + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1)); + } + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), + "NEGEMM cannot reinterpret the input tensor as 3D"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, + "NEGEMM cannot reinterpret the output tensor as 3D"); + + const bool run_vector_matrix_multiplication = a->dimension(1) < 2; + if (!run_vector_matrix_multiplication) + { + matrix_a_info = &tmp_a_info; + matrix_b_info = &tmp_b_info; + + // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ] + TensorShape shape_tmp_a = a->tensor_shape(); + shape_tmp_a.set(0, a->dimension(0) * 4); + shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f)); + + // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ] + TensorShape shape_tmp_b = b->tensor_shape(); + shape_tmp_b.set(0, b->dimension(1) * 16); + shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f)); + + // Validate interleave kernel + auto_init_if_empty(tmp_a_info, a_to_use->clone()->set_tensor_shape(shape_tmp_a)); + auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(shape_tmp_b)); + + ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmInterleave4x4Kernel::validate(a_to_use, &tmp_a_info)); + ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmTranspose1xWKernel::validate(b, &tmp_b_info)); + } + } + + if (!run_optimised_requantized) + { + TensorInfo info_vector_sum_col{}; + TensorInfo info_vector_sum_row{}; + + const GEMMLowpReductionKernelInfo reduction_info(a_to_use->dimension(0), false, 0, false); + + // Validate matrix B reduction kernel only if _a_offset is not equal to 0 + if (a_offset_kernel_needed) + { + info_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32); + + // Configure Matrix B reduction kernel + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::CpuGemmLowpMatrixBReductionKernel::validate(b, &info_vector_sum_col, reduction_info)); + } + + // Validate Matrix A reduction kernel only if _b_offset is not equal to 0 + if (b_offset_kernel_needed) + { + info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32); + + // Configure matrix A reduction kernel + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::CpuGemmLowpMatrixAReductionKernel::validate(a_to_use, &info_vector_sum_row, reduction_info)); + } + + if (fuse_output_stage) + { + if (!run_optimised) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + info.reinterpret_input_as_3d(), + "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + info.depth_output_gemm3d() != 0, + "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D"); + + ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixMultiplyKernel::validate( + matrix_a_info, matrix_b_info, &mm_result_s32_info)); + } + + // Validate offset contribution kernel + ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpOffsetContributionOutputStageKernel::validate( + &mm_result_s32_info, 
a_offset_kernel_needed ? &info_vector_sum_col : nullptr, + b_offset_kernel_needed ? &info_vector_sum_row : nullptr, c, flip_signedness ? &signed_output : output, + a_offset, b_offset, info.gemmlowp_output_stage())); + } + else + { + if (!run_optimised) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + info.reinterpret_input_as_3d(), + "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + info.depth_output_gemm3d() != 0, + "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D"); + + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::CpuGemmLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output)); + } + // Validate offset contribution kernel + ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpOffsetContributionKernel::validate( + output, a_offset_kernel_needed ? &info_vector_sum_col : nullptr, + b_offset_kernel_needed ? &info_vector_sum_row : nullptr, a_offset, b_offset)); + } + } + + // Validate activation + const ActivationLayerInfo &activation = gemm_info.activation_info(); + if (activation.enabled()) + { + ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(output, nullptr, activation)); + } + + return Status{}; +} + +void CpuGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors) +{ + prepare(tensors); + + auto a = tensors.get_const_tensor(TensorType::ACL_SRC_0); + auto b = tensors.get_const_tensor(TensorType::ACL_SRC_1); + auto c = tensors.get_const_tensor(TensorType::ACL_SRC_2); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + auto a_to_use = a; + auto matrix_a = a; + auto matrix_b = b; + + CpuAuxTensorHandler vector_sum_col(offset_int_vec(VectorSumCol), _vector_sum_col, tensors, false); + CpuAuxTensorHandler vector_sum_row(offset_int_vec(VectorSumRow), _vector_sum_row, tensors, false); + CpuAuxTensorHandler tmp_a(offset_int_vec(TmpA), _tmp_a, tensors, false); + CpuAuxTensorHandler tmp_b(offset_int_vec(TmpB), _tmp_b, tensors, true); + CpuAuxTensorHandler mm_result_s32(offset_int_vec(MMResultS32), _mm_result_s32, tensors, false); + CpuAuxTensorHandler signed_a(offset_int_vec(SignedA), _signed_a, tensors, false); + CpuAuxTensorHandler signed_output(offset_int_vec(SignedOutput), _signed_output, tensors, false); + + const QuantizationInfo a_qinfo = a->info()->quantization_info(); + const QuantizationInfo b_qinfo = b->info()->quantization_info(); + + if (a_qinfo.is_dynamic()) + _a_offset = a_qinfo.uniform().offset; + if (b_qinfo.is_dynamic()) + _b_offset = b_qinfo.uniform().offset; + + // Convert QASYMM8->QASYMM8_SIGNED + if (_flip_signedness) + { + ITensorPack pack = {{TensorType::ACL_SRC, a}, {TensorType::ACL_DST, signed_a.get()}}; + NEScheduler::get().schedule_op(_convert_to_signed_asymm.get(), Window::DimY, _convert_to_signed_asymm->window(), + pack); + a_to_use = signed_a.get(); + matrix_a = signed_a.get(); + } + + // Run GEMM + if (_asm_glue->is_configured()) + { + ITensorPack asm_glue_tensors = tensors; + auto output_to_use = (_fuse_output_stage ? 
mm_result_s32.get() : dst); + if (is_data_type_quantized_asymmetric(a_to_use->info()->data_type()) && + _gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + { + asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_0, a_to_use); + asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_1, b); + asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_2, c); + asm_glue_tensors.add_tensor(TensorType::ACL_DST, dst); + } + else + { + asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_0, a_to_use); + asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_1, b); + asm_glue_tensors.add_tensor(TensorType::ACL_DST, output_to_use); + } + _asm_glue->run(asm_glue_tensors); + } + else + { + if (!_run_vector_matrix_multiplication) + { + matrix_a = tmp_a.get(); + matrix_b = tmp_b.get(); + // Run interleave kernel + ITensorPack pack_a = {{TensorType::ACL_SRC, a_to_use}, {TensorType::ACL_DST, tmp_a.get()}}; + NEScheduler::get().schedule_op(_mtx_a_reshape_kernel.get(), Window::DimY, _mtx_a_reshape_kernel->window(), + pack_a); + + if (!_reshape_b_only_on_first_run) + { + ITensorPack pack_b = {{TensorType::ACL_SRC, b}, {TensorType::ACL_DST, tmp_b.get()}}; + // Run transpose kernel + NEScheduler::get().schedule_op(_mtx_b_reshape_kernel.get(), Window::DimY, + _mtx_b_reshape_kernel->window(), pack_b); + } + } + ITensorPack pack_mm = {{TensorType::ACL_SRC_0, matrix_a}, {TensorType::ACL_SRC_1, matrix_b}}; + if (_fuse_output_stage) + { + pack_mm.add_tensor(TensorType::ACL_DST, mm_result_s32.get()); + } + else + { + pack_mm.add_tensor(TensorType::ACL_DST, dst); + } + NEScheduler::get().schedule_op(_mm_kernel.get(), Window::DimY, _mm_kernel->window(), pack_mm); + } + + if (!_fused_assembly_path) + { + // Run matrix A reduction kernel only if _b_offset is not equal to 0 + if (_b_offset != 0) + { + ITensorPack pack = {{TensorType::ACL_SRC, a_to_use}, {TensorType::ACL_DST, vector_sum_row.get()}}; + NEScheduler::get().schedule_op(_mtx_a_reduction_kernel.get(), Window::DimX, + _mtx_a_reduction_kernel->window(), pack); + } + + // Run matrix B reduction kernel only if _a_offset is not equal to 0 + if (_a_offset != 0 && !_reshape_b_only_on_first_run) + { + ITensorPack pack = {{TensorType::ACL_SRC, b}, {TensorType::ACL_DST, vector_sum_col.get()}}; + NEScheduler::get().schedule_op(_mtx_b_reduction_kernel.get(), Window::DimX, + _mtx_b_reduction_kernel->window(), pack); + } + + if (_fuse_output_stage) + { + if (a_qinfo.is_dynamic()) + _offset_contribution_output_stage_kernel->set_a_offset(_a_offset); + if (b_qinfo.is_dynamic()) + _offset_contribution_output_stage_kernel->set_b_offset(_b_offset); + + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC_0, mm_result_s32.get()); + pack.add_tensor(TensorType::ACL_SRC_1, _a_offset == 0 ? nullptr : vector_sum_col.get()); + pack.add_tensor(TensorType::ACL_SRC_2, _b_offset == 0 ? nullptr : vector_sum_row.get()); + pack.add_tensor(TensorType::ACL_SRC_3, c); + pack.add_tensor(TensorType::ACL_DST, _flip_signedness ? 
signed_output.get() : dst); + + // Run offset contribution kernel + NEScheduler::get().schedule_op(_offset_contribution_output_stage_kernel.get(), Window::DimY, + _offset_contribution_output_stage_kernel->window(), pack); + } + else + { + if (a_qinfo.is_dynamic()) + _offset_contribution_kernel->set_a_offset(_a_offset); + if (b_qinfo.is_dynamic()) + _offset_contribution_kernel->set_b_offset(_b_offset); + if (a_qinfo.is_dynamic() || b_qinfo.is_dynamic()) + { + const float dequantize_scale = a_qinfo.uniform().scale * b_qinfo.uniform().scale; + _offset_contribution_kernel->set_scale(dequantize_scale); + } + + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC_0, _a_offset == 0 ? nullptr : vector_sum_col.get()); + pack.add_tensor(TensorType::ACL_SRC_1, _b_offset == 0 ? nullptr : vector_sum_row.get()); + pack.add_tensor(TensorType::ACL_DST, dst); + + // Run offset contribution kernel + NEScheduler::get().schedule_op(_offset_contribution_kernel.get(), Window::DimY, + _offset_contribution_kernel->window(), pack); + } + } + + // Convert QASYMM8_SIGNED->QASYMM8 + if (!_fused_assembly_path && _fuse_output_stage && _flip_signedness) + { + ITensorPack pack = {{TensorType::ACL_SRC, signed_output.get()}, {TensorType::ACL_DST, dst}}; + NEScheduler::get().schedule_op(_convert_from_signed_asymm.get(), Window::DimY, + _convert_from_signed_asymm->window(), pack); + } + + // Run fused activation unless already run in the fused assembly + if (_run_activation) + { + ITensorPack pack = {{TensorType::ACL_SRC, dst}, {TensorType::ACL_DST, dst}}; + _activation_func->run(pack); + } +} + +void CpuGemmLowpMatrixMultiplyCore::prepare(ITensorPack &tensors) +{ + if (!_is_prepared) + { + auto original_b = tensors.get_const_tensor(TensorType::ACL_SRC_1); + // Run assembly reshape + if (_asm_glue->is_configured()) + { + _asm_glue->prepare(tensors); + } + // Run non-assembly reshape + else if (_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication && !_asm_glue->is_configured()) + { + // Run reshape kernel and mark original weights tensor as unused + ITensor *tmp_b_p = utils::cast::polymorphic_downcast<ITensor *>(tensors.get_tensor(offset_int_vec(TmpB))); + CpuAuxTensorHandler tmp_b(_tmp_b, *tmp_b_p); + ITensorPack pack = {{TensorType::ACL_SRC, original_b}, {TensorType::ACL_DST, tmp_b.get()}}; + NEScheduler::get().schedule_op(_mtx_b_reshape_kernel.get(), Window::DimY, _mtx_b_reshape_kernel->window(), + pack); + } + + // Run matrix B reduction kernel only if _a_offset is not equal to 0 + if (!_fused_assembly_path && _a_offset != 0 && _reshape_b_only_on_first_run) + { + ITensor *vector_sum_col_p = + utils::cast::polymorphic_downcast<ITensor *>(tensors.get_tensor(offset_int_vec(VectorSumCol))); + CpuAuxTensorHandler vector_sum_col(_vector_sum_col, *vector_sum_col_p); + ITensorPack pack = {{TensorType::ACL_SRC, original_b}, {TensorType::ACL_DST, vector_sum_col.get()}}; + NEScheduler::get().schedule_op(_mtx_b_reduction_kernel.get(), Window::DimX, + _mtx_b_reduction_kernel->window(), pack); + } + _is_prepared = true; + } +} +experimental::MemoryRequirements CpuGemmLowpMatrixMultiplyCore::workspace() const +{ + return _aux_mem; +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h b/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h new file mode 100644 index 0000000000..38121c9bb4 --- /dev/null +++ b/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h @@ -0,0 +1,184 @@ +/* + * Copyright (c) 2021, 2023-2024 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_CPU_OPERATORS_CPUGEMMLOWPMATRIXMULTIPLYCORE_H +#define ACL_SRC_CPU_OPERATORS_CPUGEMMLOWPMATRIXMULTIPLYCORE_H + +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/function_info/GEMMInfo.h" + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuOperator.h" + +#include <memory> + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +class CpuGemmInterleave4x4Kernel; +class CpuGemmLowpMatrixMultiplyKernel; +class CpuGemmLowpOffsetContributionKernel; +class CpuGemmLowpOffsetContributionOutputStageKernel; +class CpuGemmLowpMatrixAReductionKernel; +class CpuGemmLowpMatrixBReductionKernel; +class CpuGemmTranspose1xWKernel; +class CpuConvertQuantizedSignednessKernel; +} // namespace kernels +class CpuGemmAssemblyDispatch; +class CpuActivation; + +/** Basic function to execute GEMMLowpMatrixMultiplyCore. 
This function calls the following kernels if the DOT product instruction is not available: + * + * -# @ref kernels::CpuGemmInterleave4x4Kernel + * -# @ref kernels::CpuGemmTranspose1xWKernel + * -# @ref kernels::CpuGemmLowpMatrixMultiplyKernel + * -# @ref kernels::CpuGemmLowpOffsetContributionKernel + * -# @ref CpuActivation + * + * otherwise if the DOT product instruction is available: + * + * -# @ref kernels::CpuGemmLowpOffsetContributionKernel + * +*/ +class CpuGemmLowpMatrixMultiplyCore : public ICpuOperator +{ +public: + /** Constructor */ + CpuGemmLowpMatrixMultiplyCore(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmLowpMatrixMultiplyCore); + /** Destructor */ + ~CpuGemmLowpMatrixMultiplyCore(); + /** Initialise the kernel's inputs, output + * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:--------------|:------------------|:--------|:--------------| + * |QASYMM8 |QASYMM8 |S32 |QASYMM8 | + * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |QASYMM8 | + * |QASYMM8 |QSYMM8 |S32 |QASYMM8 | + * |QASYMM8 |QASYMM8 |S32 |S32 | + * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |S32 | + * |QASYMM8 |QSYMM8 |S32 |S32 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED | + * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED | + * |QASYMM8_SIGNED |QSYMM8 |S32 |QASYMM8_SIGNED | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |S32 | + * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |S32 | + * |QASYMM8_SIGNED |QSYMM8 |S32 |S32 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |F32 |F32 | + * + * @note GEMM_LOWP: low precision GEMM kernel + * This kernel performs the following computations: + * + * -# Convert a values from QASYMM8 to int32 and add a_offset to each of them. + * -# Convert b values from QASYMM8 to int32 add b_offset to each of them. + * -# Compute the matrix product of the resulting a * b in int32. + * + * @note The @p output type is S32 if @p gemm_info.type == GEMMLowpOutputStageType::NONE. It is QASYMM8/QASYMM8_SIGNED/F32 otherwise + * + * @param[in] a First input tensor info (Matrix A). Data type supported: QASYMM8/QASYMM8_SIGNED. + * @param[in] b Second input tensor info (Matrix B). Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL. + * @param[in] c Third input tensor info (Matrix C). It can be a nullptr. Data type supported: S32/F32 + * @param[out] dst Output tensor info. 
Data type supported: Data type supported: S32/QASYMM8/QASYMM8_SIGNED/F32 + * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and + * if the reshape of matrix B should be executed only for the first run + */ + void configure(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + ITensorInfo *dst, + const GEMMInfo &gemm_info = GEMMInfo()); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuGemmLowpMatrixMultiplyCore::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *dst, + const GEMMInfo &gemm_info = GEMMInfo()); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &tensors) override; + experimental::MemoryRequirements workspace() const override; + +private: + enum AuxTensorIdx + { + /* Slots 0 - 2 reserved for CpuGemmAssemblyDispatch */ + VectorSumCol = 3, + VectorSumRow, + TmpA, + TmpB, + MMResultS32, + SignedA, + SignedOutput, + Count + }; + + std::unique_ptr<CpuGemmAssemblyDispatch> _asm_glue; + std::unique_ptr<kernels::CpuGemmLowpMatrixMultiplyKernel> _mm_kernel; + std::unique_ptr<kernels::CpuGemmInterleave4x4Kernel> _mtx_a_reshape_kernel; + std::unique_ptr<kernels::CpuGemmTranspose1xWKernel> _mtx_b_reshape_kernel; + std::unique_ptr<kernels::CpuGemmLowpMatrixAReductionKernel> _mtx_a_reduction_kernel; + std::unique_ptr<kernels::CpuGemmLowpMatrixBReductionKernel> _mtx_b_reduction_kernel; + std::unique_ptr<kernels::CpuGemmLowpOffsetContributionKernel> _offset_contribution_kernel; + std::unique_ptr<kernels::CpuGemmLowpOffsetContributionOutputStageKernel> _offset_contribution_output_stage_kernel; + std::unique_ptr<CpuActivation> _activation_func; + std::unique_ptr<kernels::CpuConvertQuantizedSignednessKernel> _convert_to_signed_asymm; + std::unique_ptr<kernels::CpuConvertQuantizedSignednessKernel> _convert_from_signed_asymm; + + TensorInfo _vector_sum_col; + TensorInfo _vector_sum_row; + TensorInfo _tmp_a; + TensorInfo _tmp_b; + TensorInfo _mm_result_s32; + TensorInfo _signed_a; + TensorInfo _signed_output; + int32_t _a_offset; + int32_t _b_offset; + + bool _run_vector_matrix_multiplication; + bool _assembly_path; + bool _fused_assembly_path; + bool _reshape_b_only_on_first_run; + bool _is_prepared; + bool _fuse_output_stage; + bool _run_activation; + bool _flip_signedness; + GEMMInfo _gemm_info; + experimental::MemoryRequirements _aux_mem{}; +}; +} // namespace cpu +} // namespace arm_compute +#endif // ACL_SRC_CPU_OPERATORS_CPUGEMMLOWPMATRIXMULTIPLYCORE_H diff --git a/src/cpu/operators/CpuGemmLowpOutputStage.cpp b/src/cpu/operators/CpuGemmLowpOutputStage.cpp new file mode 100644 index 0000000000..4215eed199 --- /dev/null +++ b/src/cpu/operators/CpuGemmLowpOutputStage.cpp @@ -0,0 +1,165 @@ +/* + * Copyright (c) 2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/operators/CpuGemmLowpOutputStage.h" + +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h" +#include "src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h" +#include "src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h" +#include "src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +void CpuGemmLowpOutputStage::configure(ITensorInfo *src, + ITensorInfo *bias, + ITensorInfo *dst, + const GEMMLowpOutputStageInfo &info) +{ + // Perform validate step + ARM_COMPUTE_ERROR_THROW_ON(CpuGemmLowpOutputStage::validate(src, bias, dst, info)); + ARM_COMPUTE_LOG_PARAMS(src, bias, dst, info); + + switch (info.type) + { + case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT: + { + switch (info.output_data_type) + { + case DataType::QASYMM8: + { + auto k = std::make_unique<kernels::CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel>(); + k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset, + info.gemmlowp_min_bound, info.gemmlowp_max_bound); + _kernel = std::move(k); + break; + } + case DataType::QASYMM8_SIGNED: + { + auto k = std::make_unique<kernels::CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel>(); + k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset, + info.gemmlowp_min_bound, info.gemmlowp_max_bound); + _kernel = std::move(k); + break; + } + case DataType::QSYMM16: + { + auto k = std::make_unique<kernels::CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel>(); + k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_min_bound, + info.gemmlowp_max_bound); + _kernel = std::move(k); + break; + } + default: + { + ARM_COMPUTE_ERROR("Unsupported output data type."); + break; + } + } + break; + } + case GEMMLowpOutputStageType::QUANTIZE_DOWN: + { + switch (info.output_data_type) + { + case DataType::QASYMM8: + case DataType::QASYMM8_SIGNED: + { + auto k = std::make_unique<kernels::CpuGemmLowpQuantizeDownInt32ScaleKernel>(); + k->configure(src, bias, dst, &info); + _kernel = std::move(k); + break; + } + default: + { + 
ARM_COMPUTE_ERROR("Unsupported output data type."); + break; + } + } + break; + } + default: + ARM_COMPUTE_ERROR("Unsupported GEMMLowpOutputStage type."); + } +} + +Status CpuGemmLowpOutputStage::validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo &info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::UNKNOWN, + "CpuGemmLowpOutputStage cannot be used with UNKNOWN output data type."); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::QSYMM16); + ARM_COMPUTE_RETURN_ERROR_ON((info.type != GEMMLowpOutputStageType::QUANTIZE_DOWN) && + (info.type != GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)); + + switch (info.type) + { + case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT: + { + switch (dst->data_type()) + { + case DataType::QASYMM8: + return kernels::CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate( + src, bias, dst, info.gemmlowp_min_bound, info.gemmlowp_max_bound); + case DataType::QASYMM8_SIGNED: + return kernels::CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate( + src, bias, dst, info.gemmlowp_min_bound, info.gemmlowp_max_bound); + case DataType::QSYMM16: + return kernels::CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate( + src, bias, dst, info.gemmlowp_min_bound, info.gemmlowp_max_bound); + default: + return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported output data type."); + } + } + case GEMMLowpOutputStageType::QUANTIZE_DOWN: + { + switch (dst->data_type()) + { + case DataType::QASYMM8: + case DataType::QASYMM8_SIGNED: + return kernels::CpuGemmLowpQuantizeDownInt32ScaleKernel::validate(src, bias, dst, &info); + default: + return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported output data type."); + } + } + default: + return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported GEMMLowpOutputStage type."); + } +} + +void CpuGemmLowpOutputStage::run(ITensorPack &tensors) +{ + NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuGemmLowpOutputStage.h b/src/cpu/operators/CpuGemmLowpOutputStage.h new file mode 100644 index 0000000000..e5e2f41fa9 --- /dev/null +++ b/src/cpu/operators/CpuGemmLowpOutputStage.h @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_GEMMLOWP_OUTPUT_STAGE_H +#define ARM_COMPUTE_CPU_GEMMLOWP_OUTPUT_STAGE_H + +#include "arm_compute/core/Types.h" + +#include "src/cpu/ICpuOperator.h" + +/** This file contains all available output stages for GEMMLowp. + * + * In gemmlowp, the "output stage" is the process that takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyCore), + * and processes it to obtain the final ASYMM8 value. + * + * More information about the GEMMLowp output stage can be found at https://github.com/google/gemmlowp/blob/master/doc/output.md + */ + +namespace arm_compute +{ +namespace cpu +{ +/** Basic function to execute GEMMLowpQuantizeDown kernels. + * + * This function calls the following kernels: + * + * -# @ref kernels::CpuGemmLowpQuantizeDownInt32ScaleKernel + * -# @ref kernels::CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel + * -# @ref kernels::CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel + * -# @ref kernels::CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel +*/ +class CpuGemmLowpOutputStage : public ICpuOperator +{ +public: + /** Initialise the kernel's inputs, output + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 |src1 |dst | + * |:--------------|:-------------|:-------------| + * |S32 |S32 |QASYMM8 | + * |S32 |S32 |QASYMM8_SIGNED| + * |S32 |S32 |QSYMM16 | + * + * @param[in] src Input tensor info. Data type supported: S32 + * @param[in] bias Biases tensor info. Only shared biases supported and it can be a nullptr if the biases addition is not required. + * Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p src. + * @param[out] dst Output tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM16 + * @param[in] info GEMMLowp output stage metadata. + */ + void configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo &info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuGemmLowpOutputStage::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo &info); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; +}; +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_GEMMLOWP_OUTPUT_STAGE_H */ diff --git a/src/cpu/operators/CpuMatMul.cpp b/src/cpu/operators/CpuMatMul.cpp new file mode 100644 index 0000000000..f68ae9883f --- /dev/null +++ b/src/cpu/operators/CpuMatMul.cpp @@ -0,0 +1,331 @@ +/* + * Copyright (c) 2023-2024 Arm Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/cpu/operators/CpuMatMul.h" + +#include "arm_compute/core/experimental/Types.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/function_info/MatMulInfo.h" +#include "arm_compute/runtime/NEON/functions/NEMatMul.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/core/CPP/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/MemoryHelpers.h" +#include "src/core/utils/quantization/AsymmHelpers.h" +#include "src/cpu/utils/CpuAuxTensorHandler.h" + +using namespace arm_compute::experimental; + +namespace arm_compute +{ +namespace cpu +{ +namespace +{ +Status get_gemmlowp_output_stage_info(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const ActivationLayerInfo &act, + GEMMLowpOutputStageInfo &gemmlowp_output_stage_info) +{ + const auto data_type = src->data_type(); + const QuantizationInfo oq_info = dst->quantization_info(); + const UniformQuantizationInfo iq_unif = src->quantization_info().uniform(); + const UniformQuantizationInfo wq_unif = weights->quantization_info().uniform(); + const UniformQuantizationInfo oq_unif = oq_info.uniform(); + + float multiplier = (iq_unif.scale * wq_unif.scale) / oq_unif.scale; + int32_t output_multiplier; + int32_t output_shift; + + ARM_COMPUTE_RETURN_ON_ERROR( + quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); + + int32_t type_min = 0; + int32_t type_max = 0; + std::tie(type_min, type_max) = quantization::get_quantized_asymmetric_output_min_max(oq_info, act, data_type); + + gemmlowp_output_stage_info.gemmlowp_multiplier = output_multiplier; + gemmlowp_output_stage_info.gemmlowp_shift = output_shift; + gemmlowp_output_stage_info.gemmlowp_offset = oq_unif.offset; + gemmlowp_output_stage_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; + gemmlowp_output_stage_info.gemmlowp_min_bound = type_min; + gemmlowp_output_stage_info.gemmlowp_max_bound = type_max; + + return Status{}; +} +} // namespace + +CpuMatMul::CpuMatMul() + : _transpose_kernel_lhs(), + _transpose_kernel_rhs(), + _asm_glue(), + _lhs_transposed(), + _rhs_transposed(), + _original_lhs_shape(), + _original_rhs_shape(), + 
_original_dst_shape() +{ +} + +Status CpuMatMul::validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *dst, + const MatMulInfo &info, + const CpuMatMulSettings &settings, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs, dst); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16, DataType::BFLOAT16, + DataType::QASYMM8, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs->are_values_constant(), "LHS Tensor must be dynamic."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs->are_values_constant(), "RHS Tensor must be dynamic."); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(lhs); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(lhs); + + const auto adj_lhs = info.adj_lhs(); + const auto adj_rhs = info.adj_rhs(); + + const ITensorInfo *lhs_to_use = lhs; + const ITensorInfo *rhs_to_use = rhs; + TensorInfo lhs_transposed{}; + TensorInfo rhs_transposed{}; + + auto gemm_info = AsmGemmInfo(); + gemm_info.activation_info = act_info; + gemm_info.fast_mode = settings.fast_math(); + gemm_info.fixed_format = settings.fixed_format(); + + // Validate and then permute a/b + if (adj_lhs) + { + auto_init_if_empty(lhs_transposed, + lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*lhs))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuTransposeKernel::validate(lhs_to_use, &lhs_transposed)); + // Assign lhs_to_use pointer to use transposed TensorInfo + lhs_to_use = &lhs_transposed; + } + if (adj_rhs) + { + auto_init_if_empty(rhs_transposed, + rhs->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*rhs))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuTransposeKernel::validate(rhs_to_use, &rhs_transposed)); + // Assign rhs_to_use pointer to use transposed TensorInfo + rhs_to_use = &rhs_transposed; + } + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_to_use->dimension(0) != rhs_to_use->dimension(1), + "The product AB is defined only if the number of columns in A is equal to the " + "number of rows in B (after transpose)"); + + // Iterate over dimensions to be collapsed in operator - check dimensions are equivalent between tensors + for (unsigned int i = 2; i < Coordinates::num_max_dimensions; i++) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_to_use->dimension(i) != rhs_to_use->dimension(i), + "Broadcasting in Batch dimension is unsupported by this operator."); + } + + // Quantized-specific configuration + if (is_data_type_quantized(lhs->data_type())) + { + ARM_COMPUTE_RETURN_ON_ERROR(get_gemmlowp_output_stage_info(lhs_to_use, rhs_to_use, dst, + gemm_info.activation_info, gemm_info.output_stage)); + } + + if (gemm_info.fixed_format) + { + gemm_info.weight_format = WeightFormat::ANY; + arm_compute::WeightFormat expected_weight_format = WeightFormat::ANY; + ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuGemmAssemblyDispatch::has_opt_impl(expected_weight_format, lhs_to_use, + rhs_to_use, nullptr, dst, gemm_info)); + } + + cpu::CpuGemmAssemblyDispatch::validate(lhs_to_use, rhs_to_use, nullptr, dst, gemm_info); + + return Status{}; +} + +void CpuMatMul::configure(ITensorInfo *lhs, + ITensorInfo *rhs, + ITensorInfo *dst, + const MatMulInfo &info, + const CpuMatMulSettings &settings, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst); + ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst, info, settings); + ARM_COMPUTE_ERROR_THROW_ON(CpuMatMul::validate(lhs, rhs, dst, info, settings)); + + _adj_lhs = info.adj_lhs(); + _adj_rhs = 
info.adj_rhs(); + _fast_math = settings.fast_math(); + + // 1. Create and reshape tensors + // ------------------------------------------------------ + // a. Clone TensorInfo to prevent changing original tensor values during setup + // b. Change shape of lhs/dst to [x, y, 1, collapsed(z)] to match assembly kernel configuration + // c. For rhs collapse all dimensions larger than 3 to z dimension + TensorInfo lhs_to_use = *lhs->clone(); + TensorInfo dst_to_use = *dst->clone(); + TensorInfo rhs_to_use = *rhs->clone(); + + // Save starting shape of tensors + _original_lhs_shape = lhs_to_use.tensor_shape(); + _original_dst_shape = dst_to_use.tensor_shape(); + _original_rhs_shape = rhs_to_use.tensor_shape(); + + // Reshape lhs for use with assembly kernels. + lhs_to_use.set_tensor_shape( + TensorShape(_original_lhs_shape.x(), _original_lhs_shape.y(), 1, _original_lhs_shape.collapsed_from(2).z())); + dst_to_use.set_tensor_shape( + TensorShape(_original_dst_shape.x(), _original_dst_shape.y(), 1, _original_dst_shape.collapsed_from(2).z())); + rhs_to_use.set_tensor_shape(_original_rhs_shape.collapsed_from(2)); + + // 2. Configuration for transpose of lhs/rhs + // ------------------------------------------------------ + // Initialise transposed TensorInfo class for aux tensors (intermediary tensors) + if (_adj_lhs) + { + // Setup transpose LHS + _transpose_kernel_lhs = std::make_unique<cpu::kernels::CpuTransposeKernel>(); + _transpose_kernel_lhs->configure(&lhs_to_use, &_lhs_transposed); + } + + if (_adj_rhs) + { + // Setup transpose RHS + _transpose_kernel_rhs = std::make_unique<cpu::kernels::CpuTransposeKernel>(); + _transpose_kernel_rhs->configure(&rhs_to_use, &_rhs_transposed); + } + + // 3. Configure assembly kernel using transposed tensors. + // ----------------------------------------------------- + // Use transposed tensors if the corresponding transpose flags are set + // Fill AsmGemmInfo class object before configuration + _gemm_info.activation_info = act_info; + _gemm_info.fast_mode = settings.fast_math(); + _gemm_info.fixed_format = settings.fixed_format(); + _gemm_info.negated_offsets = false; + + lhs_to_use = (_adj_lhs) ? _lhs_transposed : lhs_to_use; + rhs_to_use = (_adj_rhs) ? 
_rhs_transposed : rhs_to_use; + + // Quantized-specific configuration + if (is_data_type_quantized(lhs->data_type())) + { + get_gemmlowp_output_stage_info(&lhs_to_use, &rhs_to_use, &dst_to_use, _gemm_info.activation_info, + _gemm_info.output_stage); + } + + if (_gemm_info.fixed_format) + { + _gemm_info.weight_format = WeightFormat::ANY; + arm_compute::WeightFormat expected_weight_format = WeightFormat::ANY; + ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuGemmAssemblyDispatch::has_opt_impl(expected_weight_format, &lhs_to_use, + &rhs_to_use, nullptr, dst, _gemm_info)); + // Set gemm weights info to the one returned by has_opt_impl + _gemm_info.weight_format = expected_weight_format; + // has_opt_impl may return a non fast math kernel, even if we requested one + _gemm_info.fast_mode = arm_compute::is_fixed_format_fast_math(expected_weight_format); + } + + // Configure Asm Kernel + _asm_glue = std::make_unique<cpu::CpuGemmAssemblyDispatch>(); + _asm_glue->configure(&lhs_to_use, &rhs_to_use, nullptr, &dst_to_use, + _gemm_info); // c is nullptr as bias not supported in MatMul + + // Specify memory requirements for intermediate tensors + auto asm_mem_req = _asm_glue->workspace(); + // Specify memory required by gemm kernel + int idx = 0; + for (const auto &aux : asm_mem_req) + { + _aux_mem[idx] = aux; + idx++; + } + // Memory requirements for transposed tensors + _aux_mem[TransposeLHS] = MemoryInfo(offset_int_vec(TransposeLHS), MemoryLifetime::Temporary, lhs->total_size()); + _aux_mem[TransposeRHS] = MemoryInfo(offset_int_vec(TransposeRHS), MemoryLifetime::Temporary, rhs->total_size()); +} + +void CpuMatMul::run(ITensorPack &tensors) +{ + // Retrieve tensors from tensor pack + auto lhs = tensors.get_tensor(ACL_SRC_0); + auto rhs = tensors.get_const_tensor(ACL_SRC_1); + auto dst = tensors.get_tensor(ACL_DST); + + // Reshape LHS and DST to ensure compatibility with GEMM asm kernel (Batch dimensions is 4th for lhs and dst within asm) + // Collapse RHS (necessary to support dimensions larger than 3 in gemm assembly) + lhs->info()->set_tensor_shape( + TensorShape(_original_lhs_shape.x(), _original_lhs_shape.y(), 1, + _original_lhs_shape.collapsed_from(2).z())); // Collapsed 3+ dimensions into z + dst->info()->set_tensor_shape( + TensorShape(_original_dst_shape.x(), _original_dst_shape.y(), 1, + _original_dst_shape.collapsed_from(2).z())); // Collapsed 3+ dimensions into z + rhs->info()->set_tensor_shape(_original_rhs_shape.collapsed_from(2)); + + // Initialise object to handle stored transposed tensors in auxillary memory + CpuAuxTensorHandler lhs_transposed(offset_int_vec(TransposeLHS), _lhs_transposed, tensors, true); + CpuAuxTensorHandler rhs_transposed(offset_int_vec(TransposeRHS), _rhs_transposed, tensors, true); + + // Create tensor pack for asm kernel + ITensorPack asm_tensors(tensors); + + // Run transpose lhs if necessary + if (_adj_lhs) + { + ITensorPack lhs_transpose_pack = {{TensorType::ACL_SRC, lhs}, {TensorType::ACL_DST, lhs_transposed.get()}}; + NEScheduler::get().schedule_op(_transpose_kernel_lhs.get(), Window::DimY, _transpose_kernel_lhs->window(), + lhs_transpose_pack); + asm_tensors.add_const_tensor(TensorType::ACL_SRC_0, lhs_transposed.get()); + } + // Run transpose rhs if necessary + if (_adj_rhs) + { + ITensorPack rhs_transpose_pack = {{TensorType::ACL_SRC, rhs}, {TensorType::ACL_DST, rhs_transposed.get()}}; + NEScheduler::get().schedule_op(_transpose_kernel_rhs.get(), Window::DimY, _transpose_kernel_rhs->window(), + rhs_transpose_pack); + asm_tensors.add_const_tensor(TensorType::ACL_SRC_1, 
rhs_transposed.get()); + } + // Run asm kernel + _asm_glue->run(asm_tensors); + + // Undo reshape of tensors + dst->info()->set_tensor_shape(_original_dst_shape); + lhs->info()->set_tensor_shape(_original_lhs_shape); + rhs->info()->set_tensor_shape(_original_rhs_shape); +} + +experimental::MemoryRequirements CpuMatMul::workspace() const +{ + return _aux_mem; +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuMatMul.h b/src/cpu/operators/CpuMatMul.h new file mode 100644 index 0000000000..2b1b4cf0ff --- /dev/null +++ b/src/cpu/operators/CpuMatMul.h @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_CPU_OPERATORS_CPUMATMUL_H +#define ACL_SRC_CPU_OPERATORS_CPUMATMUL_H + +#include "arm_compute/core/TensorInfo.h" + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuOperator.h" +#include "src/cpu/kernels/CpuTransposeKernel.h" +#include "src/cpu/operators/internal/CpuGemmAssemblyDispatch.h" + +namespace arm_compute +{ +// Forward Declarations +class MatMulInfo; +class CpuMatMulSettings; + +namespace cpu +{ +/** Function to execute MatMul Operation. This function calls the following functions/kernels: + * + * If adjoint/adj flag is enabled for either input lhs or rhs (or both) : + * -# @ref cpu::kernels::CpuTransposeKernel + * Then : + * -# @ref cpu::CpuGemmAssemblyDispatch + */ +class CpuMatMul : public ICpuOperator +{ +public: + /* Constructor */ + CpuMatMul(); + /* Destructor */ + ~CpuMatMul() = default; + + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuMatMul); + /** Configure operator for a given list of arguments + * + * Note: Check documentation of @ref NEMatMul for a list of supported datatypes and layouts + * + * + * @param[in] lhs Left-hand side tensor info. + * @param[in] rhs Right-hand side tensor info. + * @param[out] dst Output tensor to store the result of the batched matrix multiplication. Data types supported: same as @p lhs / @p rhs. + * @param[in] info Contains MatMul operation information described in @ref MatMulInfo. + * @param[in] settings The settings for matmul operation (i.e fast math) + * @param[in] act_info Class containing information about fused activation function. 
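+ * + * @note A typical call sequence: validate() the tensor infos, configure() the operator, allocate the auxiliary tensors reported by workspace(), then run() with an ITensorPack that maps ACL_SRC_0 to lhs, ACL_SRC_1 to rhs and ACL_DST to dst.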
+ */ + void configure(ITensorInfo *lhs, + ITensorInfo *rhs, + ITensorInfo *dst, + const MatMulInfo &info, + const CpuMatMulSettings &settings, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuMatMul::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *dst, + const MatMulInfo &info, + const CpuMatMulSettings &settings, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; + experimental::MemoryRequirements workspace() const override; + +private: + enum InternalTensorIdx + { + /* Slots 0 - 2 reserved for CpuGemmAssemblyDispatch */ + TransposeLHS = 3, + TransposeRHS, + Count + }; + + // Define unique pointers to kernels/operators used by matmul + std::unique_ptr<kernels::CpuTransposeKernel> _transpose_kernel_lhs{nullptr}; + std::unique_ptr<kernels::CpuTransposeKernel> _transpose_kernel_rhs{nullptr}; + std::unique_ptr<CpuGemmAssemblyDispatch> _asm_glue{nullptr}; + + // TensorInfo for tensors stored in auxillary memory + TensorInfo _lhs_transposed{}; + TensorInfo _rhs_transposed{}; + + // Original tensor shapes prior to reshaping tensors and collapsing dimensions + TensorShape _original_lhs_shape{}; + TensorShape _original_rhs_shape{}; + TensorShape _original_dst_shape{}; + + // Note : adj_lhs means the same as transposing lhs + bool _adj_lhs{false}; + bool _adj_rhs{false}; + bool _fast_math{false}; + AsmGemmInfo _gemm_info{}; + experimental::MemoryRequirements _aux_mem{Count}; +}; +} // namespace cpu +} // namespace arm_compute + +#endif // ACL_SRC_CPU_OPERATORS_CPUMATMUL_H diff --git a/src/cpu/operators/CpuMaxUnpooling.cpp b/src/cpu/operators/CpuMaxUnpooling.cpp new file mode 100644 index 0000000000..697fc40ab3 --- /dev/null +++ b/src/cpu/operators/CpuMaxUnpooling.cpp @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2018-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/cpu/operators/CpuMaxUnpooling.h" + +#include "src/common/utils/Log.h" +#include "src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +void CpuMaxUnpooling::configure(const ITensorInfo *src, + const ITensorInfo *indices, + ITensorInfo *dst, + const PoolingLayerInfo &pool_info) +{ + ARM_COMPUTE_LOG_PARAMS(src, indices, dst, pool_info); + auto k = std::make_unique<kernels::CpuMaxUnpoolingLayerKernel>(); + k->configure(src, indices, dst, pool_info); + _kernel = std::move(k); +} + +Status CpuMaxUnpooling::validate(const ITensorInfo *src, + const ITensorInfo *indices, + const ITensorInfo *dst, + const PoolingLayerInfo &pool_info) +{ + return kernels::CpuMaxUnpoolingLayerKernel::validate(src, indices, dst, pool_info); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuMaxUnpooling.h b/src/cpu/operators/CpuMaxUnpooling.h new file mode 100644 index 0000000000..5dc00bce9e --- /dev/null +++ b/src/cpu/operators/CpuMaxUnpooling.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2021-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_MAXUNPOOLING_H +#define ARM_COMPUTE_CPU_MAXUNPOOLING_H + +#include "src/cpu/ICpuOperator.h" + +namespace arm_compute +{ +namespace cpu +{ +/** Basic function to run @ref kernels::CpuMaxUnpoolingLayerKernel */ +class CpuMaxUnpooling : public ICpuOperator +{ +public: + /** Configure operator for a given list of arguments + * + * @param[in] src Source tensor to unpool. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[in] indices Tensor containing the offset to store the src elements in the dst tensor. + * The max pooling operator that produced @p indices should precede this function in order to + * properly reconstruct the output tensor. + * The tensor shape of this tensor has to be equal to the src tensor shape. Data type supported: U32. + * @param[out] dst Destination tensor. Data types supported: Same as @p src + * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
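+ * + * @note Each element of @p src is written to the position in @p dst given by the matching entry in @p indices; positions that are not referenced by @p indices are not written by this operator.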
+ */ + void + configure(const ITensorInfo *src, const ITensorInfo *indices, ITensorInfo *dst, const PoolingLayerInfo &pool_info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuMaxUnpooling::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, + const ITensorInfo *indices, + const ITensorInfo *dst, + const PoolingLayerInfo &pool_info); +}; +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_MAXUNPOOLING_H */ diff --git a/src/cpu/operators/CpuMul.cpp b/src/cpu/operators/CpuMul.cpp new file mode 100644 index 0000000000..ac9847111d --- /dev/null +++ b/src/cpu/operators/CpuMul.cpp @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2016-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/cpu/operators/CpuMul.h" + +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/cpu/kernels/CpuMulKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +Status CpuMul::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); + return kernels::CpuMulKernel::validate(src1, src2, dst, scale, overflow_policy, rounding_policy); +} + +void CpuMul::configure(ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_UNUSED(act_info); + ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, scale, overflow_policy, rounding_policy, act_info); + + auto k = std::make_unique<kernels::CpuMulKernel>(); + k->configure(src1, src2, dst, scale, overflow_policy, rounding_policy); + _kernel = std::move(k); +} + +void CpuMul::run(ITensorPack &tensors) +{ + ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); + auto split_dimension = static_cast<kernels::CpuMulKernel *>(_kernel.get())->get_split_dimension_hint(); + NEScheduler::get().schedule_op(_kernel.get(), split_dimension, _kernel->window(), tensors); +} + +Status CpuComplexMul::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); + return kernels::CpuComplexMulKernel::validate(src1, src2, dst); +} + +void CpuComplexMul::configure(ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_UNUSED(act_info); + ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, act_info); + + auto k = std::make_unique<kernels::CpuComplexMulKernel>(); + k->configure(src1, src2, dst); + _kernel = std::move(k); +} + +void CpuComplexMul::run(ITensorPack &tensors) +{ + ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); + NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuMul.h b/src/cpu/operators/CpuMul.h new file mode 100644 index 0000000000..82b309830b --- /dev/null +++ b/src/cpu/operators/CpuMul.h @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2016-2021, 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_MUL_H +#define ARM_COMPUTE_CPU_MUL_H + +#include "arm_compute/core/ITensorInfo.h" +#include "arm_compute/function_info/ActivationLayerInfo.h" + +#include "src/cpu/ICpuOperator.h" + +namespace arm_compute +{ +namespace cpu +{ +/** Basic function to run @ref kernels::CpuMulKernel */ +class CpuMul : public ICpuOperator +{ +public: + /** Initialise the kernel's inputs, dst and conversion policy. + * + * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported. + * For all other scale values only round to zero (implemented as round towards minus infinity) is supported. + * + * @param[in, out] src1 First input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32 + * This input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[in, out] src2 Second input tensor info. Data types supported: U8, QASYMM8 (only if @p src1 is QASYMM8), QASYMM8_SIGNED (only if @p src1 is QASYMM8_SIGNED), S16, S32, QSYMM16 (only if @p src1 is QSYMM16), F16 (only if @p src1 is F16), F32 (only if @p src1 is F32). + * This input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[out] dst Destination tensor info. Data types supported: + * - U8, only if both inputs are U8. + * - QASYMM8, only if both inputs are QASYMM8. + * - QASYMM8_SIGNED, only if @p src1 is QASYMM8_SIGNED. + * - S16. + * - QSYMM16, only if both inputs are QSYMM16. + * - S32, only if both inputs are S32 or both are QSYMM16. + * - F16, only if @p src1 is F16. + * - F32, only if both inputs are F32. + * @param[in] scale Scale to apply after multiplication. + * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. + * If @p src1, @p src2 and @p dst are all of datatype S32, scale cannot be 1/255 + * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if any of the inputs is of quantized datatype + * @param[in] rounding_policy Rounding policy. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. + */ + void configure(ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuMul::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; +}; + +/** Basic function to run @ref kernels::CpuComplexMulKernel */ +class CpuComplexMul : public ICpuOperator +{ +public: + /** Initialise the kernel's inputs, dst. + * + * @param[in, out] src1 First input tensor. Data types supported: F32. Number of channels supported: 2 (complex tensor).
+ * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[in, out] src2 Second input tensor. Data types supported: same as @p src1. Number of channels supported: same as @p src1. + * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[out] dst The dst tensor. Data types supported: same as @p src1. Number of channels: same as @p src1. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. + */ + void configure(ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuComplexMul::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; +}; +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_MUL_H */ diff --git a/src/cpu/operators/CpuPRelu.h b/src/cpu/operators/CpuPRelu.h new file mode 100644 index 0000000000..084474e2ba --- /dev/null +++ b/src/cpu/operators/CpuPRelu.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_PRELU_H +#define ARM_COMPUTE_CPU_PRELU_H + +#include "src/cpu/operators/CpuElementwise.h" + +namespace arm_compute +{ +namespace cpu +{ +/** Class to run @ref cpu::kernels::CpuArithmeticKernel except for PRelu operation */ +using CpuPRelu = CpuElementwiseArithmetic<ArithmeticOperation::PRELU>; +} // namespace cpu +} // namespace arm_compute + +#endif /* ARM_COMPUTE_CPU_PRELU_H */
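The CpuMul interface above follows the stateless-operator pattern used throughout src/cpu: configure() and validate() work purely on ITensorInfo descriptors, and run() receives the concrete tensors through an ITensorPack. A minimal usage sketch, assuming in-tree access to the internal header and the usual ACL_SRC_0/ACL_SRC_1/ACL_DST slot convention; the tensor names and policies below are illustrative, not taken from the patch:

#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/Tensor.h"
#include "src/cpu/operators/CpuMul.h"

using namespace arm_compute;

// Hedged sketch: element-wise multiply of two F32 tensors with scale 1 and no fused activation.
void run_cpu_mul_example(Tensor &a, Tensor &b, Tensor &out)
{
    cpu::CpuMul mul;
    // validate() and configure() only look at the tensor descriptors.
    if (bool(cpu::CpuMul::validate(a.info(), b.info(), out.info(), 1.f, ConvertPolicy::SATURATE,
                                   RoundingPolicy::TO_ZERO)))
    {
        mul.configure(a.info(), b.info(), out.info(), 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);

        // Concrete tensors are only bound at run time through the pack.
        ITensorPack pack = {{TensorType::ACL_SRC_0, &a}, {TensorType::ACL_SRC_1, &b}, {TensorType::ACL_DST, &out}};
        mul.run(pack);
    }
}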
\ No newline at end of file diff --git a/src/cpu/operators/CpuPermute.cpp b/src/cpu/operators/CpuPermute.cpp new file mode 100644 index 0000000000..25acc92d00 --- /dev/null +++ b/src/cpu/operators/CpuPermute.cpp @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/operators/CpuPermute.h" + +#include "src/common/utils/Log.h" +#include "src/cpu/kernels/CpuPermuteKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +void CpuPermute::configure(const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm) +{ + ARM_COMPUTE_LOG_PARAMS(src, dst, perm); + auto k = std::make_unique<kernels::CpuPermuteKernel>(); + k->configure(src, dst, perm); + _kernel = std::move(k); +} + +Status CpuPermute::validate(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm) +{ + return kernels::CpuPermuteKernel::validate(src, dst, perm); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuPermute.h b/src/cpu/operators/CpuPermute.h new file mode 100644 index 0000000000..0e0f3ae8db --- /dev/null +++ b/src/cpu/operators/CpuPermute.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ARM_COMPUTE_CPU_PERMUTE_H +#define ARM_COMPUTE_CPU_PERMUTE_H + +#include "src/cpu/ICpuOperator.h" + +namespace arm_compute +{ +namespace cpu +{ +/** Basic function to run @ref kernels::CpuPermuteKernel */ +class CpuPermute : public ICpuOperator +{ +public: + /** Configure operator for a given list of arguments + * + * @note Arbitrary permutation vectors are supported with rank not greater than 4 + * + * @param[in] src Source tensor to permute. Data types supported: All + * @param[out] dst Destination tensor. Data types supported: Same as @p src + * @param[in] perm Permutation vector + */ + void configure(const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuPermute::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm); +}; +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_PERMUTE_H */ diff --git a/src/cpu/operators/CpuPool2d.cpp b/src/cpu/operators/CpuPool2d.cpp new file mode 100644 index 0000000000..b72bde6978 --- /dev/null +++ b/src/cpu/operators/CpuPool2d.cpp @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2021, 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/operators/CpuPool2d.h" + +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/cpu/kernels/CpuPool2dKernel.h" +#include "src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h" + +using namespace arm_compute::experimental; + +namespace arm_compute +{ +namespace cpu +{ +CpuPool2d::CpuPool2d() + : _pooling_layer_kernel(), + _asm_glue(), + _is_global_pooling_layer(false), + _use_kernel_indices(false), + _data_layout(DataLayout::NCHW), + _aux_mem(1) +{ +} + +CpuPool2d::~CpuPool2d() = default; + +void CpuPool2d::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices) +{ + ARM_COMPUTE_LOG_PARAMS(src, dst, pool_info, indices); + + // Check if we can run assembly kernels.
Currently, indices are not supported by those kernels + const bool run_optimised = + bool(kernels::CpuPool2dAssemblyWrapperKernel::validate(src, dst, pool_info)) && (indices == nullptr); + + // Get data layout + _data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout; + + // Check if we have Global Pooling Layer + const unsigned int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); + const unsigned int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); + _is_global_pooling_layer = (src->dimension(idx_width) == pool_info.pool_size.width) && + (src->dimension(idx_height) == pool_info.pool_size.height); + _use_kernel_indices = pool_info.use_kernel_indices; + + if (run_optimised) + { + const CPUInfo &ci = NEScheduler::get().cpu_info(); + const unsigned int num_threads = NEScheduler::get().num_threads(); + + auto pooling_wrapper = std::make_unique<kernels::CpuPool2dAssemblyWrapperKernel>(); + ARM_COMPUTE_ERROR_ON(pooling_wrapper == nullptr); + pooling_wrapper->configure(src, dst, pool_info, ci); + + // Get kernel's memory requirements + constexpr size_t alignment = 4096; + const size_t workspace_size = pooling_wrapper->get_working_size(num_threads); + _aux_mem[0] = MemoryInfo(TensorType::ACL_INT_0, MemoryLifetime::Temporary, workspace_size, alignment); + + _asm_glue = std::move(pooling_wrapper); + } + else + { + // Configure pooling kernel + auto k = std::make_unique<kernels::CpuPool2dKernel>(); + k->configure(src, dst, pool_info, indices); + _pooling_layer_kernel = std::move(k); + } +} + +Status CpuPool2d::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const PoolingLayerInfo &pool_info, + const ITensorInfo *indices) +{ + const bool run_optimised = + bool(kernels::CpuPool2dAssemblyWrapperKernel::validate(src, dst, pool_info)) && (indices == nullptr); + + if (run_optimised) + { + return Status{}; + } + + return kernels::CpuPool2dKernel::validate(src, dst, pool_info, indices); +} + +void CpuPool2d::run(ITensorPack &tensors) +{ + ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No tensors provided"); + + if (_asm_glue) + { + const auto hints = (_is_global_pooling_layer) ? Window::DimX : Window::DimY; + NEScheduler::get().schedule_op(_asm_glue.get(), hints, _asm_glue->window(), tensors); + } + else + { + switch (_data_layout) + { + case DataLayout::NCHW: + NEScheduler::get().schedule_op(_pooling_layer_kernel.get(), + _is_global_pooling_layer ? Window::DimZ : Window::DimY, + _pooling_layer_kernel->window(), tensors); + break; + case DataLayout::NHWC: + NEScheduler::get().schedule_op(_pooling_layer_kernel.get(), + (_use_kernel_indices ? Window::DimY : Window::DimX), + _pooling_layer_kernel->window(), tensors); + break; + default: + ARM_COMPUTE_ERROR("Data layout not supported"); + } + } +} + +experimental::MemoryRequirements CpuPool2d::workspace() const +{ + return _aux_mem; +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuPool2d.h b/src/cpu/operators/CpuPool2d.h new file mode 100644 index 0000000000..ea73e3f335 --- /dev/null +++ b/src/cpu/operators/CpuPool2d.h @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2021, 2023 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_POOL2D_H +#define ARM_COMPUTE_CPU_POOL2D_H + +#include "arm_compute/core/experimental/Types.h" + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuOperator.h" + +#include <memory> + +namespace arm_compute +{ +// Forward Declarations +struct PoolingLayerInfo; + +namespace cpu +{ +/** Basic function to simulate a pooling layer with the specified pooling operation. This function calls the following kernels: + * + * -# @ref NEFillBorderKernel (executed if padding size is different from zero) + * -# @ref kernels::CpuPool2dKernel + * -# @ref kernels::CpuPool2dAssemblyWrapperKernel + */ +class CpuPool2d : public ICpuOperator +{ +public: + CpuPool2d(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuPool2d); + ~CpuPool2d(); + /** Set the src and dst tensors. + * + * @note F16 is supported for pool sizes 2 and 3 only + * + * @param[in, out] src Source tensor info. (Written to only when padding != 0) Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[out] dst Destination tensor info. Data types supported: same as @p src. + * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. + * @param[out] indices (optional) The indices of the maximal values. Data type supported: U32. + */ + void + configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuPool2d::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, + const ITensorInfo *dst, + const PoolingLayerInfo &pool_info, + const ITensorInfo *indices = nullptr); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; + experimental::MemoryRequirements workspace() const override; + +private: + std::unique_ptr<INEKernel> _pooling_layer_kernel; + std::unique_ptr<INEKernel> _asm_glue; + + bool _is_global_pooling_layer; + bool _use_kernel_indices; + DataLayout _data_layout; + experimental::MemoryRequirements _aux_mem{}; +}; +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_POOL2D_H */ diff --git a/src/cpu/operators/CpuPool3d.cpp b/src/cpu/operators/CpuPool3d.cpp new file mode 100644 index 0000000000..7fa78c1f80 --- /dev/null +++ b/src/cpu/operators/CpuPool3d.cpp @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2022 Arm Limited. 
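When the assembly path is taken, CpuPool2d does not allocate its scratch buffer itself; it only advertises the requirement (slot ACL_INT_0, size from get_working_size(), 4096-byte alignment) through workspace(). Below is a rough sketch of how a caller might satisfy such requests before run(), assuming the slot/size/alignment fields exposed by experimental::MemoryInfo and ignoring alignment handling for brevity; ACL's own runtime layers typically do this through memory groups and aux-tensor handlers instead:

#include <vector>

#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/Tensor.h"
#include "src/cpu/operators/CpuPool2d.h"

using namespace arm_compute;

// Hedged sketch: back every auxiliary-memory request of the operator with a plain U8 tensor
// and register it in the run-time pack under the slot the operator asked for.
void bind_workspace(cpu::CpuPool2d &op, ITensorPack &pack, std::vector<Tensor> &scratch)
{
    for (const auto &req : op.workspace())
    {
        if (req.size == 0)
        {
            continue; // nothing requested for this slot
        }
        scratch.emplace_back();
        scratch.back().allocator()->init(TensorInfo(TensorShape(req.size), 1, DataType::U8));
        scratch.back().allocator()->allocate(); // simplified: the requested alignment is not enforced here
        pack.add_tensor(req.slot, &scratch.back());
    }
}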
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/operators/CpuPool3d.h" + +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/runtime/Scheduler.h" + +#include "src/common/utils/Log.h" +#include "src/cpu/kernels/CpuPool3dKernel.h" + +using namespace arm_compute::experimental; + +namespace arm_compute +{ +namespace cpu +{ +CpuPool3d::CpuPool3d() : _aux_mem(1) +{ +} + +CpuPool3d::~CpuPool3d() = default; + +void CpuPool3d::configure(const ITensorInfo *src, ITensorInfo *dst, const Pooling3dLayerInfo &pool_info) +{ + ARM_COMPUTE_LOG_PARAMS(src, dst, pool_info); + + // Configure pooling kernel + auto k = std::make_unique<kernels::CpuPool3dKernel>(); + k->configure(src, dst, pool_info); + _kernel = std::move(k); +} + +Status CpuPool3d::validate(const ITensorInfo *src, const ITensorInfo *dst, const Pooling3dLayerInfo &pool_info) +{ + return kernels::CpuPool3dKernel::validate(src, dst, pool_info); +} + +void CpuPool3d::run(ITensorPack &tensors) +{ + ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No tensors provided"); + + Scheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors); +} + +experimental::MemoryRequirements CpuPool3d::workspace() const +{ + return _aux_mem; +} + +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuPool3d.h b/src/cpu/operators/CpuPool3d.h new file mode 100644 index 0000000000..235d798095 --- /dev/null +++ b/src/cpu/operators/CpuPool3d.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_POOL3D_H +#define ARM_COMPUTE_CPU_POOL3D_H + +#include "arm_compute/core/experimental/Types.h" + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuOperator.h" + +#include <memory> + +namespace arm_compute +{ +namespace cpu +{ +/** Basic function to simulate a pooling layer with the specified pooling operation. This function calls the following kernels: + * + * -# @ref kernels::CpuPool3dKernel + */ +class CpuPool3d : public ICpuOperator +{ +public: + CpuPool3d(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuPool3d); + ~CpuPool3d(); + /** Set the src and dst tensors. + * + * + * @param[in] src Source tensor info. Data types supported: F16/F32/QASYMM8/QASYMM8_SIGNED. + * @param[out] dst Destination tensor info. Data types supported: same as @p src. + * @param[in] pool_info Contains pooling operation information described in @ref Pooling3dLayerInfo. + */ + void configure(const ITensorInfo *src, ITensorInfo *dst, const Pooling3dLayerInfo &pool_info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuPool3d::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const Pooling3dLayerInfo &pool_info); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; + experimental::MemoryRequirements workspace() const override; + +private: + experimental::MemoryRequirements _aux_mem{}; +}; +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_POOL3D_H */ diff --git a/src/cpu/operators/CpuQuantize.cpp b/src/cpu/operators/CpuQuantize.cpp new file mode 100644 index 0000000000..4a3f1827c7 --- /dev/null +++ b/src/cpu/operators/CpuQuantize.cpp @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2021, 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "src/cpu/operators/CpuQuantize.h" + +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/cpu/kernels/CpuQuantizeKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +Status CpuQuantize::validate(const ITensorInfo *src, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuQuantizeKernel::validate(src, dst)); + return Status{}; +} + +void CpuQuantize::configure(const ITensorInfo *src, ITensorInfo *dst) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_LOG_PARAMS(src, dst); + + // Configure quantize kernel + auto k = std::make_unique<kernels::CpuQuantizeKernel>(); + k->configure(src, dst); + _kernel = std::move(k); +} + +void CpuQuantize::run(ITensorPack &tensors) +{ + ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); + auto split_dimension = static_cast<kernels::CpuQuantizeKernel *>(_kernel.get())->get_split_dimension_hint(); + NEScheduler::get().schedule_op(_kernel.get(), split_dimension, _kernel->window(), tensors); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuQuantize.h b/src/cpu/operators/CpuQuantize.h new file mode 100644 index 0000000000..ec1134fee4 --- /dev/null +++ b/src/cpu/operators/CpuQuantize.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_QUANTIZE_H +#define ARM_COMPUTE_CPU_QUANTIZE_H + +#include "src/cpu/ICpuOperator.h" + +namespace arm_compute +{ +namespace cpu +{ +/** Basic function to run @ref kernels::CpuQuantizeKernel that quantizes an input tensor */ +class CpuQuantize : public ICpuOperator +{ +public: + /** Set the input and output tensors. + * + * @param[in] src Source tensor info. The dimensions over the third will be interpreted as batches. Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16. + * @param[out] dst Destination tensor info with the same dimensions as the input.
Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16 + */ + void configure(const ITensorInfo *src, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuQuantize::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; +}; +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_QUANTIZE_H */ diff --git a/src/cpu/operators/CpuReshape.cpp b/src/cpu/operators/CpuReshape.cpp new file mode 100644 index 0000000000..a423abb49a --- /dev/null +++ b/src/cpu/operators/CpuReshape.cpp @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2021, 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/operators/CpuReshape.h" + +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/cpu/kernels/CpuReshapeKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +void CpuReshape::configure(const ITensorInfo *src, ITensorInfo *dst) +{ + ARM_COMPUTE_LOG_PARAMS(src, dst); + auto k = std::make_unique<kernels::CpuReshapeKernel>(); + k->configure(src, dst); + _kernel = std::move(k); +} + +Status CpuReshape::validate(const ITensorInfo *src, const ITensorInfo *dst) +{ + return kernels::CpuReshapeKernel::validate(src, dst); +} + +void CpuReshape::run(ITensorPack &tensors) +{ + ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); + if (!_is_prepared) + { + static_cast<kernels::CpuReshapeKernel *>(_kernel.get())->prepare(tensors); + _is_prepared = true; + } + const auto split_dimension = static_cast<kernels::CpuReshapeKernel *>(_kernel.get())->get_split_dimension(); + NEScheduler::get().schedule_op(_kernel.get(), split_dimension, _kernel->window(), tensors); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuReshape.h b/src/cpu/operators/CpuReshape.h new file mode 100644 index 0000000000..33da792319 --- /dev/null +++ b/src/cpu/operators/CpuReshape.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2021, 2023 Arm Limited. 
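CpuQuantize derives everything it needs from the destination descriptor: the target data type and its QuantizationInfo decide how the floating-point (or already-quantized) input is mapped. A small usage sketch follows, with an illustrative scale/offset and the single-input ACL_SRC/ACL_DST slot convention assumed:

#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/Tensor.h"
#include "src/cpu/operators/CpuQuantize.h"

using namespace arm_compute;

// Hedged sketch: quantize an F32 tensor to QASYMM8. The scale/offset are illustrative;
// the destination's QuantizationInfo drives the conversion.
void run_cpu_quantize_example(Tensor &src_f32)
{
    TensorInfo dst_info(src_f32.info()->tensor_shape(), 1, DataType::QASYMM8);
    dst_info.set_quantization_info(QuantizationInfo(0.05f, 10)); // illustrative values

    Tensor dst;
    dst.allocator()->init(dst_info);

    cpu::CpuQuantize quantize;
    if (bool(cpu::CpuQuantize::validate(src_f32.info(), dst.info())))
    {
        quantize.configure(src_f32.info(), dst.info());
        dst.allocator()->allocate();

        ITensorPack pack = {{TensorType::ACL_SRC, &src_f32}, {TensorType::ACL_DST, &dst}};
        quantize.run(pack);
    }
}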
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_RESHAPE_H +#define ARM_COMPUTE_CPU_RESHAPE_H + +#include "arm_compute/core/Window.h" + +#include "src/cpu/ICpuOperator.h" + +namespace arm_compute +{ +namespace cpu +{ +/** Basic function to run @ref kernels::CpuReshapeKernel */ +class CpuReshape : public ICpuOperator +{ +public: + /** Configure operator for a given list of arguments + * + * @param[in] src Source tensor info. Data type supported: All + * @param[out] dst Destination info. Data type supported: Same as @p src + */ + void configure(const ITensorInfo *src, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuReshape::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; + +private: + bool _is_prepared{false}; +}; +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_RESHAPE_H */ diff --git a/src/cpu/operators/CpuScale.cpp b/src/cpu/operators/CpuScale.cpp new file mode 100644 index 0000000000..7df9296931 --- /dev/null +++ b/src/cpu/operators/CpuScale.cpp @@ -0,0 +1,289 @@ +/* + * Copyright (c) 2021-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/cpu/operators/CpuScale.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/core/utils/ScaleUtils.h" +#include "src/cpu/kernels/CpuScaleKernel.h" +#include "support/Rounding.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace +{ +void precompute_dx_dy_offsets( + ITensor *dx, ITensor *dy, ITensor *offsets, float wr, float hr, SamplingPolicy sampling_policy, bool align_corners) +{ + ARM_COMPUTE_ERROR_ON(offsets == nullptr); + float sampling_offset = 0.0f; + if (sampling_policy == SamplingPolicy::CENTER) + { + sampling_offset = 0.5f; + } + + Window win; + win.set(Window::DimX, Window::Dimension(0, offsets->info()->dimension(0), 1)); + win.set(Window::DimY, Window::Dimension(0, offsets->info()->dimension(1), 1)); + + if (dx != nullptr && dy != nullptr) + { + // Pre-compute the offset and pixel's distance for BILINEAR interpolation + Iterator offsets_it(offsets, win); + Iterator dx_it(dx, win); + Iterator dy_it(dy, win); + + execute_window_loop( + win, + [&](const Coordinates &id) + { + const float in_x = (id.x() + sampling_offset) * wr - sampling_offset; + const float in_y = (id.y() + sampling_offset) * hr - sampling_offset; + const int in_xi = std::floor(in_x); + const int in_yi = std::floor(in_y); + + *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi; + *reinterpret_cast<float *>(dx_it.ptr()) = in_x - in_xi; + *reinterpret_cast<float *>(dy_it.ptr()) = in_y - in_yi; + }, + offsets_it, dx_it, dy_it); + } + else + { + // Pre-compute the offset for NEAREST interpolation + Iterator offsets_it(offsets, win); + + execute_window_loop( + win, + [&](const Coordinates &id) + { + const float float_in_xi = (id.x() + sampling_offset) * wr; + const auto in_xi = static_cast<size_t>( + align_corners ? arm_compute::utils::rounding::round_half_away_from_zero(float_in_xi) + : std::floor(float_in_xi)); + *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi; + }, + offsets_it); + } +} +} // namespace + +void CpuScale::configure(ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_THROW_ON(CpuScale::validate(src, dst, info)); + ARM_COMPUTE_LOG_PARAMS(src, dst, info); + + _scale_info = info; + _is_prepared = false; + + // Get data layout and width/height indices + _data_layout = _scale_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : _scale_info.data_layout; + const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); + + // Compute the ratio between source width/height and destination width/height + const bool is_align_corners_used = + _scale_info.align_corners && + arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(_scale_info.sampling_policy); + const auto wr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_width), + dst->dimension(idx_width), is_align_corners_used); + const auto hr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_height), + dst->dimension(idx_height), is_align_corners_used); + + // Area interpolation behaves as Nearest Neighbour in case of up-sampling + InterpolationPolicy policy_to_use = + (_scale_info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) + ? 
InterpolationPolicy::NEAREST_NEIGHBOR + : _scale_info.interpolation_policy; + + // Get the tensor shape + TensorShape shape(dst->dimension(idx_width)); + shape.set(1, dst->dimension(idx_height), false); + + TensorInfo tensor_info_offsets(shape, Format::S32); + TensorInfo tensor_info_dxdy(shape, Format::F32); + + auto dx = std::make_unique<TensorInfo>(tensor_info_dxdy); + auto dy = std::make_unique<TensorInfo>(tensor_info_dxdy); + auto offsets = std::make_unique<TensorInfo>(tensor_info_offsets); + auto scale_kernel = std::make_unique<kernels::CpuScaleKernel>(); + switch (policy_to_use) + { + case InterpolationPolicy::NEAREST_NEIGHBOR: + { + scale_kernel->configure(src, nullptr, nullptr, offsets.get(), dst, info); + break; + } + case InterpolationPolicy::BILINEAR: + { + scale_kernel->configure(src, dx.get(), dy.get(), offsets.get(), dst, info); + break; + } + case InterpolationPolicy::AREA: + { + scale_kernel->configure(src, nullptr, nullptr, nullptr, dst, info); + break; + } + default: + ARM_COMPUTE_ERROR("Unsupported interpolation mode"); + } + _kernel = std::move(scale_kernel); +} + +Status CpuScale::validate(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON(info.sampling_policy != SamplingPolicy::CENTER && + info.sampling_policy != SamplingPolicy::TOP_LEFT); + + ITensorInfo *offsets = nullptr; + ITensorInfo *dx = nullptr; + ITensorInfo *dy = nullptr; + + // Get data layout and width/height indices + const DataLayout data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout; + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + + // Compute the ratio between source width/height and destination width/height + const bool is_align_corners_used = + info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy); + const auto wr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_width), + dst->dimension(idx_width), is_align_corners_used); + const auto hr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_height), + dst->dimension(idx_height), is_align_corners_used); + + // Area interpolation behaves as Nearest Neighbour in case of up-sampling + InterpolationPolicy policy_to_use = + (info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) + ? 
InterpolationPolicy::NEAREST_NEIGHBOR + : info.interpolation_policy; + + // Get the tensor shape of auxiliary buffers + const TensorShape shape(dst->dimension(idx_width), dst->dimension(idx_height)); + TensorInfo tensor_info_offsets(shape, Format::S32); + TensorInfo tensor_info_dx(shape, Format::F32); + TensorInfo tensor_info_dy(shape, Format::F32); + switch (policy_to_use) + { + case InterpolationPolicy::NEAREST_NEIGHBOR: + offsets = &tensor_info_offsets; + break; + case InterpolationPolicy::BILINEAR: + offsets = &tensor_info_offsets; + dx = &tensor_info_dx; + dy = &tensor_info_dy; + break; + default: + break; + } + + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::CpuScaleKernel::validate(src->clone().get(), dx, dy, offsets, dst->clone().get(), info)); + return Status{}; +} + +void CpuScale::prepare(ITensorPack &tensors) +{ + if (!_is_prepared) + { + _is_prepared = true; + const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + auto dx = tensors.get_tensor(TensorType::ACL_INT_0); + auto dy = tensors.get_tensor(TensorType::ACL_INT_1); + auto offsets = tensors.get_tensor(TensorType::ACL_INT_2); + + // Get data layout and width/height indices + const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); + + // Compute the ratio between source width/height and destination width/height + const bool is_align_corners_used = + _scale_info.align_corners && + arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(_scale_info.sampling_policy); + const auto wr = arm_compute::scale_utils::calculate_resize_ratio( + src->info()->dimension(idx_width), dst->info()->dimension(idx_width), is_align_corners_used); + const auto hr = arm_compute::scale_utils::calculate_resize_ratio( + src->info()->dimension(idx_height), dst->info()->dimension(idx_height), is_align_corners_used); + + // Area interpolation behaves as Nearest Neighbour in case of up-sampling + InterpolationPolicy policy_to_use = + (_scale_info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) + ?
InterpolationPolicy::NEAREST_NEIGHBOR + : _scale_info.interpolation_policy; + const SamplingPolicy sampling_policy = _scale_info.sampling_policy; + + bool precompute_indices_weights = arm_compute::scale_utils::is_precomputation_required( + _data_layout, src->info()->data_type(), policy_to_use, _scale_info.border_mode); + + if (precompute_indices_weights) + { + switch (policy_to_use) + { + case InterpolationPolicy::NEAREST_NEIGHBOR: + { + // Pre-compute offsets for nearest interpolation + precompute_dx_dy_offsets(nullptr, nullptr, offsets, wr, hr, sampling_policy, is_align_corners_used); + break; + } + case InterpolationPolicy::BILINEAR: + { + // Pre-compute dx, dy and offsets for bilinear interpolation + precompute_dx_dy_offsets(dx, dy, offsets, wr, hr, sampling_policy, is_align_corners_used); + break; + } + case InterpolationPolicy::AREA: + { + break; + } + default: + ARM_COMPUTE_ERROR("Unsupported interpolation mode"); + } + } + else + { + if (policy_to_use != InterpolationPolicy::NEAREST_NEIGHBOR && + policy_to_use != InterpolationPolicy::BILINEAR && policy_to_use != InterpolationPolicy::AREA) + { + ARM_COMPUTE_ERROR("Unsupported interpolation mode"); + } + } + } +} + +void CpuScale::run(ITensorPack &tensors) +{ + ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); + prepare(tensors); + NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuScale.h b/src/cpu/operators/CpuScale.h new file mode 100644 index 0000000000..c12a8e733a --- /dev/null +++ b/src/cpu/operators/CpuScale.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2021-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_SCALE_H +#define ARM_COMPUTE_CPU_SCALE_H + +#include "arm_compute/core/experimental/Types.h" +#include "arm_compute/core/ITensorInfo.h" +#include "arm_compute/core/KernelDescriptors.h" + +#include "src/cpu/ICpuKernel.h" +#include "src/cpu/ICpuOperator.h" + +#include <memory> + +namespace arm_compute +{ +namespace cpu +{ +/** Basic function to compute Scale */ +class CpuScale : public ICpuOperator +{ +public: + /** Initialize the function's source, destination, interpolation type and border_mode. + * + * @param[in, out] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/U8/S16/F16/F32. 
(Written to only for @p border_mode != UNDEFINED) + * @param[out] dst Destination tensor info. Data type supported: Same as @p src. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane. + * @param[in] info @ref ScaleKernelInfo to be used for configuration + * + * @note Using S8 data type only supports NHWC, @p border_mode Replicate, and @p policy Bilinear + */ + void configure(ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuScale::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info); + + // Inherited methods overridden: + void prepare(ITensorPack &tensors) override; + void run(ITensorPack &tensors) override; + +private: + ScaleKernelInfo _scale_info{InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::UNDEFINED}; + DataLayout _data_layout{DataLayout::UNKNOWN}; + bool _is_prepared{false}; +}; +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_SCALE_H */ diff --git a/src/cpu/operators/CpuSoftmax.cpp b/src/cpu/operators/CpuSoftmax.cpp new file mode 100644 index 0000000000..fecee7d765 --- /dev/null +++ b/src/cpu/operators/CpuSoftmax.cpp @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2021, 2023-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
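Because CpuScale::run() calls prepare() first, the pack it receives must already contain the auxiliary buffers that prepare() looks up: ACL_INT_0 (dx), ACL_INT_1 (dy) and ACL_INT_2 (offsets), shaped one element per output (x, y) position as built in configure() above. A hedged sketch of assembling that pack by hand, assuming an NCHW destination so that dimensions 0 and 1 are width and height; higher-level runtime functions normally manage these buffers instead:

#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/Tensor.h"
#include "src/cpu/operators/CpuScale.h"

using namespace arm_compute;

// Hedged sketch: run CpuScale with caller-provided dx/dy/offset buffers.
void run_cpu_scale_example(Tensor &src, Tensor &dst, const ScaleKernelInfo &info)
{
    cpu::CpuScale scale;
    scale.configure(src.info(), dst.info(), info);

    // One entry per output (x, y) position; S32 offsets, F32 fractional distances.
    // Assuming NCHW here; the width/height indices depend on the data layout in general.
    const TensorShape plane(dst.info()->dimension(0), dst.info()->dimension(1));
    Tensor dx, dy, offsets;
    dx.allocator()->init(TensorInfo(plane, Format::F32));
    dy.allocator()->init(TensorInfo(plane, Format::F32));
    offsets.allocator()->init(TensorInfo(plane, Format::S32));
    dx.allocator()->allocate();
    dy.allocator()->allocate();
    offsets.allocator()->allocate();

    ITensorPack pack = {{TensorType::ACL_SRC, &src},  {TensorType::ACL_DST, &dst},
                        {TensorType::ACL_INT_0, &dx}, {TensorType::ACL_INT_1, &dy},
                        {TensorType::ACL_INT_2, &offsets}};
    scale.run(pack); // prepare() fills dx/dy/offsets on the first call when precomputation is used
}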
+ */ +#include "src/cpu/operators/CpuSoftmax.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/core/helpers/MemoryHelpers.h" +#include "src/core/helpers/SoftmaxHelpers.h" +#include "src/cpu/kernels/CpuSoftmaxKernel.h" +#include "src/cpu/utils/CpuAuxTensorHandler.h" + +using namespace arm_compute::experimental; + +namespace arm_compute +{ +namespace cpu +{ +CpuSoftmaxGeneric::CpuSoftmaxGeneric() : _softmax_kernel(), _tmp(), _aux_mem(InternalTensorIdx::COUNT) +{ +} + +void CpuSoftmaxGeneric::configure(const ITensorInfo *src, ITensorInfo *dst, float beta, int32_t axis, bool is_log) +{ + // Perform validation step + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_THROW_ON(CpuSoftmaxGeneric::validate(src, dst, beta, axis)); + ARM_COMPUTE_LOG_PARAMS(src, dst, beta, axis); + + const unsigned int actual_axis = + static_cast<unsigned int>(wrap_around(axis, static_cast<int32_t>(src->num_dimensions()))); + + _axis = actual_axis; + + const ITensorInfo *tmp_input = src; + + TensorInfo tensor_info_tmp; + if (is_data_type_quantized_asymmetric(src->data_type())) + { + // Create intermediate tensors shapes + const TensorInfo input_info = tmp_input->clone()->reset_padding().set_is_resizable(true); + tensor_info_tmp = input_info.clone()->set_data_type(DataType::F32); + } + + // Init intermediate tensors + _tmp = TensorInfo(tensor_info_tmp); + + // Configure kernels + auto sm = std::make_unique<kernels::CpuSoftmaxKernel>(); + + // Softmax 2D case + sm->configure(tmp_input, dst, beta, is_log, actual_axis, &_tmp); + + _softmax_kernel = std::move(sm); + + if (_tmp.total_size() > 0) + { + _aux_mem[InternalTensorIdx::TMP] = + MemoryInfo(offset_int_vec(InternalTensorIdx::TMP), MemoryLifetime::Temporary, _tmp.total_size()); + } +} + +Status +CpuSoftmaxGeneric::validate(const ITensorInfo *src, const ITensorInfo *dst, float beta, int32_t axis, bool is_log) +{ + // Perform validation step + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->num_dimensions() > 4, "Only up to 4 dimensions are supported"); + ARM_COMPUTE_UNUSED(beta); + ARM_COMPUTE_RETURN_ERROR_ON(axis < static_cast<int32_t>(-src->num_dimensions()) || + static_cast<int32_t>(src->num_dimensions()) <= axis); + + // Create intermediate tensor info + TensorInfo tensor_info_tmp; + + if (is_data_type_quantized_asymmetric(src->data_type())) + { + tensor_info_tmp = src->clone()->set_data_type(DataType::F32).set_is_resizable(true); + } + const unsigned int actual_axis = + static_cast<unsigned int>(wrap_around(axis, static_cast<int32_t>(src->num_dimensions()))); + + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::CpuSoftmaxKernel::validate(src, dst, beta, actual_axis, is_log, &tensor_info_tmp)); + + return Status{}; +} + +void CpuSoftmaxGeneric::run(ITensorPack &tensors) +{ + ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); + + auto src = tensors.get_const_tensor(TensorType::ACL_SRC); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + CpuAuxTensorHandler tmp(offset_int_vec(InternalTensorIdx::TMP), _tmp, tensors, true); + + ITensorPack softmax_pack; + + softmax_pack = {{TensorType::ACL_SRC_0, src}, {TensorType::ACL_DST_0, dst}, {TensorType::ACL_DST_1, tmp.get()}}; + + if (_axis == 0) + { + NEScheduler::get().schedule_op(_softmax_kernel.get(), Window::DimY, 
_softmax_kernel->window(), softmax_pack); + } + else + { + NEScheduler::get().schedule_op(_softmax_kernel.get(), Window::DimX, _softmax_kernel->window(), softmax_pack); + } +} + +experimental::MemoryRequirements CpuSoftmaxGeneric::workspace() const +{ + return _aux_mem; +} + +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuSoftmax.h b/src/cpu/operators/CpuSoftmax.h new file mode 100644 index 0000000000..6ba3476eff --- /dev/null +++ b/src/cpu/operators/CpuSoftmax.h @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2021-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_CPU_OPERATORS_CPUSOFTMAX_H +#define ACL_SRC_CPU_OPERATORS_CPUSOFTMAX_H + +#include "arm_compute/core/experimental/Types.h" +#include "arm_compute/core/TensorInfo.h" + +#include "src/cpu/ICpuKernel.h" +#include "src/cpu/ICpuOperator.h" +#include "src/cpu/operators/CpuPermute.h" + +#include <memory> + +namespace arm_compute +{ +namespace cpu +{ +class CpuSoftmaxKernel; + +/** Basic function to compute a SoftmaxLayer and a Log SoftmaxLayer. + * + * Softmax is calculated by : + * @f[ out = exp((x - max(x)) * beta) / sum(exp((x - max(x)) * beta)) @f] + * + * Log Softmax is calculated by : + * @f[ out = (x - max(x) * beta) - log(\sum{e^{x - max(x) * beta}}) @f] + * + * This function runs the following function/kernels: + * -# If axis is not 0: + * -# @ref CpuPermute + * -# @ref kernels::CpuSoftmaxKernel + */ +class CpuSoftmaxGeneric : public ICpuOperator +{ +public: + CpuSoftmaxGeneric(); + /** Set the input and output tensors. + * + * @param[in,out] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. If the width is not a + * multiple of the internal processing block size, @ref NEFillBorder replicates the + * last value of each row to the nearest multiple. + * @param[out] dst Destination tensor info. Data types supported: same as @p input. + * @param[in] beta (Optional) A scaling factor for the exponent. + * @param[in] axis (Optional) The dimension in which to apply the function. E.g. for input of shape 4x5x6 and + * axis=1, softmax will be applied to 4x6=24 vectors of size 5.
Defaults to 0 + * @param[in] is_log True if the operation is log-softmax + */ + void configure(const ITensorInfo *src, ITensorInfo *dst, float beta = 1.0f, int32_t axis = 0, bool is_log = false); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuSoftmaxGeneric::configure() + * + * @return a status + */ + static Status + validate(const ITensorInfo *src, const ITensorInfo *dst, float beta = 1.0f, int32_t axis = 0, bool is_log = false); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; + experimental::MemoryRequirements workspace() const override; + +private: + enum InternalTensorIdx + { + TMP = 0, + PERMUTED_SRC, + PERMUTED_DST, + COUNT + }; + + std::unique_ptr<ICPPKernel> _softmax_kernel; + + TensorInfo _tmp; + + experimental::MemoryRequirements _aux_mem{}; + + unsigned int _axis = 0; +}; + +} // namespace cpu +} // namespace arm_compute +#endif // ACL_SRC_CPU_OPERATORS_CPUSOFTMAX_H diff --git a/src/cpu/operators/CpuSub.cpp b/src/cpu/operators/CpuSub.cpp new file mode 100644 index 0000000000..7d27efbc96 --- /dev/null +++ b/src/cpu/operators/CpuSub.cpp @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2021-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
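Note on CpuSoftmaxGeneric above: the axis argument accepts negative values and is resolved with wrap_around(), so axis = -1 on a 3-D input becomes actual_axis = 2, and quantized inputs additionally publish a temporary F32 buffer through workspace(). The caller-side sketch below shows one way of honouring those memory requirements by hand; shapes are made up, the alignment field is ignored for brevity, and a real integration would normally delegate this to a memory manager rather than allocating scratch tensors inline.

// Sketch: drive CpuSoftmaxGeneric on an F32 tensor and service its aux-memory requests.
#include <vector>

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/Tensor.h"
#include "src/cpu/operators/CpuSoftmax.h"

using namespace arm_compute;

void softmax_example()
{
    // 32 rows of 128 logits; softmax along dimension 0 (the row contents)
    TensorInfo src_info(TensorShape(128U, 32U), 1, DataType::F32);
    TensorInfo dst_info(TensorShape(128U, 32U), 1, DataType::F32);

    cpu::CpuSoftmaxGeneric softmax;
    softmax.configure(&src_info, &dst_info, /* beta */ 1.0f, /* axis */ 0, /* is_log */ false);

    Tensor src, dst;
    src.allocator()->init(src_info);
    dst.allocator()->init(dst_info);
    src.allocator()->allocate();
    dst.allocator()->allocate();

    ITensorPack pack;
    pack.add_const_tensor(TensorType::ACL_SRC, &src);
    pack.add_tensor(TensorType::ACL_DST, &dst);

    // For F32 the TMP slot reports zero size; for QASYMM8/QASYMM8_SIGNED inputs the
    // operator asks for an F32 scratch tensor here, keyed by the slot id.
    const experimental::MemoryRequirements reqs = softmax.workspace();
    std::vector<Tensor> scratch(reqs.size());
    for (size_t i = 0; i < reqs.size(); ++i)
    {
        if (reqs[i].size == 0)
        {
            continue; // unused slot
        }
        scratch[i].allocator()->init(TensorInfo(TensorShape(reqs[i].size), 1, DataType::U8));
        scratch[i].allocator()->allocate();
        pack.add_tensor(reqs[i].slot, &scratch[i]);
    }

    softmax.run(pack);
}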
+ */ +#include "src/cpu/operators/CpuSub.h" + +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/cpu/kernels/CpuSubKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +void CpuSub::configure(const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_UNUSED(act_info); + ARM_COMPUTE_LOG_PARAMS(src0, src1, dst, policy); + auto k = std::make_unique<kernels::CpuSubKernel>(); + k->configure(src0, src1, dst, policy); + _kernel = std::move(k); +} + +Status CpuSub::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); + return kernels::CpuSubKernel::validate(src0, src1, dst, policy); +} + +void CpuSub::run(ITensorPack &tensors) +{ + const auto split_dimension = static_cast<kernels::CpuSubKernel *>(_kernel.get())->get_split_dimension(); + + NEScheduler::get().schedule_op(_kernel.get(), split_dimension, _kernel->window(), tensors); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuSub.h b/src/cpu/operators/CpuSub.h new file mode 100644 index 0000000000..d1782a1d3c --- /dev/null +++ b/src/cpu/operators/CpuSub.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2021-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_SUB_H +#define ARM_COMPUTE_CPU_SUB_H + +#include "arm_compute/function_info/ActivationLayerInfo.h" + +#include "src/cpu/ICpuOperator.h" + +namespace arm_compute +{ +namespace cpu +{ +/** Basic function to run @ref kernels::CpuSubKernel */ +class CpuSub : public ICpuOperator +{ +public: + /** Initialise the kernel's inputs, dst and conversion policy. + * + * Valid configurations (src0,src1) -> dst : + * + * - (U8,U8) -> U8 + * - (QASYMM8, QASYMM8) -> QASYMM8 + * - (QASYMM8_SIGNED, QASYMM8_SIGNED) -> QASYMM8_SIGNED + * - (S16,S16) -> S16 + * - (S32,S32) -> S32 + * - (F16,F16) -> F16 + * - (F32,F32) -> F32 + * + * @param[in] src0 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32 + * @param[in] src1 Second tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32 + * @param[out] dst Output tensor info. 
Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32 + * @param[in] policy Policy to use to handle overflow. Convert policy cannot be WRAP if datatype is quantized. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. + */ + void configure(const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuSub::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; +}; +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_SUB_H */ diff --git a/src/cpu/operators/CpuTranspose.cpp b/src/cpu/operators/CpuTranspose.cpp new file mode 100644 index 0000000000..ea548e0511 --- /dev/null +++ b/src/cpu/operators/CpuTranspose.cpp @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/operators/CpuTranspose.h" + +#include "src/common/utils/Log.h" +#include "src/cpu/kernels/CpuTransposeKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +void CpuTranspose::configure(const ITensorInfo *src, ITensorInfo *dst) +{ + ARM_COMPUTE_LOG_PARAMS(src, dst); + auto k = std::make_unique<kernels::CpuTransposeKernel>(); + k->configure(src, dst); + _kernel = std::move(k); +} + +Status CpuTranspose::validate(const ITensorInfo *src, const ITensorInfo *dst) +{ + return kernels::CpuTransposeKernel::validate(src, dst); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuTranspose.h b/src/cpu/operators/CpuTranspose.h new file mode 100644 index 0000000000..8934481ef6 --- /dev/null +++ b/src/cpu/operators/CpuTranspose.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_TRANSPOSE_H +#define ARM_COMPUTE_CPU_TRANSPOSE_H + +#include "src/cpu/ICpuOperator.h" + +namespace arm_compute +{ +namespace cpu +{ +/** Basic function to run @ref kernels::CpuTransposeKernel */ +class CpuTranspose : public ICpuOperator +{ +public: + /** Configure operator for a given list of arguments + * + * @param[in] src Source tensor to permute. Data types supported: All + * @param[out] dst Destintation tensor. Data types supported: Same as @p src + */ + void configure(const ITensorInfo *src, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuTranspose::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst); +}; +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_TRANSPOSE_H */ diff --git a/src/cpu/operators/CpuWinogradConv2d.cpp b/src/cpu/operators/CpuWinogradConv2d.cpp new file mode 100644 index 0000000000..7d81aee0e9 --- /dev/null +++ b/src/cpu/operators/CpuWinogradConv2d.cpp @@ -0,0 +1,478 @@ +/* + * Copyright (c) 2021-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/cpu/operators/CpuWinogradConv2d.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/FunctionDescriptors.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/core/CPP/Validate.h" +#include "src/core/helpers/MemoryHelpers.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/kernels/assembly/winograd.hpp" +#include "src/core/NEON/kernels/convolution/common/tensor.hpp" +#include "src/core/NEON/kernels/convolution/common/utils.hpp" +#include "src/core/utils/AssemblyUtils.h" +#include "src/cpu/kernels/assembly/arm_gemm.hpp" +#include "src/cpu/kernels/CpuWinogradConv2dKernel.h" +#include "src/cpu/operators/CpuActivation.h" +#include "src/cpu/operators/CpuPermute.h" +#include "src/cpu/utils/CpuAuxTensorHandler.h" +#include "support/Cast.h" + +namespace arm_compute +{ +namespace cpu +{ +using namespace arm_compute::experimental; +using namespace arm_compute::utils::cast; + +namespace +{ +inline Tensor4DShape internal_get_shape(const ITensorInfo *in) +{ + const DataLayout data_layout = in->data_layout(); + const int in_width = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH)); + const int in_height = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT)); + const int in_channels = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL)); + const int in_batches = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES)); + + return Tensor4DShape{in_batches, in_height, in_width, in_channels}; +} + +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info) +{ + ARM_COMPUTE_UNUSED(dst, weights); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, + "Winograd layer only supports unit strides."); + if (biases != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases); + ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); + } + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); + return Status{}; +} + +bool get_winograd_kernel_implementation(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + arm_conv::winograd::WinogradImpl *winograd_impl, + std::unique_ptr<arm_conv::ConvolutionArgs> &conv_args) +{ + arm_conv::winograd::WinogradConfig winograd_cfg; + arm_gemm::GemmConfig cfg; + + const DataType data_type = src->data_type(); + Tensor4DShape in_shape{internal_get_shape(src)}; + Tensor4DShape out_shape{internal_get_shape(dst)}; + Tensor4DShape kernel_shape{internal_get_shape(weights)}; + uint32_t nthreads = NEScheduler::get().num_threads(); + // Get configuration arguments for Winograd + winograd_cfg.output_rows = 0; + winograd_cfg.output_cols = 0; + conv_args = std::make_unique<arm_conv::ConvolutionArgs>( + in_shape.n_batches, + arm_conv::Shape2D{static_cast<uint32_t>(in_shape.n_rows), 
static_cast<uint32_t>(in_shape.n_cols)}, + in_shape.n_channels, conv_info.pad_top(), conv_info.pad_left(), + arm_conv::Shape2D{static_cast<uint32_t>(out_shape.n_rows), static_cast<uint32_t>(out_shape.n_cols)}, + out_shape.n_channels, + arm_conv::Shape2D{static_cast<uint32_t>(kernel_shape.n_rows), static_cast<uint32_t>(kernel_shape.n_cols)}, + assembly_utils::map_to_arm_gemm_activation(act_info)); + + bool success = false; + if (data_type == DataType::F32) + { + success = arm_conv::winograd::get_implementation<float>(*winograd_impl, &CPUInfo::get(), *conv_args, nthreads, + enable_fast_math, &winograd_cfg, nullptr); + } +#if defined(__aarch64__) && defined(ENABLE_FP16_KERNELS) + else if (data_type == DataType::F16) + { + success = arm_conv::winograd::get_implementation<__fp16>(*winograd_impl, &CPUInfo::get(), *conv_args, nthreads, + enable_fast_math, &winograd_cfg, nullptr); + } +#endif // defined(__aarch64__) && defined(ENABLE_FP16_KERNELS) + else + { + success = false; + } + return success; +} +inline bool fuse_function_supported(const ActivationLayerInfo &act_info) +{ + return act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU || + act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU; +} +} // namespace + +CpuWinogradConv2d::CpuWinogradConv2d() + + : _gemm_function(std::make_unique<CpuGemm>()), + _activation_func(std::make_unique<CpuActivation>()), + _transform_input_kernel(nullptr), + _transform_output_kernel(nullptr), + _permute_input(std::make_unique<CpuPermute>()), + _permute_output(std::make_unique<CpuPermute>()), + _permute_weights(std::make_unique<CpuPermute>()), + _aux_mem(AuxTensorIdx::Count), + _conv_args{nullptr}, + _winograd_impl{}, + _data_layout(), + _winograd_transformed_input{}, + _winograd_transformed_output{}, + _winograd_transformed_weights{}, + _input_workspace(), + _output_workspace(), + _weights_hwio(), + _input_nhwc(), + _output_nhwc(), + _is_prepared{false}, + _run_activation{false} +{ +} + +CpuWinogradConv2d::~CpuWinogradConv2d() = default; + +void CpuWinogradConv2d::configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate(src, weights, biases, dst, conv_info, act_info, enable_fast_math)); + ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv_info, act_info, enable_fast_math); + ARM_COMPUTE_UNUSED(biases); + const DataType data_type = src->data_type(); + uint32_t nthreads = NEScheduler::get().num_threads(); + _data_layout = src->data_layout(); + const Tensor4DShape kernel_shape{internal_get_shape(weights)}; + + bool success = get_winograd_kernel_implementation(src, weights, dst, conv_info, act_info, enable_fast_math, + &_winograd_impl, _conv_args); + + ARM_COMPUTE_EXIT_ON_MSG_VAR(!success, "Unsupported kernel size: %d x %d.\n", kernel_shape.n_rows, + kernel_shape.n_cols); + ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using input transform: %s\n", + _winograd_impl.input_transform->get_name().c_str()); + ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using weight transform: %s\n", + _winograd_impl.input_transform->get_name().c_str()); + ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using output transform: %s\n", + _winograd_impl.input_transform->get_name().c_str()); + + const bool has_impl 
= ((_winograd_impl.input_transform != nullptr) && + (_winograd_impl.output_transform != nullptr) && (_winograd_impl.gemm_args != nullptr)); + if (has_impl) + { + // Determine how much working space is required, allocate it. + const size_t input_workspace_size = + _winograd_impl.input_transform->get_working_space_size(*_conv_args, nthreads); + const size_t output_workspace_size = + _winograd_impl.output_transform->get_working_space_size(*_conv_args, nthreads); + + TensorInfo input_workspace_info(TensorShape(input_workspace_size), 1, DataType::U8); + TensorInfo output_workspace_info(TensorShape(output_workspace_size), 1, DataType::U8); + _input_workspace = input_workspace_info; + _output_workspace = output_workspace_info; + + const auto &wds = _winograd_impl.winograd_spec; + + // Preparing winograd transformed input tensor + const size_t data_type_size = src->element_size(); + const uint32_t m = _winograd_impl.gemm_args->_Msize; // Total number of tiles + const uint32_t k = _winograd_impl.gemm_args->_Ksize; // Input channels + const uint32_t n = _winograd_impl.gemm_args->_Nsize; // Output channels + const uint32_t n_gemms = _winograd_impl.gemm_args->_nmulti; + const uint32_t n_batches = _winograd_impl.gemm_args->_nbatches; + constexpr size_t storage_alignment = 64; + + const TensorShape a_shape(k, m, n_batches, n_gemms); + Strides a_strides(data_type_size); + a_strides.set(1, data_type_size * _winograd_impl.winograd_spec.input_ld_row); + a_strides.set(2, data_type_size * _winograd_impl.winograd_spec.input_ld_batch); + a_strides.set(3, data_type_size * _winograd_impl.winograd_spec.input_ld_matrix); + + const TensorShape b_shape(n, k, n_gemms); + Strides b_strides(data_type_size); + b_strides.set(1, data_type_size * _winograd_impl.winograd_spec.weight_ld_row); + b_strides.set(2, data_type_size * _winograd_impl.winograd_spec.weight_ld_matrix); + + const TensorShape d_shape(n, m, n_batches, n_gemms); + Strides d_strides(data_type_size); + d_strides.set(1, data_type_size * _winograd_impl.winograd_spec.output_ld_row); + d_strides.set(2, data_type_size * _winograd_impl.winograd_spec.output_ld_batch); + d_strides.set(3, data_type_size * _winograd_impl.winograd_spec.output_ld_matrix); + + TensorInfo a_info{}; + TensorInfo b_info{}; + TensorInfo d_info{}; + a_info.init(a_shape, 1, data_type, a_strides, 0, wds.input_matrix_size_bytes); + b_info.init(b_shape, 1, data_type, b_strides, 0, wds.weight_matrix_size_bytes); + d_info.init(d_shape, 1, data_type, d_strides, 0, wds.output_matrix_size_bytes); + + _winograd_transformed_input = a_info; + _winograd_transformed_weights = b_info; + _winograd_transformed_output = d_info; + + PermutationVector weights_permutation_vector(3U, 0U, 1U, 2U); + + // Configure the kernel to transform the input tensor from NCHW -> NHWC + if (_data_layout == DataLayout::NCHW) + { + _permute_input->configure(src, &_input_nhwc, PermutationVector(2U, 0U, 1U)); + weights_permutation_vector = PermutationVector(3U, 2U, 0U, 1U); + } + + // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map] + _permute_weights->configure(weights, &_weights_hwio, weights_permutation_vector); + + // Reorder the convoluted output to ACL's ordering NCHW + if (_data_layout == DataLayout::NCHW) + { + // configure and allocate dst tensor to be used to convert from winograd domain to spatial domain when calling to reshape_output() + TensorInfo info(TensorShape(dst->dimension(2), dst->dimension(0), dst->dimension(1), 
dst->dimension(3)), 1, + dst->data_type()); + _output_nhwc = info; + _permute_output->configure(&_output_nhwc, dst, PermutationVector(1U, 2U, 0U)); + } + + // Configure input transform kernel + _transform_input_kernel = + std::make_unique<CpuWinogradConv2dTransformInputKernel>(_winograd_impl, *_conv_args, nthreads); + + // Configure GEMM function + _gemm_function->configure(&_winograd_transformed_input, &_winograd_transformed_weights, nullptr, + &_winograd_transformed_output, 1.0f, 0.f); + + // Configure output transform kernel + _transform_output_kernel = + std::make_unique<CpuWinogradConv2dTransformOutputKernel>(_winograd_impl, *_conv_args, nthreads); + + //Configure Activation Layer + _run_activation = act_info.enabled() && !fuse_function_supported(act_info); + if (_run_activation) + { + _activation_func->configure(dst, nullptr, act_info); + } + + const auto mm_mem_req = _gemm_function->workspace(); + for (unsigned int slot = 0; slot < mm_mem_req.size(); ++slot) + { + _aux_mem[slot] = mm_mem_req[slot]; + } + + // Request temporary memory. Overlap memory needed for Input/Output transformations as they run on different non-overlapping time-steps. + _aux_mem[TransformedInput] = MemoryInfo(offset_int_vec(TransformedInput), MemoryLifetime::Temporary, + wds.input_matrix_size_bytes, storage_alignment); + _aux_mem[TransformedOutput] = MemoryInfo(offset_int_vec(TransformedOutput), MemoryLifetime::Temporary, + wds.output_matrix_size_bytes, storage_alignment); + _aux_mem[WorkspaceIO] = MemoryInfo(offset_int_vec(WorkspaceIO), MemoryLifetime::Temporary, + std::max(input_workspace_size, output_workspace_size)); + _aux_mem[PermutedWeights] = + MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Prepare, _weights_hwio.total_size()); + _aux_mem[TransformedWeights] = MemoryInfo(offset_int_vec(TransformedWeights), MemoryLifetime::Persistent, + wds.weight_matrix_size_bytes, storage_alignment); + if (_data_layout == DataLayout::NCHW) + { + _aux_mem[PermutedInput].merge(offset_int_vec(PermutedInput), src->total_size()); + _aux_mem[PermutedOutput].merge(offset_int_vec(PermutedOutput), dst->total_size()); + } + } +} +Status CpuWinogradConv2d::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, conv_info)); + + // Disable winograd for fp16 if fast math is false. 
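Note: the data-type check that follows is why F16 Winograd is only reachable when enable_fast_math is true; without it validate() accepts F32 only. A convolution dispatcher sitting above this operator would typically probe validate() and fall back to another method when Winograd declines the configuration. The sketch below illustrates that pattern only; the shapes, layer dimensions and the helper name are assumptions, not part of this patch.

// Sketch: choose Winograd only when CpuWinogradConv2d::validate() accepts the configuration.
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
#include "src/cpu/operators/CpuWinogradConv2d.h"

using namespace arm_compute;

bool winograd_applicable(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases,
                         const ITensorInfo *dst, const PadStrideInfo &conv_info, bool fast_math)
{
    const Status st =
        cpu::CpuWinogradConv2d::validate(src, weights, biases, dst, conv_info, ActivationLayerInfo(), fast_math);
    return st.error_code() == ErrorCode::OK;
}

void pick_conv_method()
{
    // Hypothetical NHWC layer (shape order C, W, H, N): 3x3 kernel, unit stride, same padding
    TensorInfo src(TensorShape(64U, 56U, 56U, 1U), 1, DataType::F32);
    TensorInfo wei(TensorShape(64U, 3U, 3U, 128U), 1, DataType::F32);
    TensorInfo dst(TensorShape(128U, 56U, 56U, 1U), 1, DataType::F32);
    src.set_data_layout(DataLayout::NHWC);
    wei.set_data_layout(DataLayout::NHWC);
    dst.set_data_layout(DataLayout::NHWC);
    const PadStrideInfo conv_info(1, 1, 1, 1);

    if (winograd_applicable(&src, &wei, nullptr, &dst, conv_info, /* fast_math */ false))
    {
        // configure CpuWinogradConv2d ...
    }
    else
    {
        // fall back to a GEMM-based convolution ...
    }
}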
+ if (!enable_fast_math) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32); + } + + const Tensor4DShape kernel_shape{internal_get_shape(weights)}; + arm_conv::winograd::WinogradImpl winograd_impl{}; + + std::unique_ptr<arm_conv::ConvolutionArgs> conv_args; + const bool success = get_winograd_kernel_implementation(src, weights, dst, conv_info, act_info, enable_fast_math, + &winograd_impl, conv_args); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(success == false, "Unsupported kernel size: %d x %d.\n", kernel_shape.n_rows, + kernel_shape.n_cols); + ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using input transform: %s\n", + winograd_impl.input_transform->get_name().c_str()); + ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using weight transform: %s\n", + winograd_impl.input_transform->get_name().c_str()); + ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using output transform: %s\n", + winograd_impl.input_transform->get_name().c_str()); + return Status{}; +} + +void CpuWinogradConv2d::run(ITensorPack &tensors) +{ + prepare(tensors); + auto src = tensors.get_const_tensor(ACL_SRC_0); + auto biases = tensors.get_const_tensor(ACL_SRC_2); + auto output = tensors.get_tensor(ACL_DST); + Window win; + + const uint32_t nthreads = NEScheduler::get().num_threads(); + + // The Winograd transform implementation does fine-grain threading inside the transforms. Just pass thread_id and nthreads. + win.set(Window::DimX, Window::Dimension(0, nthreads, 1)); + + // Wrap the winograd-domain tensorInfos created in configuration in tensors and allocate the required memory. + CpuAuxTensorHandler input_nhwc(offset_int_vec(PermutedInput), _input_nhwc, tensors, true); + CpuAuxTensorHandler winograd_input_transformed(offset_int_vec(TransformedInput), _winograd_transformed_input, + tensors, true); + CpuAuxTensorHandler input_workspace(offset_int_vec(WorkspaceIO), _input_workspace, tensors, true); + const bool is_nchw = _data_layout == DataLayout::NCHW; + if (is_nchw) + { + //Bring channels to the front as Winograd code expects the tensor to be in the format NHWC + ITensorPack pack{{ACL_SRC, src}, {ACL_DST, input_nhwc.get()}}; + _permute_input->run(pack); + } + + CpuAuxTensorHandler winograd_output_transformed(offset_int_vec(TransformedOutput), _winograd_transformed_output, + tensors, true); + CpuAuxTensorHandler output_workspace(offset_int_vec(WorkspaceIO), _output_workspace, tensors, true); + CpuAuxTensorHandler output_nhwc(offset_int_vec(PermutedOutput), _output_nhwc, tensors, true); + + ITensorPack transform_input_pack{{ACL_SRC, is_nchw ? 
input_nhwc.get() : src}, + {ACL_DST, winograd_input_transformed.get()}, + {ACL_INT, input_workspace.get()}}; + NEScheduler::get().schedule_op(_transform_input_kernel.get(), Window::DimX, win, transform_input_pack); + + CpuAuxTensorHandler winograd_weights_transformed(offset_int_vec(TransformedWeights), _winograd_transformed_weights, + tensors, true); + + // Run 16 GEMMs in multiple threads, each kernel runs one or more GEMMs + ITensorPack gemm_pack = tensors; + gemm_pack.add_const_tensor(ACL_SRC, winograd_input_transformed.get()); + gemm_pack.add_const_tensor(ACL_SRC_1, winograd_weights_transformed.get()); + gemm_pack.add_const_tensor(ACL_BIAS, nullptr); + gemm_pack.add_tensor(ACL_DST, winograd_output_transformed.get()); + _gemm_function->run(gemm_pack); + + // Output transform + ITensorPack transform_output_pack{{ACL_SRC_0, winograd_output_transformed.get()}, + {ACL_DST, is_nchw ? output_nhwc.get() : output}, + {ACL_SRC_1, biases}, + {ACL_INT, output_workspace.get()}}; + NEScheduler::get().schedule_op(_transform_output_kernel.get(), Window::DimX, win, transform_output_pack); + if (is_nchw) + { + // Reorder the convoluted output to ACL's ordering NCHW + ITensorPack pack{{ACL_SRC, output_nhwc.get()}, {ACL_DST, output}}; + _permute_output->run(pack); + } + if (_run_activation) + { + ITensorPack pack{{ACL_SRC, output}, {ACL_DST, output}}; + _activation_func->run(pack); + } +} + +void CpuWinogradConv2d::prepare(ITensorPack &tensors) +{ + if (!_is_prepared) + { + const ITensor *weights = tensors.get_const_tensor(ACL_SRC_1); + ITensor *weights_aux = + utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(PermutedWeights))); + + CpuAuxTensorHandler permuted_weights(_weights_hwio, *weights_aux); + ITensorPack permute_tensors{{ACL_SRC, weights}, {ACL_DST, permuted_weights.get()}}; + _permute_weights->run(permute_tensors); + const int element_size_in_bytes = permuted_weights.get()->info()->element_size(); + // Weights were in OHWI format, before being permuted "permuted_weights" to be in HWIO format. + const unsigned int height_idx = 3; // H in HWIO + const unsigned int width_idx = 2; // W in HWIO + const unsigned int channel_idx = 1; // I in HWIO + + const int permuted_weight_row_stride = + permuted_weights.get()->info()->strides_in_bytes()[height_idx] / element_size_in_bytes; + const int permuted_weight_col_stride = + permuted_weights.get()->info()->strides_in_bytes()[width_idx] / element_size_in_bytes; + const int permuted_weight_channel_stride = + permuted_weights.get()->info()->strides_in_bytes()[channel_idx] / element_size_in_bytes; + + // Wrap the winograd-domain transformed weight TensorInfo in Auxiliary tensor and allocate the required memory. 
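Note on the stride arithmetic computed just above in prepare(): once the weights have been permuted to HWIO, dimension 3 is H, dimension 2 is W and dimension 1 is I, and the strides handed to the weight transform are expressed in elements, hence the division by element_size(). For a hypothetical dense F32 3x3 kernel with 64 input and 128 output channels (HWIO TensorShape (128, 64, 3, 3)) the three strides work out as below; the helper is purely illustrative.

// Illustrative only: element strides of a dense HWIO weight tensor, mirroring the
// strides_in_bytes()/element_size() arithmetic used by CpuWinogradConv2d::prepare().
#include <cstddef>

struct HwioStrides
{
    std::size_t row;     // step between successive H positions
    std::size_t col;     // step between successive W positions
    std::size_t channel; // step between successive I positions
};

constexpr HwioStrides dense_hwio_strides(std::size_t O, std::size_t I, std::size_t W)
{
    return {O * I * W, O * I, O};
}

// O=128, I=64, W=3: row = 24576, col = 8192, channel = 128 elements.
static_assert(dense_hwio_strides(128, 64, 3).row == 24576, "row stride");
static_assert(dense_hwio_strides(128, 64, 3).col == 8192, "col stride");
static_assert(dense_hwio_strides(128, 64, 3).channel == 128, "channel stride");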
+ ITensor *weights_transf = + utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(TransformedWeights))); + ARM_COMPUTE_ERROR_ON_NULLPTR(weights_transf); + CpuAuxTensorHandler winograd_transformed_weights(_winograd_transformed_weights, *weights_transf); + + const void *permuted_weights_ptr; + void *win_wght_transf_ptr; + + permuted_weights_ptr = reinterpret_cast<const void *>( + permuted_weights.get()->buffer() + permuted_weights.get()->info()->offset_first_element_in_bytes()); + win_wght_transf_ptr = + reinterpret_cast<void *>(winograd_transformed_weights.get()->buffer() + + winograd_transformed_weights.get()->info()->offset_first_element_in_bytes()); + + // Prepare Weights + _winograd_impl.weight_transform->execute( + *_conv_args, permuted_weights_ptr, permuted_weight_row_stride, permuted_weight_col_stride, + permuted_weight_channel_stride, win_wght_transf_ptr, _winograd_impl.winograd_spec, 0, 1 // Thread 1 of 1 + ); + ITensorPack gemm_pack = tensors; + gemm_pack.add_const_tensor(ACL_SRC_1, winograd_transformed_weights.get()); + _gemm_function->prepare(gemm_pack); + _is_prepared = 1; + } +} +experimental::MemoryRequirements CpuWinogradConv2d::workspace() const +{ + return _aux_mem; +} + +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuWinogradConv2d.h b/src/cpu/operators/CpuWinogradConv2d.h new file mode 100644 index 0000000000..03bfc51a46 --- /dev/null +++ b/src/cpu/operators/CpuWinogradConv2d.h @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2021-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_CPU_OPERATORS_CPUWINOGRADCONV2D_H +#define ACL_SRC_CPU_OPERATORS_CPUWINOGRADCONV2D_H + +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/runtime/FunctionDescriptors.h" + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuOperator.h" +#include "src/cpu/kernels/assembly/gemm_common.hpp" +#include "src/cpu/kernels/CpuWinogradConv2dKernel.h" +#include "src/cpu/operators/CpuActivation.h" +#include "src/cpu/operators/CpuGemm.h" +#include "src/cpu/operators/CpuPermute.h" +#include "src/cpu/operators/internal/CpuGemmAssemblyDispatch.h" + +namespace arm_compute +{ +namespace cpu +{ +class CpuWinogradConv2d : public ICpuOperator +{ +public: + /** Constructor */ + CpuWinogradConv2d(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuWinogradConv2d); + /** Destructor */ + ~CpuWinogradConv2d(); + + /** Set the input and output tensors. 
+ * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:--------------|:--------------|:------|:--------------| + * |F16 |F16 |F16 |F16 | + * |F32 |F32 |F32 |F32 | + * + * @param[in] src Source tensor Info. 3 lower dimensions represent a single input [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of inputs. + * Data types supported: F16/F32. + * @param[in] weights Weights tensor Info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input. + * For supported kernel sizes, see @ref arm_compute::NEWinogradConvolutionLayer + * @param[in] biases Biases tensor Info. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights. + * @param[out] dst Destination tensor Info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. + * Data types supported: Same as @p input. + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. Currently only unit strides are supported. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation + * available which may introduce a drop of accuracy as well. Default is false + */ + void configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false); + /** Static function to check if given info will lead to a valid configuration of @ref CpuWinogradConv2d + * + * Similar to CpuWinogradConv2d::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &constants) override; + experimental::MemoryRequirements workspace() const override; + +private: + enum AuxTensorIdx + { + /** Slot 0 - 6 reserved for CpuGemm */ + TransformedInput = 7, + TransformedOutput, + WorkspaceIO, + TransformedWeights, + PermutedWeights, + Count, + PermutedInput = TransformedOutput, + PermutedOutput = TransformedInput + }; + std::unique_ptr<CpuGemm> _gemm_function; + std::unique_ptr<CpuActivation> _activation_func; + std::unique_ptr<ICPPKernel> _transform_input_kernel; + std::unique_ptr<ICPPKernel> _transform_output_kernel; + std::unique_ptr<CpuPermute> _permute_input; + std::unique_ptr<CpuPermute> _permute_output; + std::unique_ptr<CpuPermute> _permute_weights; + experimental::MemoryRequirements _aux_mem{Count}; + std::unique_ptr<arm_conv::ConvolutionArgs> + _conv_args; // Make it unique ptr because this type does not have a default constructor + arm_conv::winograd::WinogradImpl _winograd_impl; + DataLayout _data_layout; + TensorInfo _winograd_transformed_input; + TensorInfo _winograd_transformed_output; + TensorInfo _winograd_transformed_weights; + TensorInfo _input_workspace; + TensorInfo _output_workspace; + TensorInfo _weights_hwio; + TensorInfo _input_nhwc; + TensorInfo 
_output_nhwc; + bool _is_prepared; + bool _run_activation; +}; +} // namespace cpu +} // namespace arm_compute + +#endif // ACL_SRC_CPU_OPERATORS_CPUWINOGRADCONV2D_H diff --git a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp new file mode 100644 index 0000000000..a4c856bb8f --- /dev/null +++ b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp @@ -0,0 +1,1140 @@ +/* + * Copyright (c) 2018-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/operators/internal/CpuGemmAssemblyDispatch.h" + +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/core/CPP/Validate.h" +#include "src/core/helpers/MemoryHelpers.h" +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" +#include "src/core/utils/AssemblyUtils.h" +#include "src/cpu/kernels/assembly/arm_gemm.hpp" +#include "src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h" +#include "src/cpu/operators/CpuTranspose.h" +#include "src/cpu/utils/CpuAuxTensorHandler.h" + +#include <arm_neon.h> + +namespace arm_compute +{ +namespace cpu +{ +namespace +{ +/** Run pretranspose_B_array in parallel (1D static scheduling) + * + * @tparam TypeInput + * @tparam TypeOutput + * + * @param[in] gemm_asm GemmCommon kernel to run + * @param[in] dst Pretransposed B array + * @param[in] src B array to be pretransposed + * @param[in] src_ld Stride in y + * @param[in] src_multi_stride Stride in z ("multi") + * @param[in] num_threads Number of threads to run this method. 
Must be >= 1 + */ +template <typename TypeInput, typename TypeOutput> +void run_parallel_pretranspose_B_array(arm_gemm::GemmCommon<TypeInput, TypeOutput> *gemm_asm, + ITensor *dst, + const TypeInput *src, + int src_ld, + int src_multi_stride, + unsigned int num_threads, + bool transpose) +{ + ARM_COMPUTE_ERROR_ON(gemm_asm == nullptr); + ARM_COMPUTE_ERROR_ON(num_threads == 0); + // The window size is also the total workload size + const unsigned int wsize = gemm_asm->get_B_pretranspose_window_size(); + + std::vector<IScheduler::Workload> workloads(num_threads); + for (unsigned int t = 0; t < num_threads; ++t) + { + workloads[t] = [=](const ThreadInfo &info) + { + const unsigned int start = (info.thread_id * wsize) / num_threads; + const unsigned int end = ((info.thread_id + 1) * wsize) / num_threads; + + if (start < end) + { + gemm_asm->pretranspose_B_array_part(dst->buffer(), src, src_ld, src_multi_stride, transpose, start, + end); + } + }; + } + NEScheduler::get().run_tagged_workloads(workloads, "CpuGemmAssemblyDispatch/pretranspose_B_array"); +} +} // namespace + +using namespace arm_compute::experimental; + +namespace +{ +struct free_delete +{ + void operator()(void *x) + { + free(x); + } +}; + +struct Params +{ + unsigned int M; + unsigned int N; + unsigned int K; + unsigned int batches; + unsigned int multis; + unsigned int sections; + bool indirect; +}; + +Params extract_parameters(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *d, const AsmGemmInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d); + Params p; + p.M = d->tensor_shape().y(); + p.K = a->tensor_shape().x(); + p.N = d->tensor_shape().x(); + p.batches = 1; + p.multis = 1; + p.sections = 1; + p.indirect = false; + + if (info.method == AsmConvMethod::Conv || info.method == AsmConvMethod::Indirect) + { + p.indirect = true; + p.sections = b->tensor_shape()[2] * b->tensor_shape()[3]; + } + else + { + p.multis = b->tensor_shape().z(); + p.batches = d->tensor_shape().total_size_upper(2) / p.multis; + } + + // Update M in case of GEMM3D for output + if (info.depth_output_gemm3d != 0) + { + p.M = d->tensor_shape().y() * d->tensor_shape().z(); + p.batches = d->tensor_shape().total_size_upper(3) / p.multis; + } + + return p; +} + +IScheduler::Hints scheduling_hint_heuristic(arm_gemm::GemmMethod method, DataType data_type) +{ + // Schedule assembly kernel + const int granule_threshold = 200; + IScheduler::Hints scheduling_hint = IScheduler::Hints(Window::DimX); + if (method == arm_gemm::GemmMethod::GEMM_INTERLEAVED && data_type == DataType::F32) + { + scheduling_hint = IScheduler::Hints(Window::DimX, IScheduler::StrategyHint::DYNAMIC, granule_threshold); + } + else if (method == arm_gemm::GemmMethod::GEMM_INTERLEAVED_2D && + (data_type == DataType::F32 || data_type == DataType::F16 || data_type == DataType::U8 || + data_type == DataType::S8)) + { + //GEMM_INTERLEAVED supports 2D parallelism, IScheduler::split_dimensions_all signals to parallelise over all window dimensions + scheduling_hint = + IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold); + } + else if (method == arm_gemm::GemmMethod::QUANTIZE_WRAPPER_2D && + (data_type == DataType::QASYMM8 || data_type == DataType::QASYMM8_SIGNED)) + { + //special case for QASYMM8 to support 2D parallelism, scheduler here may be tweaked differently compared to FP32 case + scheduling_hint = + IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold); + } + + return 
scheduling_hint; +} + +/** Fallback in case ACL doesn't have a function */ +template <typename TypeInput, typename TypeOutput, class OutputStage = arm_gemm::Nothing> +class Fallback : public CpuGemmAssemblyDispatch::IFallback +{ +public: + /** Destructor */ + ~Fallback() = default; + + /** Initialise the functions's input and output. + * + * @param[in] a Input tensor containing the Matrix A. + * @param[in] b Input tensor containing the Matrix B. + * @param[in] c Input tensor containing the Matrix C. + * @param[out] d Output tensor to store the result of matrix multiplication. + * @param[in] args Matrix multiplication information. + * @param[in] gemm_info GEMM meta-data + * @param[in] os Output stage meta-data. + */ + void configure(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + ITensorInfo *d, + arm_gemm::GemmArgs args, + const AsmGemmInfo &gemm_info, + const OutputStage &os = {}); + + /** Set requantization shifts to be used + * + * @param[in] shifts Requantization shifts + * + * @return Pointer to the shift data + */ + /** Set requantization data to be used + * + * + * @param shifts Requantization shifts + * @param multipliers Requantization multipliers + * + * @return A tuple with the pointers to the shift and multiplier data respectively + */ + std::tuple<bool, const int32_t *, const int32_t *, const int32_t *> + set_requantize_data(const std::vector<int32_t> &shifts, const std::vector<int32_t> &multipliers); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &tensors) override; + bool is_configured() const override; + experimental::MemoryRequirements workspace() const override; + bool isVarWeightsKernel() const override + { + if (!_gemm_kernel_asm) + return false; + const arm_compute::WeightFormat wf = + assembly_utils::map_to_arm_compute_weight_format(_gemm_kernel_asm->get_config().weight_format); + return wf != arm_compute::WeightFormat::UNSPECIFIED && wf != arm_compute::WeightFormat::ANY; + } + +private: + enum AuxTensorIdx + { + AsmGemmWorkspace = 0, + PrePretransposedB, /* Transposed B (rhs) before being passed to gemm or pretranspose_B_array */ + Pretranspose, + Count + }; + + /** Configure the indirect buffer + * + * @param[in] a Input tensor containing the Matrix A. + * @param[in] b Input tensor containing the Matrix B. + * @param[out] d Output tensor to store the result of matrix multiplication. 
+ * @param[in] info GEMM meta-data + */ + void configure_indirect(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *d, const AsmGemmInfo &info); + /** Prepare the indirect buffer */ + void prepare_indirect_buffer(ITensorPack &tensors); + + /** Operator to transpose B before gemm or pretranspose_B_array*/ + std::unique_ptr<CpuTranspose> _pre_pretranspose_b{nullptr}; + /** Assembly Gemm kernel */ + std::shared_ptr<arm_gemm::GemmCommon<TypeInput, TypeOutput>> _gemm_kernel_asm{nullptr}; + /** Optimised Arm® Neon™ kernel */ + std::unique_ptr<INEKernel> _optimised_kernel{nullptr}; + /** Assembly GEMM workspace tensor info */ + TensorInfo _workspace_info{}; + /** Pre-pre-transposed B tensor info */ + TensorInfo _pre_pretransposed_b_info{}; + /** Pre-transpose tensor info */ + TensorInfo _pretranspose_info{}; + /** Prepared flag */ + bool _is_prepared{false}; + /** GEMM meta-data */ + AsmGemmInfo _gemm_info{}; + /** GEMM kernel description */ + arm_gemm::KernelDescription _kernel_info{}; + /** Per channel quantization shifts */ + std::vector<int32_t> _shifts{}; + std::vector<int32_t> right_shifts{}; + std::vector<int32_t> left_shifts{}; + /** Per channel quantization multipliers */ + std::vector<int32_t> _multipliers{}; + /** Indirect buffer */ + std::unique_ptr<const TypeInput *const *, free_delete> _indirect_arg{}; + std::unique_ptr<const TypeInput *, free_delete> _indirect_buf{}; + std::vector<TypeInput> _indirect_pad{}; + arm_gemm::ConvolutionParameters _cp{}; + experimental::MemoryRequirements _aux_mem{Count}; + bool _B_pretranspose_required{false}; + bool _is_b_constant{true}; + bool _is_c_constant{true}; + bool _run_pre_pretranspose_b{false}; + bool _B_pre_pretranspose_required{false}; +}; + +template <typename TypeInput, typename TypeOutput, class OutputStage> +std::tuple<bool, const int32_t *, const int32_t *, const int32_t *> +Fallback<TypeInput, TypeOutput, OutputStage>::set_requantize_data(const std::vector<int32_t> &shifts, + const std::vector<int32_t> &multipliers) +{ + _multipliers = multipliers; + _shifts = shifts; + bool need_left = false; + for (const auto s : _shifts) + { + left_shifts.push_back(std::max(-s, int32_t(0))); + right_shifts.push_back(std::min(-s, int32_t(0))); + if (s < 0 && !need_left) + { + need_left = true; + } + } + return std::make_tuple(need_left, left_shifts.data(), right_shifts.data(), _multipliers.data()); +} + +template <typename TypeInput, typename TypeOutput, class OutputStage> +void Fallback<TypeInput, TypeOutput, OutputStage>::prepare_indirect_buffer(ITensorPack &tensors) +{ + auto a = tensors.get_const_tensor(TensorType::ACL_SRC_0); + const TypeInput *A_ptr = reinterpret_cast<TypeInput *>(a->buffer()); + const int multis = 1; + const int batches = a->info()->tensor_shape().total_size_upper(3); + const size_t stride_A = a->info()->strides_in_bytes().y() / sizeof(TypeInput); + const size_t batch_stride_A = a->info()->strides_in_bytes()[3] / sizeof(TypeInput); + const size_t multi_stride_A = a->info()->strides_in_bytes()[4] / sizeof(TypeInput); + + const size_t output_hw = _cp.output_height * _cp.output_width; + const int batch_size = _cp.kernel_height * _cp.kernel_width * output_hw * sizeof(TypeInput); + const size_t batch_stride = batch_size / sizeof(TypeInput); + const int multi_size = batch_size * batches; + const size_t multi_stride = multi_size / sizeof(TypeInput); + + for (int64_t m = 0; m < multis; m++) + { + for (int64_t b = 0; b < batches; b++) + { + for (int64_t output_y = 0; output_y < _cp.output_height; output_y++) + { + for 
(int64_t output_x = 0; output_x < _cp.output_width; output_x++) + { + int64_t output_xy = (output_y * _cp.output_width) + output_x; + + for (int64_t kernel_y = 0; kernel_y < _cp.kernel_height; kernel_y++) + { + for (int64_t kernel_x = 0; kernel_x < _cp.kernel_width; kernel_x++) + { + int64_t input_x = (output_x * _cp.output_stride_w) + kernel_x - _cp.padding_left; + int64_t input_y = (output_y * _cp.output_stride_h) + kernel_y - _cp.padding_top; + int64_t kernel_xy = (kernel_y * _cp.kernel_width) + kernel_x; + int64_t input_xy = (input_y * _cp.input_width) + input_x; + + if (input_x < 0 || input_x >= _cp.input_width || input_y < 0 || input_y >= _cp.input_height) + { + _indirect_buf + .get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] = + _indirect_pad.data(); + } + else + { + _indirect_buf + .get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] = + A_ptr + (m * multi_stride_A + b * batch_stride_A + input_xy * stride_A); + } + } + } + } + } + } + } +} + +template <typename TypeInput, typename TypeOutput, class OutputStage> +void Fallback<TypeInput, TypeOutput, OutputStage>::configure_indirect(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *d, + const AsmGemmInfo &info) +{ + ARM_COMPUTE_ERROR_ON(!(info.method == AsmConvMethod::Conv || info.method == AsmConvMethod::Indirect)); + + float zeropad = 0.f; + if (is_data_type_quantized(a->data_type())) + { + zeropad = a->quantization_info().uniform().offset; + } + + const int64_t input_width = static_cast<int64_t>(a->tensor_shape()[1]); + const int64_t input_height = static_cast<int64_t>(a->tensor_shape()[2]); + const int64_t input_channels = static_cast<int64_t>(a->tensor_shape()[0]); + const int64_t kernel_width = static_cast<int64_t>(b->tensor_shape()[2]); + const int64_t kernel_height = static_cast<int64_t>(b->tensor_shape()[3]); + const int64_t output_width = static_cast<int64_t>(d->tensor_shape()[1]); + const int64_t output_height = static_cast<int64_t>(d->tensor_shape()[2]); + + _cp = {input_width, + input_height, + input_channels, + kernel_width, + kernel_height, + output_width, + output_height, + info.ps_info.stride().first, + info.ps_info.stride().second, + info.padding_top, + info.padding_left, + zeropad}; + + if (info.method == AsmConvMethod::Conv) + { + _gemm_kernel_asm->set_convolution_parameters(_cp); + } + + if (info.method == AsmConvMethod::Indirect) + { + const unsigned int multis = 1; + const unsigned int batches = a->tensor_shape().total_size_upper(3); + const unsigned int kernel_hw = _cp.kernel_width * _cp.kernel_height; + const unsigned int output_hw = _cp.output_width * _cp.output_height; + + using TypeInputPtr = TypeInput *; + const int batch_size = kernel_hw * output_hw * sizeof(TypeInputPtr); + const size_t batch_stride = batch_size / sizeof(TypeInputPtr); + const int multi_size = batch_size * batches; + const size_t multi_stride = multi_size / sizeof(TypeInputPtr); + + _indirect_buf = std::unique_ptr<const TypeInput *, free_delete>( + reinterpret_cast<const TypeInput **>(malloc(multi_size * multis))); + _indirect_arg = std::unique_ptr<const TypeInput *const *, free_delete>( + reinterpret_cast<const TypeInput *const **>(malloc(sizeof(TypeInput **) * kernel_hw * multis * batches))); + _indirect_pad = std::vector<TypeInput>(_cp.input_channels, TypeInput(zeropad)); + + // Set indirect argument + int64_t pos = 0; + for (int64_t m = 0; m < multis; m++) + { + for (int64_t b = 0; b < batches; b++) + { + for (int64_t kernel_xy = 0; kernel_xy < kernel_hw; 
kernel_xy++) + { + (_indirect_arg.get())[pos++] = + _indirect_buf.get() + m * multi_stride + b * batch_stride + kernel_xy * output_hw; + } + } + } + + _gemm_kernel_asm->set_indirect_parameters(a->tensor_shape()[0], _indirect_arg.get()); + } +} + +template <typename TypeInput, typename TypeOutput, class OutputStage> +void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + ITensorInfo *d, + arm_gemm::GemmArgs args, + const AsmGemmInfo &gemm_info, + const OutputStage &os) +{ + _is_b_constant = b->are_values_constant(); + _is_c_constant = c ? c->are_values_constant() : true; + + _gemm_kernel_asm = arm_gemm::gemm<TypeInput, TypeOutput, OutputStage>(args, os); + if (_gemm_kernel_asm == nullptr) + { + //configuration not supported: Leave function unconfigured: + return; + } + + arm_gemm::GemmConfig gemm_cfg = _gemm_kernel_asm->get_config(); + + // arm_compute wrapper for the Gemm object (see above) + auto acl_gemm_wrapper = std::make_unique<kernel::CpuGemmAssemblyWrapperKernel<TypeInput, TypeOutput>>(); + ARM_COMPUTE_ERROR_ON(acl_gemm_wrapper == nullptr); + acl_gemm_wrapper->configure(_gemm_kernel_asm.get(), gemm_cfg.filter); + const size_t workspace_size = _gemm_kernel_asm->get_working_size(); + const unsigned int alignment = 4096; + _workspace_info = TensorInfo(TensorShape(workspace_size), 1, DataType::U8); + _aux_mem[AsmGemmWorkspace] = + MemoryInfo(offset_int_vec(AsmGemmWorkspace), MemoryLifetime::Temporary, workspace_size, alignment); + + //if we disable this code below in brackets then ConvLayer deadlocks when threads > 1 and + //the shapes are In=1x1x1024 Weights=1x1x1024x1001 Biases=1001 Out=1x1x1001 + { + const unsigned int window_size = _gemm_kernel_asm->get_window_size().total_size(); + if (window_size < static_cast<unsigned int>(args._maxthreads)) + { + _gemm_kernel_asm->set_nthreads(window_size); + } + } + + _optimised_kernel = std::move(acl_gemm_wrapper); + _gemm_info = gemm_info; + + // Check if we need to pre-pretranspose B. Fixed format kernels need no pre-pretranspose. 
+ _B_pre_pretranspose_required = _gemm_info.transpose_b && !isVarWeightsKernel(); + _B_pretranspose_required = _gemm_kernel_asm->B_pretranspose_required(); + + const bool kernel_supports_transpose = _gemm_kernel_asm->B_pretranspose_supports_transpose(); + const bool kernel_can_fuse_transpose = _B_pretranspose_required && kernel_supports_transpose; + _run_pre_pretranspose_b = _B_pre_pretranspose_required && !kernel_can_fuse_transpose; + + if (_run_pre_pretranspose_b) + { + _pre_pretranspose_b = std::make_unique<CpuTranspose>(); + _pre_pretranspose_b->configure(b, &_pre_pretransposed_b_info); + MemoryLifetime lifetime; + if (_is_b_constant) + { + if (_B_pretranspose_required) + { + // PrePretransposedB tensor is only used in prepare(), but is then succeeded by Pretranspose + // So PrePretransposedB can be freed inside prepare() + lifetime = MemoryLifetime::Prepare; + } + else + { + // PrePretransposedB tensor is only used in prepare(), but is the final transformation of B + // So PrePretransposedB needs to persist beyond prepare() + lifetime = MemoryLifetime::Persistent; + } + } + else + { + // PrePretransposedB tensor is always used in run() and doesn't need to persist + lifetime = MemoryLifetime::Temporary; + } + // Forcing 128-byte alignment (required by 32-bit kernels) + const unsigned int alignment = 128; + _aux_mem[PrePretransposedB] = + MemoryInfo(offset_int_vec(PrePretransposedB), lifetime, _pre_pretransposed_b_info.total_size(), alignment); + } + + // Check for pre-transposed support + if (_B_pretranspose_required) + { + // Fixed format kernels need no pretranspose. + ARM_COMPUTE_ERROR_ON(arm_compute::is_fixed_format( + assembly_utils::map_to_arm_compute_weight_format(_gemm_kernel_asm->get_config().weight_format))); + // Forcing 128-byte alignment (required by 32-bit kernels) + const unsigned int alignment = 128; + const size_t B_pretranspose_size = _gemm_kernel_asm->get_B_pretransposed_array_size(); + _pretranspose_info = TensorInfo(TensorShape(B_pretranspose_size), 1, DataType::U8); + _aux_mem[Pretranspose] = + MemoryInfo(offset_int_vec(Pretranspose), MemoryLifetime::Persistent, B_pretranspose_size, alignment); + } + + // Handle indirect GEMM convolution + if (gemm_info.method == AsmConvMethod::Conv || gemm_info.method == AsmConvMethod::Indirect) + { + configure_indirect(a, b, d, gemm_info); + } + + if (std::is_same<OutputStage, arm_gemm::DequantizeFloat>::value) + { + // Output dequantization is just the two src scales multiplied together + _gemm_kernel_asm->set_dequantize_scale(a->quantization_info().uniform().scale * + b->quantization_info().uniform().scale); + } +} + +template <typename TypeInput, typename TypeOutput, class OutputStage> +void Fallback<TypeInput, TypeOutput, OutputStage>::prepare(ITensorPack &tensors) +{ + if (!_is_prepared) + { + auto b = tensors.get_const_tensor(TensorType::ACL_SRC_1); + auto c = tensors.get_const_tensor(TensorType::ACL_SRC_2); + ARM_COMPUTE_ERROR_ON_NULLPTR(b); + + // Setup up matrix bias in the assembly kernel, it's just a pointer to matrix C. 
+ if (c && c->info()->data_type() == DataType::S32) + { + _gemm_kernel_asm->set_quantized_bias( + reinterpret_cast<const int32_t *>(c->buffer() + c->info()->offset_first_element_in_bytes()), 0); + } + const ITensor *b_to_use = b; + + // Pre-pretranspose B if required + CpuAuxTensorHandler pre_pretransposed_b( + offset_int_vec(PrePretransposedB), _pre_pretransposed_b_info, tensors, + /*pack_inject: no need to inject into tensors*/ + false, + /*bypass_alloc: no need to allocate if pre-pretranspose B is not required as this handle will not be used*/ + !_run_pre_pretranspose_b); + + if (_run_pre_pretranspose_b) + { + ARM_COMPUTE_ERROR_ON(_pre_pretranspose_b == nullptr); + ITensorPack pre_pretranspose_pack{{ACL_SRC, b_to_use}, {ACL_DST, pre_pretransposed_b.get()}}; + _pre_pretranspose_b->run(pre_pretranspose_pack); + b_to_use = pre_pretransposed_b.get(); + } + + // Pretranspose B if required + if (_B_pretranspose_required) + { + // Fixed format kernels need no pretranspose. + ARM_COMPUTE_ERROR_ON(arm_compute::is_fixed_format( + assembly_utils::map_to_arm_compute_weight_format(_gemm_kernel_asm->get_config().weight_format))); + const int ldb = b_to_use->info()->strides_in_bytes().y() / b_to_use->info()->element_size(); + const auto in1_ptr = reinterpret_cast<const TypeInput *>(b_to_use->buffer() + + b_to_use->info()->offset_first_element_in_bytes()); + const int multi_stride_b = b_to_use->info()->strides_in_bytes().z() / b_to_use->info()->element_size(); + + CpuAuxTensorHandler pretranspose(offset_int_vec(Pretranspose), _pretranspose_info, tensors, false); + + ARM_COMPUTE_ERROR_ON(pretranspose.get()->buffer() == nullptr); + + const bool kernel_supports_transpose = _gemm_kernel_asm->B_pretranspose_supports_transpose(); + run_parallel_pretranspose_B_array<TypeInput, TypeOutput>( + _gemm_kernel_asm.get(), pretranspose.get(), in1_ptr, ldb, multi_stride_b, + NEScheduler::get().num_threads(), _B_pre_pretranspose_required && kernel_supports_transpose); + + b->mark_as_unused(); + // Note that we don't need to mark b_to_use as unused, as if it's been assigned to pre_pretransposed_b, + // its memory will be auto-managed by the handler + } + + if (_gemm_info.method == AsmConvMethod::Indirect) + { + prepare_indirect_buffer(tensors); + } + + _is_prepared = true; + } +} + +template <typename TypeInput, typename TypeOutput, class OutputStage> +bool Fallback<TypeInput, TypeOutput, OutputStage>::is_configured() const +{ + return _optimised_kernel != nullptr; +} + +template <typename TypeInput, typename TypeOutput, class OutputStage> +experimental::MemoryRequirements Fallback<TypeInput, TypeOutput, OutputStage>::workspace() const +{ + return _aux_mem; +} + +template <typename TypeInput, typename TypeOutput, class OutputStage> +void Fallback<TypeInput, TypeOutput, OutputStage>::run(ITensorPack &tensors) +{ + auto a = tensors.get_const_tensor(TensorType::ACL_SRC_0); + auto b = tensors.get_const_tensor(TensorType::ACL_SRC_1); + auto c = tensors.get_const_tensor(TensorType::ACL_SRC_2); + auto d = tensors.get_tensor(TensorType::ACL_DST); + ARM_COMPUTE_ERROR_ON_NULLPTR(a, d); + + // Only update at runtime if the src quantization is dynamic + if (std::is_same<OutputStage, arm_gemm::DequantizeFloat>::value && + (a->info()->quantization_info().is_dynamic() || b->info()->quantization_info().is_dynamic())) + { + // Output dequantization is just the two src scales multiplied together + _gemm_kernel_asm->set_dequantize_scale(a->info()->quantization_info().uniform().scale * + b->info()->quantization_info().uniform().scale); + } 
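+ + // Note that the leading dimensions (lda/ldb/ldd) and the batch/multi strides computed below are expressed in + // elements rather than bytes, hence the division of each byte stride by the element size. For the indirect + // convolution method the A pointer and its strides are zeroed further down, as the assembly kernel reads A + // through the prepared indirect buffer instead.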
+ + int lda = a->info()->strides_in_bytes().y() / a->info()->element_size(); + int ldb = 0; + const int ldd = d->info()->strides_in_bytes().y() / d->info()->element_size(); + + const size_t a_batch_idx = _gemm_info.reinterpret_input_as_3d != 0 ? 3 : 2; + const size_t a_multi_idx = a_batch_idx + 1; + const size_t d_batch_idx = _gemm_info.depth_output_gemm3d != 0 ? 3 : 2; + const size_t d_multi_idx = d_batch_idx + 1; + + int batch_stride_a = a->info()->strides_in_bytes()[a_batch_idx] / a->info()->element_size(); + const int batch_stride_d = d->info()->strides_in_bytes()[d_batch_idx] / d->info()->element_size(); + + int multi_stride_a = a->info()->strides_in_bytes()[a_multi_idx] / a->info()->element_size(); + int multi_stride_b = 0; + const int multi_stride_d = d->info()->strides_in_bytes()[d_multi_idx] / d->info()->element_size(); + + auto in0_ptr = reinterpret_cast<const TypeInput *>(a->buffer() + a->info()->offset_first_element_in_bytes()); + const TypeInput *in1_ptr = nullptr; + auto out_ptr = reinterpret_cast<TypeOutput *>(d->buffer() + d->info()->offset_first_element_in_bytes()); + + const ITensor *b_to_use = b; + + // Pre-pretranspose B if required + CpuAuxTensorHandler pre_pretransposed_b( + offset_int_vec(PrePretransposedB), _pre_pretransposed_b_info, tensors, + false /*pack_inject: no need to inject into tensors*/, + !_run_pre_pretranspose_b /*bypass_alloc: no need to allocate if pre-pretranspose B is not required as this handle will not be used*/); + if (b_to_use && !_is_b_constant && _run_pre_pretranspose_b) + { + ARM_COMPUTE_ERROR_ON(_pre_pretranspose_b == nullptr); + ITensorPack pre_pretranspose_pack{{ACL_SRC, b_to_use}, {ACL_DST, pre_pretransposed_b.get()}}; + _pre_pretranspose_b->run(pre_pretranspose_pack); + b_to_use = pre_pretransposed_b.get(); + } + + // Check if B is pre-tranposed and de-reference if not + if (b_to_use && !_gemm_kernel_asm->B_is_pretransposed()) + { + ldb = b_to_use->info()->strides_in_bytes().y() / b_to_use->info()->element_size(); + multi_stride_b = b_to_use->info()->strides_in_bytes().z() / b_to_use->info()->element_size(); + in1_ptr = + reinterpret_cast<const TypeInput *>(b_to_use->buffer() + b_to_use->info()->offset_first_element_in_bytes()); + } + + // If necessary, run pretranspose every time if either weights or biases are non-constant + if ((b_to_use && !_is_b_constant) || (c && !_is_c_constant && c->info()->data_type() == DataType::S32)) + { + if (c && c->info()->data_type() == DataType::S32) + { + _gemm_kernel_asm->set_quantized_bias( + reinterpret_cast<const int32_t *>(c->buffer() + c->info()->offset_first_element_in_bytes()), 0); + } + + // Pretranspose B if required + if (b_to_use && _B_pretranspose_required) + { + // Fixed format kernels need no pretranspose. 
+ ARM_COMPUTE_ERROR_ON(arm_compute::is_fixed_format( + assembly_utils::map_to_arm_compute_weight_format(_gemm_kernel_asm->get_config().weight_format))); + const int ldb = b_to_use->info()->strides_in_bytes().y() / b_to_use->info()->element_size(); + const auto b_ptr = reinterpret_cast<const TypeInput *>(b_to_use->buffer() + + b_to_use->info()->offset_first_element_in_bytes()); + const int multi_stride_b = b_to_use->info()->strides_in_bytes().z() / b_to_use->info()->element_size(); + + CpuAuxTensorHandler pretranspose(offset_int_vec(Pretranspose), _pretranspose_info, tensors, true); + ARM_COMPUTE_ERROR_ON(pretranspose.get()->buffer() == nullptr); + + if (_is_b_constant) + { + _gemm_kernel_asm->requantize_bias(pretranspose.get()->buffer(), b_ptr, ldb, multi_stride_b); + } + else + { + const bool kernel_supports_transpose = _gemm_kernel_asm->B_pretranspose_supports_transpose(); + run_parallel_pretranspose_B_array<TypeInput, TypeOutput>( + _gemm_kernel_asm.get(), pretranspose.get(), b_ptr, ldb, multi_stride_b, + NEScheduler::get().num_threads(), _B_pre_pretranspose_required && kernel_supports_transpose); + } + } + } + + const auto scheduling_hint = scheduling_hint_heuristic(_kernel_info.method, d->info()->data_type()); + + // Set workspace if needed and reset number of threads as buffer manager gets re-created with max_threads + CpuAuxTensorHandler workspace(offset_int_vec(AsmGemmWorkspace), _workspace_info, tensors, false); + if (workspace.get()->buffer() != nullptr) + { + _gemm_kernel_asm->set_working_space(reinterpret_cast<void *>(workspace.get()->buffer())); + const unsigned int split_dim = scheduling_hint.split_dimension(); + const unsigned int window_size = _gemm_kernel_asm->get_window_size().total_size(); + unsigned int num_threads = NEScheduler::get().num_threads(); + if (window_size < num_threads) + { + num_threads = window_size; + } + if (split_dim != IScheduler::split_dimensions_all) + { + // Make sure the kernel does not expect more threads than we can actually spawn + const unsigned int num_iterations = _optimised_kernel.get()->window().num_iterations(split_dim); + num_threads = std::min(num_iterations, num_threads); + } + _gemm_kernel_asm->set_nthreads(num_threads); + } + + // Prepare assembly kernel + prepare(tensors); + + // Setup up matrix bias in the assembly kernel, it's just a pointer to matrix C. 
+ TypeOutput *bias = nullptr; + if (c && c->info()->data_type() != DataType::S32) + { + bias = reinterpret_cast<TypeOutput *>(c->buffer() + c->info()->offset_first_element_in_bytes()); + } + + if (_gemm_info.method == AsmConvMethod::Indirect) + { + in0_ptr = nullptr; + lda = 0; + batch_stride_a = 0; + multi_stride_a = 0; + } + + // Set gemm parameters + _gemm_kernel_asm->set_arrays(in0_ptr, lda, batch_stride_a, multi_stride_a, in1_ptr, ldb, multi_stride_b, out_ptr, + ldd, batch_stride_d, multi_stride_d, bias, 0); + // Schedule + NEScheduler::get().schedule(_optimised_kernel.get(), scheduling_hint); +} + +template <typename TypeInput, typename TypeOutput> +void create_arm_gemm(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_gemm, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + ITensorInfo *d, + arm_gemm::Activation activation, + const AsmGemmInfo &info) +{ + Params p = extract_parameters(a, b, d, info); + const CPUInfo &ci = NEScheduler::get().cpu_info(); + unsigned int num_threads = NEScheduler::get().num_threads(); + + arm_gemm::GemmConfig cfg; + cfg.weight_format = assembly_utils::map_to_arm_gemm_weight_format(info.weight_format); + arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads, + info.fixed_format, info.fast_mode, info.accumulate, &cfg); + + // Create arm_gemm fallback + auto fallback = std::make_unique<Fallback<TypeInput, TypeOutput>>(); + fallback->configure(a, b, c, d, args, info); + arm_gemm = std::move(fallback); +} + +template <typename TypeInput, typename TypeOutput> +void create_arm_gemm_dequant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_gemm, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + ITensorInfo *d, + arm_gemm::Activation activation, + const AsmGemmInfo &info) +{ + ARM_COMPUTE_UNUSED(activation); + + Params p = extract_parameters(a, b, d, info); + const CPUInfo &ci = NEScheduler::get().cpu_info(); + const unsigned int num_threads = NEScheduler::get().num_threads(); + + arm_gemm::GemmConfig cfg; + cfg.weight_format = assembly_utils::map_to_arm_gemm_weight_format(info.weight_format); + arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads, + info.fixed_format, info.fast_mode, info.accumulate, &cfg); + + // Create arm_gemm fallback + auto fallback = std::make_unique<Fallback<TypeInput, TypeOutput, arm_gemm::DequantizeFloat>>(); + + // Configure requantization info + const GEMMLowpOutputStageInfo os_info = info.output_stage; + + arm_gemm::DequantizeFloat gemm_dequant_info{}; + gemm_dequant_info = arm_gemm::DequantizeFloat(d->quantization_info().uniform().scale); + + fallback->configure(a, b, c, d, args, info, gemm_dequant_info); + arm_gemm = std::move(fallback); +} + +template <typename TypeInput, typename TypeOutput> +void create_arm_gemm_quant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_gemm, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + ITensorInfo *d, + arm_gemm::Activation activation, + const AsmGemmInfo &info) +{ + ARM_COMPUTE_UNUSED(activation); + Params p = extract_parameters(a, b, d, info); + const CPUInfo &ci = NEScheduler::get().cpu_info(); + const unsigned int num_threads = NEScheduler::get().num_threads(); + + arm_gemm::GemmConfig cfg; + cfg.weight_format = assembly_utils::map_to_arm_gemm_weight_format(info.weight_format); + arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, 
num_threads, + info.fixed_format, info.fast_mode, info.accumulate, &cfg); + + // Create arm_gemm fallback + auto fallback = std::make_unique<Fallback<TypeInput, TypeOutput, arm_gemm::Requantize32>>(); + + // Configure requantization info + const int32_t negation = info.negated_offsets ? 1 : -1; + const int32_t a_offset = -a->quantization_info().uniform().offset * negation; + const int32_t b_offset = -b->quantization_info().uniform().offset * negation; + const GEMMLowpOutputStageInfo os_info = info.output_stage; + + arm_gemm::Requantize32 gemm_requant_info{}; + if (os_info.gemmlowp_shifts.size() > 1) + { + const auto requantize_data = + fallback->set_requantize_data(os_info.gemmlowp_shifts, os_info.gemmlowp_multipliers); + gemm_requant_info = arm_gemm::Requantize32( + nullptr, 0, a_offset, b_offset, os_info.gemmlowp_offset, + (std::get<0>(requantize_data)) ? std::get<1>(requantize_data) : nullptr, std::get<2>(requantize_data), + std::get<3>(requantize_data), os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound); + } + else + { + gemm_requant_info = + arm_gemm::Requantize32(nullptr, 0, a_offset, b_offset, os_info.gemmlowp_offset, -os_info.gemmlowp_shift, + os_info.gemmlowp_multiplier, os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound); + } + + // Configure fallback + fallback->configure(a, b, c, d, args, info, gemm_requant_info); + arm_gemm = std::move(fallback); +} +} //namespace + +CpuGemmAssemblyDispatch::CpuGemmAssemblyDispatch() : _arm_gemm(nullptr) +{ +} + +Status CpuGemmAssemblyDispatch::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *d, + const AsmGemmInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d); + ARM_COMPUTE_UNUSED(c); + arm_gemm::Activation act = assembly_utils::map_to_arm_gemm_activation(info.activation_info); + Params p = extract_parameters(a, b, d, info); + const CPUInfo &ci = NEScheduler::get().cpu_info(); + unsigned int num_threads = NEScheduler::get().num_threads(); + arm_gemm::GemmConfig cfg; + cfg.weight_format = assembly_utils::map_to_arm_gemm_weight_format(info.weight_format); + arm_gemm::WeightFormat arm_gemm_expected_wf = assembly_utils::map_to_arm_gemm_weight_format(expected_weight_format); + arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, act, num_threads, + info.fixed_format, info.fast_mode, info.accumulate, &cfg); + // TODO: Incorporate info.transpose_b COMPMID-6595 + switch (a->data_type()) + { + case DataType::F32: + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + !(arm_gemm::has_opt_gemm<float, float, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})), + "We could not find an optimized kernel for F32 input"); + break; +#ifdef __aarch64__ + case DataType::U8: + case DataType::QASYMM8: + if (d->data_type() == DataType::S32) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + !(arm_gemm::has_opt_gemm<uint8_t, uint32_t, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})), + "We could not find an optimized kernel for U8/QASYMM8 input and U32 output"); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + !(arm_gemm::has_opt_gemm<uint8_t, uint8_t, arm_gemm::Requantize32>(arm_gemm_expected_wf, args, {})), + "We could not find an optimized kernel for U8 input and U8 output"); + } + break; + case DataType::S8: + case DataType::QASYMM8_SIGNED: + if (d->data_type() == DataType::S32) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + !(arm_gemm::has_opt_gemm<int8_t, int32_t, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})), + "We could not 
find an optimized kernel for S8/QASYMM8_SIGNED input and S32 output"); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + !(arm_gemm::has_opt_gemm<int8_t, int8_t, arm_gemm::Requantize32>(arm_gemm_expected_wf, args, {})), + "We could not find an optimized kernel for S8 input and S8 output"); + } + break; +#endif /* __aarch64__ */ + +#if defined(ARM_COMPUTE_ENABLE_BF16) + case DataType::BFLOAT16: + { + if (d->data_type() == DataType::BFLOAT16) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + !(arm_gemm::has_opt_gemm<bfloat16, bfloat16, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})), + "We could not find an optimized kernel for BFLOAT16 input and BFLOAT16 output"); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + !(arm_gemm::has_opt_gemm<bfloat16, float, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})), + "We could not find an optimized kernel for BFLOAT16 input and F32 output"); + } + break; + } +#endif /* defined(ARM_COMPUTE_ENABLE_BF16) */ + +#if defined(ENABLE_FP16_KERNELS) + case DataType::F16: + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + !(arm_gemm::has_opt_gemm<float16_t, float16_t, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})), + "We could not find an optimized kernel for F16 input and F16 output"); + break; +#endif /* ENABLE_FP16_KERNELS */ + default: + ARM_COMPUTE_RETURN_ERROR_ON_MSG(true, "Usupported type. Could not find a kernel"); + break; + } + expected_weight_format = assembly_utils::map_to_arm_compute_weight_format(arm_gemm_expected_wf); + + return Status{}; +} + +Status CpuGemmAssemblyDispatch::validate( + const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const AsmGemmInfo &info) +{ + ARM_COMPUTE_UNUSED(c, info); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(a, b, d); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(a); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(a); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(info.reshape_b_only_on_first_run), + "Assembly kernel will not be executed when reshape_b_only_on_first_run is false"); + +#ifndef __aarch64__ + ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->element_size() == 1, "8bit integer types only supported for aarch64"); +#endif /* __aarch64__ */ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::U8, DataType::QASYMM8, + DataType::QASYMM8_SIGNED, DataType::S8, DataType::BFLOAT16, + DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + b, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::S8, + DataType::BFLOAT16, DataType::F16, DataType::F32); + if (is_data_type_quantized_per_channel(b->data_type())) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8_SIGNED, DataType::S8); + } + else if (is_fixed_format_fast_math(info.weight_format)) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(a, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(b, DataType::BFLOAT16); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b); + } + ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F32 && d->data_type() != DataType::F32, + "Only F32 output supported for F32 input"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F16 && d->data_type() != DataType::F16, + "Only F16 output supported for F16 input"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::BFLOAT16 && + (d->data_type() != DataType::F32 && d->data_type() != DataType::BFLOAT16), + "Only F32/BFLOAT16 output supported for BFLOAT16 input"); + 
ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::U8 && d->data_type() != DataType::U32, + "Only U32 output supported for U8 input"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::S8 && d->data_type() != DataType::S32, + "Only S32 output supported for S8 input"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::QASYMM8 && + (d->data_type() != DataType::QASYMM8 && d->data_type() != DataType::S32), + "Only QASYMM8/S32 output supported for QASYMM8 input"); + arm_compute::WeightFormat expected_weight_format = arm_compute::WeightFormat::UNSPECIFIED; + const Status ret = CpuGemmAssemblyDispatch::has_opt_impl(expected_weight_format, a, b, c, d, info); + if ((bool)ret && expected_weight_format != arm_compute::WeightFormat::ANY) + { + // Correctness check: if the format expected by the kernel is + // not "any", make sure that the one found matches the format + // intended by the caller. + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + (expected_weight_format != info.weight_format), + "The format expected by the kernel does not correspond with the one requested by the user."); + } + return ret; +} + +bool CpuGemmAssemblyDispatch::is_activation_supported(const ActivationLayerInfo &activation) +{ + arm_gemm::Activation act = assembly_utils::map_to_arm_gemm_activation(activation); + return act.type != arm_gemm::Activation::Type::None; +} + +void CpuGemmAssemblyDispatch::configure( + const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, const AsmGemmInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d); + arm_gemm::Activation act = assembly_utils::map_to_arm_gemm_activation(info.activation_info); + + //If we don't support a combination of data types, silently return: it is the caller's responsibility to check if configure() was successful via is_configured() + if (!CpuGemmAssemblyDispatch::validate(a, b, c, d, info)) + { + return; + } + + switch (a->data_type()) + { + case DataType::F32: + create_arm_gemm<float, float>(_arm_gemm, a, b, c, d, act, info); + break; +#ifdef __aarch64__ + case DataType::U8: + case DataType::QASYMM8: + if (d->data_type() == DataType::S32) + { + create_arm_gemm<uint8_t, uint32_t>(_arm_gemm, a, b, c, d, act, info); + } + else + { + create_arm_gemm_quant<uint8_t, uint8_t>(_arm_gemm, a, b, c, d, act, info); + } + break; + case DataType::S8: + case DataType::QASYMM8_SIGNED: + if (d->data_type() == DataType::S32) + { + create_arm_gemm<int8_t, int32_t>(_arm_gemm, a, b, c, d, act, info); + } + else if (d->data_type() == DataType::F32) + { + create_arm_gemm_dequant<int8_t, float>(_arm_gemm, a, b, c, d, act, info); + } + else + { + create_arm_gemm_quant<int8_t, int8_t>(_arm_gemm, a, b, c, d, act, info); + } + break; +#endif /* __aarch64__ */ +#if defined(ARM_COMPUTE_ENABLE_BF16) + case DataType::BFLOAT16: + if (d->data_type() == DataType::BFLOAT16) + { + create_arm_gemm<bfloat16, bfloat16>(_arm_gemm, a, b, c, d, act, info); + } + else + { + create_arm_gemm<bfloat16, float>(_arm_gemm, a, b, c, d, act, info); + } + break; +#endif /* defined(ARM_COMPUTE_ENABLE_BF16) */ +#ifdef ENABLE_FP16_KERNELS + case DataType::F16: + create_arm_gemm<float16_t, float16_t>(_arm_gemm, a, b, c, d, act, info); + break; +#endif /* ENABLE_FP16_KERNELS */ + default: + break; + } +} + +void CpuGemmAssemblyDispatch::prepare(ITensorPack &tensors) +{ + ARM_COMPUTE_ERROR_ON(_arm_gemm == nullptr); + _arm_gemm->prepare(tensors); +} + +bool CpuGemmAssemblyDispatch::is_configured() const +{ + return _arm_gemm && _arm_gemm->is_configured(); +} + +void 
CpuGemmAssemblyDispatch::run(ITensorPack &tensors) +{ + ARM_COMPUTE_ERROR_ON(_arm_gemm == nullptr); + _arm_gemm->run(tensors); +} + +experimental::MemoryRequirements CpuGemmAssemblyDispatch::workspace() const +{ + ARM_COMPUTE_ERROR_ON(_arm_gemm == nullptr); + return _arm_gemm->workspace(); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.h b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.h new file mode 100644 index 0000000000..44c5c189a5 --- /dev/null +++ b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.h @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2018-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_CPU_OPERATORS_INTERNAL_CPUGEMMASSEMBLYDISPATCH_H +#define ACL_SRC_CPU_OPERATORS_INTERNAL_CPUGEMMASSEMBLYDISPATCH_H + +#include "arm_compute/function_info/ActivationLayerInfo.h" + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuOperator.h" + +namespace arm_compute +{ +namespace cpu +{ +/* Convolution method supported by the assembly gemm interface */ +enum class AsmConvMethod +{ + Im2Col, + Indirect, + Conv +}; + +struct AsmGemmInfo +{ + AsmConvMethod method{AsmConvMethod::Im2Col}; + PadStrideInfo ps_info{}; + ActivationLayerInfo activation_info{}; + GEMMLowpOutputStageInfo output_stage{}; + bool negated_offsets{true}; + bool reinterpret_input_as_3d{false}; + bool depth_output_gemm3d{false}; + int64_t padding_top{0}; + int64_t padding_left{0}; + float padding_value{0.f}; + bool fast_mode{false}; + bool fixed_format{false}; + arm_compute::WeightFormat weight_format{arm_compute::WeightFormat::UNSPECIFIED}; + bool reshape_b_only_on_first_run{true}; + bool accumulate{false}; + /** Whether we want to perform an additional transpose of b before passing it to gemm or pretranspose_B_array + * @note This transpose b operation is also considered a form of "reshape" or "transform", so should be counted for + * by the reshape_b_only_on_first_run flag + * @note This flag will be silently ignored (assumed to be false) when the weight_format is a fixed format. 
Because + * fixed format kernels do not accept weights (B) with any prior transformations + */ + bool transpose_b{false}; +}; + +/** Assembly kernel glue */ +class CpuGemmAssemblyDispatch : public ICpuOperator +{ +public: + /** Constructor */ + CpuGemmAssemblyDispatch(); + /** Default destructor */ + ~CpuGemmAssemblyDispatch() = default; + + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmAssemblyDispatch); + + class IFallback + { + public: + virtual void run(ITensorPack &tensors) = 0; + virtual void prepare(ITensorPack &tensors) = 0; + virtual experimental::MemoryRequirements workspace() const = 0; + virtual bool is_configured() const = 0; + virtual bool isVarWeightsKernel() const = 0; + virtual ~IFallback() = default; + }; + +public: + /** If supported, create a Compute Library function, else fall back to the arm_gemm function. + * + * @note Configuring "batches" + * The shapes of @p a, @p b and @p d are arranged as follows: + * Lowest dimension <-> Highest dimension + * a: [K, M, Batch, Multi] + * b: [N, K, Multi] + * d: [N, M, Batch, Multi] + * + * The "Batch" refers to where "Batch" number of MxK slices of tensor a multiply with a single KxN slice of b + * The "Multi" refers to where "Multi" number of individual multiplications of a with b are performed + * + * E.g. the following are some example input shape configurations + * + * (1) Normal 2D gemm + * a: [K=3, M=4] + * b: [N=5, K=3] + * d: [N=5, M=4] + * + * (2) Batches of a sharing b (e.g. gemm-based batched convolution where b is the shared weights) + * a: [K=3, M=4, Batch=9] + * b: [N=5, K=3] + * d: [N=5, M=4, Batch=9] + * + * (3) "Batches" of independent gemm (e.g. batched matmul) + * a: [K=3, M=4, Batch=1, Multi=7] + * b: [N=5, K=3, Multi=7] + * d: [N=5, M=4, Batch=1, Multi=7] + * + * (4) "Batches" of independent gemm where b is also shared + * a: [K=3, M=4, Batch=4, Multi=7] + * b: [N=5, K=3, Multi=7] + * d: [N=5, M=4, Batch=4, Multi=7] + * + * @param[in] a Input tensor (Matrix A) + * @param[in] b Input tensor (Matrix B) + * @param[in] c Input tensor (Matrix C) used to pass the bias for quantized calculations + * @param[out] d Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0. + * @param[in] info GEMM meta-data + */ + void configure( + const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, const AsmGemmInfo &info); + + /** Indicates whether or not this function can be used to process the given parameters. + * + * @param[in] a Input tensor info (Matrix A) + * @param[in] b Input tensor info (Matrix B) + * @param[in] c Input tensor info (Matrix C) used to pass the bias for quantized calculations + * @param[in] d Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0. + * @param[in] info GEMM meta-data + * + * @return a status. + */ + static Status validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *d, + const AsmGemmInfo &info); + + /** Indicates whether or not there is an optimal assembly implementation that can be used to process the given parameters. + * + * This method has the same use as @ref + * NEGEMMConvolutionLayer::has_opt_impl, with the only caveat that + * the value of arm_compute::WeightFormat needs to be passed via the + * parameter info. + * + * @return a status. 
+ */ + static Status has_opt_impl(arm_compute::WeightFormat &weight_format, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *d, + const AsmGemmInfo &info); + /** Checks if activation is supported by the gemm assembly dispatcher + * + * @param[in] activation Activation to check + * + * @return True if activation is supported else false + */ + static bool is_activation_supported(const ActivationLayerInfo &activation); + /** Was the function successfully configured ? + * + * @return True if the function is configured and ready to run + */ + bool is_configured() const; + /** Indicates if the convolution executes in variable weights mode. + * + * Similar to @ref CpuGemm::isVarWeightsKernel + */ + bool isVarWeightsKernel() const + { + return _arm_gemm && _arm_gemm->isVarWeightsKernel(); + } + + // Inherited methods overridden: + void prepare(ITensorPack &tensors) override; + void run(ITensorPack &tensors) override; + experimental::MemoryRequirements workspace() const override; + +private: + std::unique_ptr<IFallback> _arm_gemm; /**< Interface for the arm_gemm fallback */ +}; +} // namespace cpu +} // namespace arm_compute +#endif // ACL_SRC_CPU_OPERATORS_INTERNAL_CPUGEMMASSEMBLYDISPATCH_H diff --git a/src/cpu/utils/CpuAuxTensorHandler.h b/src/cpu/utils/CpuAuxTensorHandler.h new file mode 100644 index 0000000000..3b980ce60b --- /dev/null +++ b/src/cpu/utils/CpuAuxTensorHandler.h @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2021, 2023-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_CPU_UTILS_CPUAUXTENSORHANDLER_H +#define ACL_SRC_CPU_UTILS_CPUAUXTENSORHANDLER_H + +#include "arm_compute/core/ITensorPack.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/runtime/Tensor.h" + +#include "src/common/utils/Log.h" +#include "support/Cast.h" + +namespace arm_compute +{ +namespace cpu +{ +/** Tensor handler to wrap and handle tensor allocations on workspace buffers + * + * @note Important: Despite the impression given by its name, the handler owns, rather than merely points to, the + * underlying tensor memory. + * + * @note About memory handling using bypass_* flags + * The bypass_alloc / bypass_import flags are meant to skip the expensive auxiliary tensor memory allocations or + * imports that are not needed during runtime, e.g. when the handler is not used at all in some branch of execution. 
+ * + * If not handled correctly, these two flags can lead to performance issues (not bypassing when we should), or memory + * bugs (bypassing when we should not). + * + * Make sure: + * + * 1. The aux tensor handlers must always be declared at the root level, or the same level as the run/prepare + * methods that potentially use them. + * + * Once the handler is destroyed (e.g. when going out of scope), the memory it owns (returned by the get() + * method) will also be destroyed. + * + * Thus it's important to ensure the handler is always in scope when it is being used by an operator / kernel. + * + * 2. The handler's bypass_alloc and bypass_import flags should always be the inverse of whether the handler is used in + * its surrounding scope by run/prepare. (This usually means being added to some tensor pack) + * + * This ensures we bypass if and only if the aux tensor is not used by the op / kernel later. + * + * + * So the general usage pattern goes like this: + * + * bool use_aux_tensor = some_condition_about_when_to_use_the_aux_tensor; + * + * CpuAuxTensorHandler aux_handler {..., !use_aux_tensor}; // bypass_alloc / bypass_import + * + * if (use_aux_tensor) + * { + * tensor_pack.add_tensor(aux_handler.get()); + * } + * op.run(tensor_pack); + */ +class CpuAuxTensorHandler +{ +public: + /** Create a temporary tensor handle, by either importing an existing tensor from a tensor pack, or allocating a + * new one. + * + * @param[in] slot_id Slot id of the tensor to be retrieved in the tensor pack + * If no such tensor exists in the tensor pack, a new tensor will be allocated. + * @param[in] info Tensor info containing requested size of the new tensor. + * If requested size is larger than the tensor retrieved from the tensor pack, + * a new tensor will be allocated. + * @param[in,out] pack Tensor pack to retrieve the old tensor. When @p pack_inject is true, the new + * tensor will also be added here. 
+ * @param[in] pack_inject In case of a newly allocated tensor, whether to add this tensor back to the + * @p pack + * @param[in] bypass_alloc Bypass allocation in case of a new tensor + * This is to prevent unnecessary memory operations when the handler object is not + * used + * @param[in] bypass_import Bypass importation in case of a retrieved tensor + * This is to prevent unnecessary memory operations when the handler object is not + * used + */ + CpuAuxTensorHandler(int slot_id, + TensorInfo &info, + ITensorPack &pack, + bool pack_inject = false, + bool bypass_alloc = false, + bool bypass_import = false) + : _tensor() + { + if (info.total_size() == 0) + { + return; + } + _tensor.allocator()->soft_init(info); + + ITensor *packed_tensor = utils::cast::polymorphic_downcast<ITensor *>(pack.get_tensor(slot_id)); + if ((packed_tensor == nullptr) || (info.total_size() > packed_tensor->info()->total_size())) + { + if (!bypass_alloc) + { + _tensor.allocator()->allocate(); + ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Allocating auxiliary tensor"); + } + + if (pack_inject) + { + pack.add_tensor(slot_id, &_tensor); + _injected_tensor_pack = &pack; + _injected_slot_id = slot_id; + } + } + else + { + if (!bypass_import) + { + _tensor.allocator()->import_memory(packed_tensor->buffer()); + } + } + } + + /** Create a temporary handle to the original tensor with a new @ref TensorInfo + * This is useful if we want to change a tensor's tensor info at run time without modifying the original tensor + * + * @param[in] info New tensor info to "assign" to @p tensor + * @param[in] tensor Tensor to be assigned a new @ref TensorInfo + * @param[in] bypass_import Bypass importing @p tensor's memory into the handler. + * This is to prevent unnecessary memory operations when the handler object is not used + */ + CpuAuxTensorHandler(TensorInfo &info, const ITensor &tensor, bool bypass_import = false) : _tensor() + { + _tensor.allocator()->soft_init(info); + if (!bypass_import) + { + ARM_COMPUTE_ERROR_ON(tensor.info() == nullptr); + if (info.total_size() <= tensor.info()->total_size()) + { + _tensor.allocator()->import_memory(tensor.buffer()); + } + } + } + + CpuAuxTensorHandler(const CpuAuxTensorHandler &) = delete; + CpuAuxTensorHandler &operator=(const CpuAuxTensorHandler) = delete; + + ~CpuAuxTensorHandler() + { + if (_injected_tensor_pack) + { + _injected_tensor_pack->remove_tensor(_injected_slot_id); + } + } + + ITensor *get() + { + return &_tensor; + } + + ITensor *operator()() + { + return &_tensor; + } + +private: + Tensor _tensor{}; + ITensorPack *_injected_tensor_pack{nullptr}; + int _injected_slot_id{TensorType::ACL_UNKNOWN}; +}; +} // namespace cpu +} // namespace arm_compute +#endif // ACL_SRC_CPU_UTILS_CPUAUXTENSORHANDLER_H |
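Below is a minimal usage sketch of CpuAuxTensorHandler, following the pattern documented in the class comment above. The operator name ExampleOp, the members _aux_info, _use_aux and _kernel_op, and the choice of the ACL_INT_0 slot are illustrative assumptions and not part of this patch; only the handler constructor, get() and ITensorPack::add_tensor() are taken from the code shown above.

    // Hypothetical run() of an operator that may need one auxiliary buffer (_aux_info prepared at configure() time).
    void ExampleOp::run(ITensorPack &tensors)
    {
        // Declared in the same scope that runs the kernel, so the memory the handler owns stays alive while used.
        // The bypass flags are the inverse of whether the aux tensor is actually needed, as the class comment advises.
        CpuAuxTensorHandler aux(TensorType::ACL_INT_0, _aux_info, tensors,
                                false /* pack_inject */, !_use_aux /* bypass_alloc */, !_use_aux /* bypass_import */);
        if (_use_aux)
        {
            tensors.add_tensor(TensorType::ACL_INT_0, aux.get());
        }
        _kernel_op->run(tensors);
    }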