Diffstat (limited to 'src/cpu')
364 files changed, 21336 insertions, 17300 deletions
diff --git a/src/cpu/CpuContext.cpp b/src/cpu/CpuContext.cpp index 7c14891ef8..b745af8229 100644 --- a/src/cpu/CpuContext.cpp +++ b/src/cpu/CpuContext.cpp @@ -24,6 +24,7 @@ #include "src/cpu/CpuContext.h" #include "arm_compute/core/CPP/CPPTypes.h" + #include "src/cpu/CpuQueue.h" #include "src/cpu/CpuTensor.h" @@ -32,7 +33,7 @@ #include <malloc.h> #if defined(_WIN64) -#define posix_memalign _aligned_realloc +#define posix_memalign _aligned_realloc #define posix_memalign_free _aligned_free #endif // defined(_WIN64) #endif // !defined(__APPLE__) && !defined(__OpenBSD__) @@ -66,7 +67,7 @@ void *default_aligned_allocate(void *user_data, size_t size, size_t alignment) size_t real_size = (rem) ? (size + alignment - rem) : size; ptr = memalign(alignment, real_size); #else /* defined(BARE_METAL) */ - if(posix_memalign(&ptr, alignment, size) != 0) + if (posix_memalign(&ptr, alignment, size) != 0) { // posix_memalign returns non-zero on failures, the return values will be // - EINVAL: wrong alignment @@ -81,17 +82,13 @@ void default_aligned_free(void *user_data, void *ptr) ARM_COMPUTE_UNUSED(user_data); free(ptr); } -static AclAllocator default_allocator = { &default_allocate, - &default_free, - &default_aligned_allocate, - &default_aligned_free, - nullptr - }; +static AclAllocator default_allocator = {&default_allocate, &default_free, &default_aligned_allocate, + &default_aligned_free, nullptr}; AllocatorWrapper populate_allocator(AclAllocator *external_allocator) { bool is_valid = (external_allocator != nullptr); - if(is_valid) + if (is_valid) { is_valid = is_valid && (external_allocator->alloc != nullptr); is_valid = is_valid && (external_allocator->free != nullptr); @@ -123,14 +120,13 @@ cpuinfo::CpuIsaInfo populate_capabilities_flags(AclTargetCapabilities external_c return isa_caps; } -CpuCapabilities populate_capabilities(AclTargetCapabilities external_caps, - int32_t max_threads) +CpuCapabilities populate_capabilities(AclTargetCapabilities external_caps, int32_t max_threads) { CpuCapabilities caps; // Populate capabilities with system information caps.cpu_info = cpuinfo::CpuInfo::build(); - if(external_caps != AclCpuCapabilitiesAuto) + if (external_caps != AclCpuCapabilitiesAuto) { cpuinfo::CpuIsaInfo isa = populate_capabilities_flags(external_caps); auto cpus = caps.cpu_info.cpus(); @@ -151,11 +147,9 @@ CpuCapabilities populate_capabilities(AclTargetCapabilities external_caps, } // namespace CpuContext::CpuContext(const AclContextOptions *options) - : IContext(Target::Cpu), - _allocator(default_allocator), - _caps(populate_capabilities(AclCpuCapabilitiesAuto, -1)) + : IContext(Target::Cpu), _allocator(default_allocator), _caps(populate_capabilities(AclCpuCapabilitiesAuto, -1)) { - if(options != nullptr) + if (options != nullptr) { _allocator = populate_allocator(options->allocator); _caps = populate_capabilities(options->capabilities, options->max_compute_units); @@ -175,7 +169,7 @@ AllocatorWrapper &CpuContext::allocator() ITensorV2 *CpuContext::create_tensor(const AclTensorDescriptor &desc, bool allocate) { CpuTensor *tensor = new CpuTensor(this, desc); - if(tensor != nullptr && allocate) + if (tensor != nullptr && allocate) { tensor->allocate(); } diff --git a/src/cpu/CpuContext.h b/src/cpu/CpuContext.h index da241ed097..0c8ae49f49 100644 --- a/src/cpu/CpuContext.h +++ b/src/cpu/CpuContext.h @@ -25,8 +25,8 @@ #define SRC_CPU_CPUCONTEXT_H #include "src/common/AllocatorWrapper.h" -#include "src/common/IContext.h" #include "src/common/cpuinfo/CpuInfo.h" +#include "src/common/IContext.h" 
namespace arm_compute { @@ -36,7 +36,7 @@ namespace cpu struct CpuCapabilities { cpuinfo::CpuInfo cpu_info{}; - int32_t max_threads{ -1 }; + int32_t max_threads{-1}; }; /** CPU context implementation class */ @@ -60,9 +60,9 @@ public: AllocatorWrapper &allocator(); // Inherrited methods overridden - ITensorV2 *create_tensor(const AclTensorDescriptor &desc, bool allocate) override; - IQueue *create_queue(const AclQueueOptions *options) override; - std::tuple<IOperator *, StatusCode> create_activation(const AclTensorDescriptor &src, + ITensorV2 *create_tensor(const AclTensorDescriptor &desc, bool allocate) override; + IQueue *create_queue(const AclQueueOptions *options) override; + std::tuple<IOperator *, StatusCode> create_activation(const AclTensorDescriptor &src, const AclTensorDescriptor &dst, const AclActivationDescriptor &act, bool is_validate) override; @@ -74,4 +74,4 @@ private: } // namespace cpu } // namespace arm_compute -#endif /* SRC_CPU_CPUCONTEXT_H */
\ No newline at end of file +#endif /* SRC_CPU_CPUCONTEXT_H */ diff --git a/src/cpu/CpuQueue.cpp b/src/cpu/CpuQueue.cpp index 0f0097b3f4..be781d6794 100644 --- a/src/cpu/CpuQueue.cpp +++ b/src/cpu/CpuQueue.cpp @@ -29,8 +29,7 @@ namespace arm_compute { namespace cpu { -CpuQueue::CpuQueue(IContext *ctx, const AclQueueOptions *options) - : IQueue(ctx) +CpuQueue::CpuQueue(IContext *ctx, const AclQueueOptions *options) : IQueue(ctx) { ARM_COMPUTE_UNUSED(options); } diff --git a/src/cpu/CpuQueue.h b/src/cpu/CpuQueue.h index 871a36c85b..b6a2be0e23 100644 --- a/src/cpu/CpuQueue.h +++ b/src/cpu/CpuQueue.h @@ -24,10 +24,10 @@ #ifndef SRC_CPU_CPUQUEUE_H #define SRC_CPU_CPUQUEUE_H -#include "src/common/IQueue.h" - #include "arm_compute/runtime/IScheduler.h" +#include "src/common/IQueue.h" + namespace arm_compute { namespace cpu diff --git a/src/cpu/CpuTensor.cpp b/src/cpu/CpuTensor.cpp index 6dd6d9c31b..59082b5350 100644 --- a/src/cpu/CpuTensor.cpp +++ b/src/cpu/CpuTensor.cpp @@ -29,8 +29,7 @@ namespace arm_compute { namespace cpu { -CpuTensor::CpuTensor(IContext *ctx, const AclTensorDescriptor &desc) - : ITensorV2(ctx), _legacy_tensor() +CpuTensor::CpuTensor(IContext *ctx, const AclTensorDescriptor &desc) : ITensorV2(ctx), _legacy_tensor() { ARM_COMPUTE_ASSERT((ctx != nullptr) && (ctx->type() == Target::Cpu)); _legacy_tensor = std::make_unique<Tensor>(); @@ -41,7 +40,7 @@ void *CpuTensor::map() { ARM_COMPUTE_ASSERT(_legacy_tensor.get() != nullptr); - if(_legacy_tensor == nullptr) + if (_legacy_tensor == nullptr) { ARM_COMPUTE_LOG_ERROR_ACL("[CpuTensor:map]: Backing tensor does not exist!"); return nullptr; diff --git a/src/cpu/CpuTensor.h b/src/cpu/CpuTensor.h index b078774c99..89931e1f94 100644 --- a/src/cpu/CpuTensor.h +++ b/src/cpu/CpuTensor.h @@ -24,10 +24,10 @@ #ifndef SRC_CPU_CPUTENSOR_H #define SRC_CPU_CPUTENSOR_H -#include "src/common/ITensorV2.h" - #include "arm_compute/runtime/Tensor.h" +#include "src/common/ITensorV2.h" + namespace arm_compute { namespace cpu @@ -52,7 +52,7 @@ public: void *map() override; StatusCode unmap() override; arm_compute::ITensor *tensor() const override; - StatusCode import(void *handle, ImportMemoryType type) override; + StatusCode import(void *handle, ImportMemoryType type) override; private: std::unique_ptr<Tensor> _legacy_tensor; @@ -60,4 +60,4 @@ private: } // namespace cpu } // namespace arm_compute -#endif /* SRC_CPU_CPUTENSOR_H */
\ No newline at end of file +#endif /* SRC_CPU_CPUTENSOR_H */ diff --git a/src/cpu/CpuTypes.h b/src/cpu/CpuTypes.h index 0f7b9b6552..8726bc470a 100644 --- a/src/cpu/CpuTypes.h +++ b/src/cpu/CpuTypes.h @@ -31,6 +31,6 @@ namespace arm_compute typedef __fp16 float16_t; #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC typedef float float32_t; -} +} // namespace arm_compute #endif /* ARM_COMPUTE_CPUTYPES */ diff --git a/src/cpu/ICpuKernel.h b/src/cpu/ICpuKernel.h index 8f4106240d..bcd0cb2c70 100644 --- a/src/cpu/ICpuKernel.h +++ b/src/cpu/ICpuKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_ICPUKERNEL_H #include "arm_compute/core/CPP/ICPPKernel.h" + #include "src/cpu/kernels/CpuKernelSelectionTypes.h" namespace arm_compute @@ -34,7 +35,7 @@ namespace cpu enum class KernelSelectionType { Preferred, /**< Retrieve the best implementation available for the given Cpu ISA, ignoring the build flags */ - Supported /**< Retrieve the best implementation available for the given Cpu ISA that is supported by the current build */ + Supported /**< Retrieve the best implementation available for the given Cpu ISA that is supported by the current build */ }; template <class Derived> @@ -50,13 +51,15 @@ public: */ template <typename SelectorType> - static const auto *get_implementation(const SelectorType &selector, KernelSelectionType selection_type = KernelSelectionType::Supported) + static const auto *get_implementation(const SelectorType &selector, + KernelSelectionType selection_type = KernelSelectionType::Supported) { - using kernel_type = typename std::remove_reference<decltype(Derived::get_available_kernels())>::type::value_type; + using kernel_type = + typename std::remove_reference<decltype(Derived::get_available_kernels())>::type::value_type; - for(const auto &uk : Derived::get_available_kernels()) + for (const auto &uk : Derived::get_available_kernels()) { - if(uk.is_selected(selector) && (selection_type == KernelSelectionType::Preferred || uk.ukernel != nullptr)) + if (uk.is_selected(selector) && (selection_type == KernelSelectionType::Preferred || uk.ukernel != nullptr)) { return &uk; } diff --git a/src/cpu/kernels/CpuActivationKernel.cpp b/src/cpu/kernels/CpuActivationKernel.cpp index f4bd4e6cad..50bf672d3c 100644 --- a/src/cpu/kernels/CpuActivationKernel.cpp +++ b/src/cpu/kernels/CpuActivationKernel.cpp @@ -26,11 +26,11 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" + +#include "src/core/common/Registrars.h" #include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - -#include "src/core/common/Registrars.h" #include "src/cpu/kernels/activation/list.h" #include <array> @@ -43,126 +43,126 @@ namespace kernels { namespace { -static const std::vector<CpuActivationKernel::ActivationKernel> available_kernels = -{ +static const std::vector<CpuActivationKernel::ActivationKernel> available_kernels = { #ifdef ARM_COMPUTE_ENABLE_SVE - { - "sve2_q8_activation_lut", - [](const ActivationDataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED) && data.cpumodel == CPUModel::A510 && data.isa.sve2; }, - REGISTER_QASYMM8_SVE2(arm_compute::cpu::sve2_q8_activation_lut) - }, + {"sve2_q8_activation_lut", + [](const ActivationDataTypeISASelectorData &data) + { + return (data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED) && + data.cpumodel == CPUModel::A510 && data.isa.sve2; + }, + 
REGISTER_QASYMM8_SVE2(arm_compute::cpu::sve2_q8_activation_lut)}, #endif // ARM_COMPUTE_ENABLE_SVE #ifdef __aarch64__ - { - // Neon LUT implementantion takes precedence - "neon_q8_activation_lut", - [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED; }, - REGISTER_Q8_NEON(arm_compute::cpu::neon_q8_activation_lut) - }, + {// Neon LUT implementantion takes precedence + "neon_q8_activation_lut", + [](const ActivationDataTypeISASelectorData &data) + { return data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED; }, + REGISTER_Q8_NEON(arm_compute::cpu::neon_q8_activation_lut)}, #endif // __aarch64__ - { - "sve2_qu8_activation", - [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8 && data.isa.sve2 && data.f != ActivationLayerInfo::ActivationFunction::GELU; }, - REGISTER_QASYMM8_SVE2(arm_compute::cpu::sve2_qasymm8_activation) - }, - { - "sve2_qs8_activation", - [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED && data.isa.sve2 && data.f != ActivationLayerInfo::ActivationFunction::GELU; }, - REGISTER_QASYMM8_SIGNED_SVE2(arm_compute::cpu::sve2_qasymm8_signed_activation) - }, - { - "sve2_qs16_activation", - [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::QSYMM16 && data.isa.sve2 && data.f != ActivationLayerInfo::ActivationFunction::GELU; }, - REGISTER_QSYMM16_SVE2(arm_compute::cpu::sve2_qsymm16_activation) - }, - { - "sve_fp16_activation", - [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::F16 && data.isa.sve && data.isa.fp16 && data.f != ActivationLayerInfo::ActivationFunction::GELU; }, - REGISTER_FP16_SVE(arm_compute::cpu::sve_fp16_activation) - }, - { - "sve_fp32_activation", - [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::F32 && data.isa.sve && data.f != ActivationLayerInfo::ActivationFunction::GELU; }, - REGISTER_FP32_SVE(arm_compute::cpu::sve_fp32_activation) - }, - { - "neon_fp16_activation", - [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::F16 && data.isa.fp16; }, - REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_activation) - }, - { - "neon_fp32_activation", - [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_activation) - }, - { - "neon_qu8_activation", - [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8; }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_activation) - }, - { - "neon_qs8_activation", - [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_activation) - }, - { - "neon_qs16_activation", - [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::QSYMM16; }, - REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qsymm16_activation) - }, + {"sve2_qu8_activation", + [](const ActivationDataTypeISASelectorData &data) { + return data.dt == DataType::QASYMM8 && data.isa.sve2 && + data.f != ActivationLayerInfo::ActivationFunction::GELU; + }, + REGISTER_QASYMM8_SVE2(arm_compute::cpu::sve2_qasymm8_activation)}, + {"sve2_qs8_activation", + [](const ActivationDataTypeISASelectorData &data) + { + return data.dt == DataType::QASYMM8_SIGNED && data.isa.sve2 && + data.f != ActivationLayerInfo::ActivationFunction::GELU; + }, + 
REGISTER_QASYMM8_SIGNED_SVE2(arm_compute::cpu::sve2_qasymm8_signed_activation)}, + {"sve2_qs16_activation", + [](const ActivationDataTypeISASelectorData &data) { + return data.dt == DataType::QSYMM16 && data.isa.sve2 && + data.f != ActivationLayerInfo::ActivationFunction::GELU; + }, + REGISTER_QSYMM16_SVE2(arm_compute::cpu::sve2_qsymm16_activation)}, + {"sve_fp16_activation", + [](const ActivationDataTypeISASelectorData &data) + { + return data.dt == DataType::F16 && data.isa.sve && data.isa.fp16 && + data.f != ActivationLayerInfo::ActivationFunction::GELU; + }, + REGISTER_FP16_SVE(arm_compute::cpu::sve_fp16_activation)}, + {"sve_fp32_activation", + [](const ActivationDataTypeISASelectorData &data) + { return data.dt == DataType::F32 && data.isa.sve && data.f != ActivationLayerInfo::ActivationFunction::GELU; }, + REGISTER_FP32_SVE(arm_compute::cpu::sve_fp32_activation)}, + {"neon_fp16_activation", + [](const ActivationDataTypeISASelectorData &data) { return data.dt == DataType::F16 && data.isa.fp16; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_activation)}, + {"neon_fp32_activation", [](const ActivationDataTypeISASelectorData &data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_activation)}, + {"neon_qu8_activation", [](const ActivationDataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8; }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_activation)}, + {"neon_qs8_activation", + [](const ActivationDataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8_SIGNED; }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_activation)}, + {"neon_qs16_activation", [](const ActivationDataTypeISASelectorData &data) { return data.dt == DataType::QSYMM16; }, + REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qsymm16_activation)}, }; /* Supported activation in the 8-bit integer domain */ -static const std::array<ActivationLayerInfo::ActivationFunction, 8> qasymm8_activations = -{ - ActivationLayerInfo::ActivationFunction::RELU, - ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, - ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, - ActivationLayerInfo::ActivationFunction::LOGISTIC, - ActivationLayerInfo::ActivationFunction::TANH, - ActivationLayerInfo::ActivationFunction::HARD_SWISH, - ActivationLayerInfo::ActivationFunction::LEAKY_RELU, - ActivationLayerInfo::ActivationFunction::GELU, +static const std::array<ActivationLayerInfo::ActivationFunction, 8> qasymm8_activations = { + ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, ActivationLayerInfo::ActivationFunction::LOGISTIC, + ActivationLayerInfo::ActivationFunction::TANH, ActivationLayerInfo::ActivationFunction::HARD_SWISH, + ActivationLayerInfo::ActivationFunction::LEAKY_RELU, ActivationLayerInfo::ActivationFunction::GELU, }; /* Supported activation in the 16-bit integer domain */ -static const std::array<ActivationLayerInfo::ActivationFunction, 4> qsymm16_activations = -{ - ActivationLayerInfo::ActivationFunction::LOGISTIC, - ActivationLayerInfo::ActivationFunction::TANH, - ActivationLayerInfo::ActivationFunction::HARD_SWISH, - ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU -}; +static const std::array<ActivationLayerInfo::ActivationFunction, 4> qsymm16_activations = { + ActivationLayerInfo::ActivationFunction::LOGISTIC, ActivationLayerInfo::ActivationFunction::TANH, + 
ActivationLayerInfo::ActivationFunction::HARD_SWISH, ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU}; Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &activation_info) { ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::QSYMM16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, + DataType::QSYMM16, DataType::F16, DataType::F32); - const auto *uk = CpuActivationKernel::get_implementation(ActivationDataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_cpu_model(), CPUInfo::get().get_isa(), activation_info.activation() }); + const auto *uk = CpuActivationKernel::get_implementation(ActivationDataTypeISASelectorData{ + src->data_type(), CPUInfo::get().get_cpu_model(), CPUInfo::get().get_isa(), activation_info.activation()}); ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); - const DataType data_type = src->data_type(); - const QuantizationInfo &oq_info = (dst != nullptr) ? dst->quantization_info() : src->quantization_info(); - const ActivationLayerInfo::ActivationFunction f_act = activation_info.activation(); + const DataType data_type = src->data_type(); + const QuantizationInfo &oq_info = (dst != nullptr) ? dst->quantization_info() : src->quantization_info(); + const ActivationLayerInfo::ActivationFunction f_act = activation_info.activation(); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized_asymmetric(data_type) && (std::find(std::begin(qasymm8_activations), std::end(qasymm8_activations), f_act) == std::end(qasymm8_activations)), - "For QASYMM8 only hard swish, leaky relu, tanh, logistic, relu and lower/upper bounded relu are supported"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + is_data_type_quantized_asymmetric(data_type) && + (std::find(std::begin(qasymm8_activations), std::end(qasymm8_activations), f_act) == + std::end(qasymm8_activations)), + "For QASYMM8 only hard swish, leaky relu, tanh, logistic, relu and lower/upper bounded relu are supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized_symmetric(data_type) && (std::find(std::begin(qsymm16_activations), std::end(qsymm16_activations), f_act) == std::end(qsymm16_activations)), + ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized_symmetric(data_type) && + (std::find(std::begin(qsymm16_activations), std::end(qsymm16_activations), + f_act) == std::end(qsymm16_activations)), "For QSYMM16 only tanh and logistic are supported"); - ARM_COMPUTE_RETURN_ERROR_ON((data_type == DataType::QASYMM8 || data_type == DataType::QASYMM16) && (f_act == ActivationLayerInfo::ActivationFunction::TANH) - && (oq_info != QuantizationInfo(1.f / 128.f, 128))); - ARM_COMPUTE_RETURN_ERROR_ON((data_type == DataType::QASYMM8 || data_type == DataType::QASYMM16) && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) - && (oq_info != QuantizationInfo(1.f / 256.f, 0))); - - ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && (f_act == ActivationLayerInfo::ActivationFunction::TANH) && (oq_info != QuantizationInfo(1.f / 128.f, 0))); - ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && (oq_info != QuantizationInfo(1.f / 256.f, -128))); - - ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && (f_act == 
ActivationLayerInfo::ActivationFunction::TANH) && (oq_info != QuantizationInfo(1.f / 32768.f, 0))); - ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && (oq_info != QuantizationInfo(1.f / 32768.f, 0))); + ARM_COMPUTE_RETURN_ERROR_ON((data_type == DataType::QASYMM8 || data_type == DataType::QASYMM16) && + (f_act == ActivationLayerInfo::ActivationFunction::TANH) && + (oq_info != QuantizationInfo(1.f / 128.f, 128))); + ARM_COMPUTE_RETURN_ERROR_ON((data_type == DataType::QASYMM8 || data_type == DataType::QASYMM16) && + (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && + (oq_info != QuantizationInfo(1.f / 256.f, 0))); + + ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && + (f_act == ActivationLayerInfo::ActivationFunction::TANH) && + (oq_info != QuantizationInfo(1.f / 128.f, 0))); + ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && + (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && + (oq_info != QuantizationInfo(1.f / 256.f, -128))); + + ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && + (f_act == ActivationLayerInfo::ActivationFunction::TANH) && + (oq_info != QuantizationInfo(1.f / 32768.f, 0))); + ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && + (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && + (oq_info != QuantizationInfo(1.f / 32768.f, 0))); // Checks performed when dst is configured - if((dst != nullptr) && (dst->total_size() != 0)) + if ((dst != nullptr) && (dst->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); @@ -176,7 +176,7 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src, // Configure kernel window Window win = calculate_max_window(*src, Steps()); - if(dst != nullptr) + if (dst != nullptr) { // dst auto inizialitation if not yet initialized auto_init_if_empty(*dst, *src->clone()); @@ -185,14 +185,19 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src, return std::make_pair(Status{}, win); } #ifdef __aarch64__ -void init_lut(ActivationLayerInfo::ActivationFunction act_func, DataType data_type, - const UniformQuantizationInfo &qi_in, const UniformQuantizationInfo &qi_out, - ActivationLayerInfo::LookupTable256 &lut, float a, float b) +void init_lut(ActivationLayerInfo::ActivationFunction act_func, + DataType data_type, + const UniformQuantizationInfo &qi_in, + const UniformQuantizationInfo &qi_out, + ActivationLayerInfo::LookupTable256 &lut, + float a, + float b) { - for(size_t i = 0; i < lut.size(); ++i) + for (size_t i = 0; i < lut.size(); ++i) { - float tmp_f = (data_type == DataType::QASYMM8) ? dequantize_qasymm8(i, qi_in) : dequantize_qasymm8_signed(i, qi_in); - switch(act_func) + float tmp_f = + (data_type == DataType::QASYMM8) ? dequantize_qasymm8(i, qi_in) : dequantize_qasymm8_signed(i, qi_in); + switch (act_func) { case ActivationLayerInfo::ActivationFunction::HARD_SWISH: tmp_f = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f); @@ -246,7 +251,8 @@ void init_lut(ActivationLayerInfo::ActivationFunction act_func, DataType data_ty tmp_f = 0; break; } - lut[i] = (data_type == DataType::QASYMM8) ? quantize_qasymm8(tmp_f, qi_out) : quantize_qasymm8_signed(tmp_f, qi_out); + lut[i] = + (data_type == DataType::QASYMM8) ? 
quantize_qasymm8(tmp_f, qi_out) : quantize_qasymm8_signed(tmp_f, qi_out); } } #endif // __aarch64__ @@ -258,8 +264,9 @@ void CpuActivationKernel::configure(const ITensorInfo *src, ITensorInfo *dst, Ac ARM_COMPUTE_ERROR_ON_NULLPTR(src); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, activation_info)); - const auto uk = CpuActivationKernel::get_implementation(ActivationDataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_cpu_model(), CPUInfo::get().get_isa(), activation_info.activation() }); - if(dst != nullptr) + const auto uk = CpuActivationKernel::get_implementation(ActivationDataTypeISASelectorData{ + src->data_type(), CPUInfo::get().get_cpu_model(), CPUInfo::get().get_isa(), activation_info.activation()}); + if (dst != nullptr) { // dst auto inizialitation if not yet initialized auto_init_if_empty(*dst, *src->clone()); @@ -271,11 +278,12 @@ void CpuActivationKernel::configure(const ITensorInfo *src, ITensorInfo *dst, Ac _name = std::string("CpuActivationKernel").append("/").append(uk->name); #ifdef __aarch64__ - if(src->data_type() == DataType::QASYMM8 || src->data_type() == DataType::QASYMM8_SIGNED) + if (src->data_type() == DataType::QASYMM8 || src->data_type() == DataType::QASYMM8_SIGNED) { ActivationLayerInfo::LookupTable256 tmp_lut; - init_lut(activation_info.activation(), src->data_type(), src->quantization_info().uniform(), (dst) ? dst->quantization_info().uniform() : src->quantization_info().uniform(), - tmp_lut, activation_info.a(), activation_info.b()); + init_lut(activation_info.activation(), src->data_type(), src->quantization_info().uniform(), + (dst) ? dst->quantization_info().uniform() : src->quantization_info().uniform(), tmp_lut, + activation_info.a(), activation_info.b()); activation_info.setLookupTable256(tmp_lut); } #endif // __aarch64__ @@ -288,11 +296,13 @@ void CpuActivationKernel::configure(const ITensorInfo *src, ITensorInfo *dst, Ac ICPPKernel::configure(win); } -Status CpuActivationKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info) +Status +CpuActivationKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(act_info); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, act_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), (dst != nullptr) ? dst->clone().get() : nullptr).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(src->clone().get(), (dst != nullptr) ? dst->clone().get() : nullptr).first); return Status{}; } @@ -302,7 +312,7 @@ size_t CpuActivationKernel::get_mws(const CPUInfo &platform, size_t thread_count ARM_COMPUTE_UNUSED(thread_count); ARM_COMPUTE_UNUSED(platform); - if(_split_dimension == Window::DimX) + if (_split_dimension == Window::DimX) { // Don't split the work load too small if the tensor has been reinterpreted as 1D. // This number is loosely chosen as threading overhead in each platform varies wildly. 
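The init_lut change above precomputes the whole activation as a 256-entry table: every representable QASYMM8/QASYMM8_SIGNED value is dequantized with the input quantization info, the activation is evaluated in float, and the result is re-quantized with the output quantization info. The following is a minimal standalone sketch of that idea for an unsigned 8-bit RELU, using hypothetical dequantize_u8/quantize_u8 helpers rather than the library's own functions:

#include <algorithm>
#include <array>
#include <cmath>
#include <cstddef>
#include <cstdint>

// Hypothetical uniform quantization parameters (scale, zero point).
struct UniformQInfo { float scale; int32_t offset; };

static float dequantize_u8(uint8_t v, UniformQInfo qi)
{
    return (static_cast<int32_t>(v) - qi.offset) * qi.scale;
}

static uint8_t quantize_u8(float v, UniformQInfo qi)
{
    const int32_t q = static_cast<int32_t>(std::lround(v / qi.scale)) + qi.offset;
    return static_cast<uint8_t>(std::min<int32_t>(std::max<int32_t>(q, 0), 255));
}

// Build a 256-entry table: dequantize every possible input value, apply the
// activation in float, then re-quantize with the output quantization info.
std::array<uint8_t, 256> build_relu_lut(UniformQInfo qi_in, UniformQInfo qi_out)
{
    std::array<uint8_t, 256> lut{};
    for (size_t i = 0; i < lut.size(); ++i)
    {
        const float x = dequantize_u8(static_cast<uint8_t>(i), qi_in);
        const float y = std::max(x, 0.0f); // RELU; other activation functions plug in here
        lut[i] = quantize_u8(y, qi_out);
    }
    return lut;
}

// At run time the kernel then reduces each element to a single table load,
// dst[j] = lut[src[j]], which is why the LUT variants take precedence in the
// available_kernels table above.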
@@ -314,7 +324,7 @@ size_t CpuActivationKernel::get_mws(const CPUInfo &platform, size_t thread_count void CpuActivationKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) { // Early exit on disabled activation - if(!_act_info.enabled()) + if (!_act_info.enabled()) { return; } diff --git a/src/cpu/kernels/CpuActivationKernel.h b/src/cpu/kernels/CpuActivationKernel.h index 804407653f..4bad9fb3e8 100644 --- a/src/cpu/kernels/CpuActivationKernel.h +++ b/src/cpu/kernels/CpuActivationKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_ACTIVATION_KERNEL_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" @@ -38,7 +39,8 @@ namespace kernels class CpuActivationKernel : public ICpuKernel<CpuActivationKernel> { private: - using ActivationKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, const ActivationLayerInfo &, const Window &)>::type; + using ActivationKernelPtr = + std::add_pointer<void(const ITensor *, ITensor *, const ActivationLayerInfo &, const Window &)>::type; public: CpuActivationKernel() = default; @@ -71,7 +73,7 @@ public: size_t get_mws(const CPUInfo &platform, size_t thread_count) const override; // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; /** Get the preferred dimension in which the scheduler splits the work into multiple jobs. @@ -94,8 +96,8 @@ public: private: ActivationLayerInfo _act_info{}; - ActivationKernelPtr _run_method{ nullptr }; - size_t _split_dimension{ Window::DimY }; + ActivationKernelPtr _run_method{nullptr}; + size_t _split_dimension{Window::DimY}; std::string _name{}; }; } // namespace kernels diff --git a/src/cpu/kernels/CpuAddKernel.cpp b/src/cpu/kernels/CpuAddKernel.cpp index 2983575cb6..a990aa4715 100644 --- a/src/cpu/kernels/CpuAddKernel.cpp +++ b/src/cpu/kernels/CpuAddKernel.cpp @@ -26,19 +26,21 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" -#include "src/core/CPP/Validate.h" + #include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "src/cpu/kernels/add/list.h" + #include <array> #if defined(ENABLE_FP32_KERNELS) namespace { - static constexpr size_t default_mws_N1_fp32_neon = 24536; - static constexpr size_t default_mws_V1_fp32_neon = 40510; -} +static constexpr size_t default_mws_N1_fp32_neon = 24536; +static constexpr size_t default_mws_V1_fp32_neon = 40510; +} // namespace #endif /* ENABLE_FP32_KERNELS */ namespace arm_compute @@ -49,152 +51,82 @@ namespace kernels { namespace { -static const std::vector<CpuAddKernel::AddKernel> available_kernels = -{ - { - "neon_qu8_add_fixedpoint", - [](const CpuAddKernelDataTypeISASelectorData & data) - { - return (data.dt == DataType::QASYMM8) && data.can_use_fixedpoint; - }, - REGISTER_FP32_NEON(arm_compute::cpu::add_q8_neon_fixedpoint<uint8_t>) - }, - { - "neon_qs8_add_fixedpoint", - [](const CpuAddKernelDataTypeISASelectorData & data) - { - return (data.dt == DataType::QASYMM8_SIGNED) && data.can_use_fixedpoint; - }, - REGISTER_FP32_NEON(arm_compute::cpu::add_q8_neon_fixedpoint<int8_t>) - }, - { - "sve2_qu8_add", - [](const CpuAddKernelDataTypeISASelectorData & data) - { - return (data.dt == 
DataType::QASYMM8) && data.isa.sve2; - }, - REGISTER_QASYMM8_SVE2(arm_compute::cpu::add_qasymm8_sve2) - }, - { - "sve2_qs8_add", - [](const CpuAddKernelDataTypeISASelectorData & data) - { - return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve2; - }, - REGISTER_QASYMM8_SIGNED_SVE2(arm_compute::cpu::add_qasymm8_signed_sve2) - }, - { - "sve2_qs16_add", - [](const CpuAddKernelDataTypeISASelectorData & data) - { - return (data.dt == DataType::QSYMM16) && data.isa.sve2; - }, - REGISTER_QSYMM16_SVE2(arm_compute::cpu::add_qsymm16_sve2) - }, - { - "sve_fp32_add", - [](const CpuAddKernelDataTypeISASelectorData & data) - { - return (data.dt == DataType::F32) && data.isa.sve; - }, - REGISTER_FP32_SVE(arm_compute::cpu::add_fp32_sve) - }, - { - "sve_fp16_add", - [](const CpuAddKernelDataTypeISASelectorData & data) - { - return (data.dt == DataType::F16) && data.isa.sve && data.isa.fp16; - }, - REGISTER_FP16_SVE(arm_compute::cpu::add_fp16_sve) - }, - { - "sve_u8_add", - [](const CpuAddKernelDataTypeISASelectorData & data) - { - return (data.dt == DataType::U8) && data.isa.sve; - }, - REGISTER_INTEGER_SVE(arm_compute::cpu::add_u8_sve) - }, - { - "sve_s16_add", - [](const CpuAddKernelDataTypeISASelectorData & data) - { - return (data.dt == DataType::S16) && data.isa.sve; - }, - REGISTER_INTEGER_SVE(arm_compute::cpu::add_s16_sve) - }, - { - "sve_s32_add", - [](const CpuAddKernelDataTypeISASelectorData & data) - { - return (data.dt == DataType::S32) && data.isa.sve; - }, - REGISTER_INTEGER_SVE(arm_compute::cpu::add_s32_sve) - }, - { - "neon_fp32_add", - [](const CpuAddKernelDataTypeISASelectorData & data) { return (data.dt == DataType::F32); }, - REGISTER_FP32_NEON(arm_compute::cpu::add_fp32_neon) - }, - { - "neon_fp16_add", - [](const CpuAddKernelDataTypeISASelectorData & data) - { - return (data.dt == DataType::F16) && data.isa.fp16; - }, - REGISTER_FP16_NEON(arm_compute::cpu::add_fp16_neon) - }, - { - "neon_u8_add", - [](const CpuAddKernelDataTypeISASelectorData & data) { return (data.dt == DataType::U8); }, - REGISTER_INTEGER_NEON(arm_compute::cpu::add_u8_neon) - }, - { - "neon_s16_add", - [](const CpuAddKernelDataTypeISASelectorData & data) { return (data.dt == DataType::S16); }, - REGISTER_INTEGER_NEON(arm_compute::cpu::add_s16_neon) - }, - { - "neon_s32_add", - [](const CpuAddKernelDataTypeISASelectorData & data) { return (data.dt == DataType::S32); }, - REGISTER_INTEGER_NEON(arm_compute::cpu::add_s32_neon) - }, - { - "neon_qu8_add", - [](const CpuAddKernelDataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8); }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::add_qasymm8_neon) - }, - { - "neon_qs8_add", - [](const CpuAddKernelDataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::add_qasymm8_signed_neon) - }, - { - "neon_qs16_add", - [](const CpuAddKernelDataTypeISASelectorData & data) { return (data.dt == DataType::QSYMM16); }, - REGISTER_QSYMM16_NEON(arm_compute::cpu::add_qsymm16_neon) - } -}; - -Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst, ConvertPolicy policy) +static const std::vector<CpuAddKernel::AddKernel> available_kernels = { + {"neon_qu8_add_fixedpoint", + [](const CpuAddKernelDataTypeISASelectorData &data) + { return (data.dt == DataType::QASYMM8) && data.can_use_fixedpoint; }, + REGISTER_FP32_NEON(arm_compute::cpu::add_q8_neon_fixedpoint<uint8_t>)}, + {"neon_qs8_add_fixedpoint", + [](const CpuAddKernelDataTypeISASelectorData &data) + { 
return (data.dt == DataType::QASYMM8_SIGNED) && data.can_use_fixedpoint; }, + REGISTER_FP32_NEON(arm_compute::cpu::add_q8_neon_fixedpoint<int8_t>)}, + {"sve2_qu8_add", + [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8) && data.isa.sve2; }, + REGISTER_QASYMM8_SVE2(arm_compute::cpu::add_qasymm8_sve2)}, + {"sve2_qs8_add", + [](const CpuAddKernelDataTypeISASelectorData &data) + { return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve2; }, + REGISTER_QASYMM8_SIGNED_SVE2(arm_compute::cpu::add_qasymm8_signed_sve2)}, + {"sve2_qs16_add", + [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QSYMM16) && data.isa.sve2; }, + REGISTER_QSYMM16_SVE2(arm_compute::cpu::add_qsymm16_sve2)}, + {"sve_fp32_add", + [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::F32) && data.isa.sve; }, + REGISTER_FP32_SVE(arm_compute::cpu::add_fp32_sve)}, + {"sve_fp16_add", + [](const CpuAddKernelDataTypeISASelectorData &data) + { return (data.dt == DataType::F16) && data.isa.sve && data.isa.fp16; }, + REGISTER_FP16_SVE(arm_compute::cpu::add_fp16_sve)}, + {"sve_u8_add", + [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::U8) && data.isa.sve; }, + REGISTER_INTEGER_SVE(arm_compute::cpu::add_u8_sve)}, + {"sve_s16_add", + [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::S16) && data.isa.sve; }, + REGISTER_INTEGER_SVE(arm_compute::cpu::add_s16_sve)}, + {"sve_s32_add", + [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::S32) && data.isa.sve; }, + REGISTER_INTEGER_SVE(arm_compute::cpu::add_s32_sve)}, + {"neon_fp32_add", [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::F32); }, + REGISTER_FP32_NEON(arm_compute::cpu::add_fp32_neon)}, + {"neon_fp16_add", + [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.fp16; }, + REGISTER_FP16_NEON(arm_compute::cpu::add_fp16_neon)}, + {"neon_u8_add", [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::U8); }, + REGISTER_INTEGER_NEON(arm_compute::cpu::add_u8_neon)}, + {"neon_s16_add", [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::S16); }, + REGISTER_INTEGER_NEON(arm_compute::cpu::add_s16_neon)}, + {"neon_s32_add", [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::S32); }, + REGISTER_INTEGER_NEON(arm_compute::cpu::add_s32_neon)}, + {"neon_qu8_add", [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8); }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::add_qasymm8_neon)}, + {"neon_qs8_add", + [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED); }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::add_qasymm8_signed_neon)}, + {"neon_qs16_add", [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QSYMM16); }, + REGISTER_QSYMM16_NEON(arm_compute::cpu::add_qsymm16_neon)}}; + +Status +validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst, ConvertPolicy policy) { ARM_COMPUTE_UNUSED(policy); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src0); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, - DataType::S16, DataType::QSYMM16, DataType::F16, - DataType::S32, DataType::F32); + 
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::U8, DataType::QASYMM8, + DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16, + DataType::F16, DataType::S32, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &src1); const TensorShape out_shape = TensorShape::broadcast_shape(src0.tensor_shape(), src1.tensor_shape()); ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((src0.tensor_shape().x() != src1.tensor_shape().x()) && ((src0.data_type() != src1.data_type()) || (src0.data_type() != dst.data_type()) - || (src1.data_type() != dst.data_type())), - "Broadcasting across width is supported on configurations where all tensors have the same data type"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + (src0.tensor_shape().x() != src1.tensor_shape().x()) && + ((src0.data_type() != src1.data_type()) || (src0.data_type() != dst.data_type()) || + (src1.data_type() != dst.data_type())), + "Broadcasting across width is supported on configurations where all tensors have the same data type"); // Validate in case of configured dst - if(dst.total_size() > 0) + if (dst.total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &dst); ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0), @@ -202,8 +134,8 @@ Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, cons } const auto can_use_fixedpoint = add_q8_neon_fixedpoint_possible(&src0, &src1, &dst); - const auto uk = CpuAddKernel::get_implementation<CpuAddKernelDataTypeISASelectorData>(CpuAddKernelDataTypeISASelectorData{ src0.data_type(), - CPUInfo::get().get_isa(), can_use_fixedpoint }); + const auto uk = CpuAddKernel::get_implementation<CpuAddKernelDataTypeISASelectorData>( + CpuAddKernelDataTypeISASelectorData{src0.data_type(), CPUInfo::get().get_isa(), can_use_fixedpoint}); ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); return Status{}; @@ -215,9 +147,9 @@ void CpuAddKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, I ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst, policy)); - const auto can_use_fixedpoint = add_q8_neon_fixedpoint_possible(src0, src1, dst); - const auto uk = CpuAddKernel::get_implementation<CpuAddKernelDataTypeISASelectorData>(CpuAddKernelDataTypeISASelectorData{ src0->data_type(), - CPUInfo::get().get_isa(), can_use_fixedpoint }); + const auto can_use_fixedpoint = add_q8_neon_fixedpoint_possible(src0, src1, dst); + const auto uk = CpuAddKernel::get_implementation<CpuAddKernelDataTypeISASelectorData>( + CpuAddKernelDataTypeISASelectorData{src0->data_type(), CPUInfo::get().get_isa(), can_use_fixedpoint}); ARM_COMPUTE_ERROR_ON_NULLPTR(uk); @@ -237,7 +169,8 @@ void CpuAddKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, I ICpuKernel::configure(win); } -Status CpuAddKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy) +Status +CpuAddKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); @@ -277,14 +210,14 @@ size_t CpuAddKernel::get_mws(const CPUInfo &platform, size_t thread_count) const ARM_COMPUTE_UNUSED(thread_count); #if defined(ENABLE_FP32_KERNELS) - if(this->_run_method == &add_fp32_neon) + if (this->_run_method == 
&add_fp32_neon) { size_t mws = ICPPKernel::default_mws; - if(platform.get_cpu_model() == CPUModel::N1) + if (platform.get_cpu_model() == CPUModel::N1) { mws = default_mws_N1_fp32_neon; } - else if(platform.get_cpu_model() == CPUModel::V1) + else if (platform.get_cpu_model() == CPUModel::V1) { mws = default_mws_V1_fp32_neon; } @@ -294,7 +227,7 @@ size_t CpuAddKernel::get_mws(const CPUInfo &platform, size_t thread_count) const } // tensor is 1D or was re-interpreted as 1D - if(this->window().shape().num_dimensions() == 1) + if (this->window().shape().num_dimensions() == 1) { return mws; } @@ -307,7 +240,7 @@ size_t CpuAddKernel::get_mws(const CPUInfo &platform, size_t thread_count) const return std::max(static_cast<size_t>(1), mws); } } -#else /* ENABLE_FP32_KERNELS */ +#else /* ENABLE_FP32_KERNELS */ ARM_COMPUTE_UNUSED(platform); #endif /* ENABLE_FP32_KERNELS */ return ICPPKernel::default_mws; diff --git a/src/cpu/kernels/CpuAddKernel.h b/src/cpu/kernels/CpuAddKernel.h index 9921feabe2..4adba8bb16 100644 --- a/src/cpu/kernels/CpuAddKernel.h +++ b/src/cpu/kernels/CpuAddKernel.h @@ -37,7 +37,8 @@ namespace kernels class CpuAddKernel : public ICpuKernel<CpuAddKernel> { private: - using AddKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, ITensor *, const ConvertPolicy &, const Window &)>::type; + using AddKernelPtr = std::add_pointer<void( + const ITensor *, const ITensor *, ITensor *, const ConvertPolicy &, const Window &)>::type; public: struct AddKernel @@ -74,10 +75,11 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy); + static Status + validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; /** Return minimum workload size of the relevant kernel @@ -98,9 +100,9 @@ public: private: ConvertPolicy _policy{}; - AddKernelPtr _run_method{ nullptr }; + AddKernelPtr _run_method{nullptr}; std::string _name{}; - size_t _split_dimension{ Window::DimY }; + size_t _split_dimension{Window::DimY}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuAddMulAddKernel.cpp b/src/cpu/kernels/CpuAddMulAddKernel.cpp index b84bdd54e9..6a632e8702 100644 --- a/src/cpu/kernels/CpuAddMulAddKernel.cpp +++ b/src/cpu/kernels/CpuAddMulAddKernel.cpp @@ -27,8 +27,8 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" -#include "src/core/CPP/Validate.h" #include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "src/cpu/kernels/addmuladd/list.h" @@ -41,36 +41,28 @@ namespace kernels { namespace { -static const std::vector<CpuAddMulAddKernel::AddMulAddKernel> available_kernels = -{ +static const std::vector<CpuAddMulAddKernel::AddMulAddKernel> available_kernels = { #ifdef __aarch64__ - { - "neon_fp32_add_mul_add", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F32); }, - REGISTER_FP32_NEON(arm_compute::cpu::add_mul_add_fp32_neon) - }, - { - "neon_fp16_add_mul_add", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F16); }, - REGISTER_FP16_NEON(arm_compute::cpu::add_mul_add_fp16_neon) - 
}, - { - "neon_qasymm8_add_mul_add", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8); }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::add_mul_add_u8_neon) - }, - { - "neon_qasymm8_signed_add_mul_add", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::add_mul_add_s8_neon) - } + {"neon_fp32_add_mul_add", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32); }, + REGISTER_FP32_NEON(arm_compute::cpu::add_mul_add_fp32_neon)}, + {"neon_fp16_add_mul_add", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16); }, + REGISTER_FP16_NEON(arm_compute::cpu::add_mul_add_fp16_neon)}, + {"neon_qasymm8_add_mul_add", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8); }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::add_mul_add_u8_neon)}, + {"neon_qasymm8_signed_add_mul_add", + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED); }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::add_mul_add_s8_neon)} #endif // __aarch64__ }; -Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, - const ITensorInfo *bn_mul, const ITensorInfo *bn_add, - const ITensorInfo *add_output, const ITensorInfo *final_output, - ConvertPolicy policy, const ActivationLayerInfo &act_info) +Status validate_arguments(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *bn_mul, + const ITensorInfo *bn_add, + const ITensorInfo *add_output, + const ITensorInfo *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, bn_mul, bn_add, final_output); @@ -78,16 +70,16 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, using ActFunction = ActivationLayerInfo::ActivationFunction; const ActFunction act_func = act_info.activation(); - ARM_COMPUTE_RETURN_ERROR_ON_MSG( - (act_func != ActFunction::BOUNDED_RELU && act_func != ActFunction::RELU && act_func != ActFunction::LU_BOUNDED_RELU && act_func != ActFunction::IDENTITY), - "Only RELU Family activations, or no activation, is supported"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((act_func != ActFunction::BOUNDED_RELU && act_func != ActFunction::RELU && + act_func != ActFunction::LU_BOUNDED_RELU && act_func != ActFunction::IDENTITY), + "Only RELU Family activations, or no activation, is supported"); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input1); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2); - if(is_data_type_quantized(input1->data_type())) + if (is_data_type_quantized(input1->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bn_mul, 1, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bn_add, 1, DataType::F32); @@ -101,39 +93,47 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, input2); // No broadcasting ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mul, bn_add); ARM_COMPUTE_RETURN_ERROR_ON_MSG(bn_mul->num_dimensions() != 1, "BatchNorm coefficients should be 1D array"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(bn_mul->tensor_shape()[0] != input1->tensor_shape()[0], "First dimensions of inputs and batchNorm coefs should match"); + 
ARM_COMPUTE_RETURN_ERROR_ON_MSG(bn_mul->tensor_shape()[0] != input1->tensor_shape()[0], + "First dimensions of inputs and batchNorm coefs should match"); // Validate in case we have add layer's output (intermediate) initialized - if(add_output != nullptr && add_output->total_size() > 0) + if (add_output != nullptr && add_output->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, add_output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, add_output); } // Validate in case final output has been initialized - if(final_output->total_size() > 0) + if (final_output->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, final_output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, final_output); } - const auto uk = CpuAddMulAddKernel::get_implementation<DataTypeISASelectorData>(DataTypeISASelectorData{ input1->data_type(), CPUInfo::get().get_isa() }); + const auto uk = CpuAddMulAddKernel::get_implementation<DataTypeISASelectorData>( + DataTypeISASelectorData{input1->data_type(), CPUInfo::get().get_isa()}); ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); return Status{}; } } // namespace -void CpuAddMulAddKernel::configure(const ITensorInfo *input1, const ITensorInfo *input2, - const ITensorInfo *bn_mul, const ITensorInfo *bn_add, - ITensorInfo *add_output, ITensorInfo *final_output, - ConvertPolicy policy, const ActivationLayerInfo &act_info) +void CpuAddMulAddKernel::configure(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *bn_mul, + const ITensorInfo *bn_add, + ITensorInfo *add_output, + ITensorInfo *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(bn_mul, bn_add, input2); ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, bn_add, bn_mul, final_output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info)); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info)); - const auto uk = CpuAddMulAddKernel::get_implementation<DataTypeISASelectorData>(DataTypeISASelectorData{ input1->data_type(), CPUInfo::get().get_isa() }); + const auto uk = CpuAddMulAddKernel::get_implementation<DataTypeISASelectorData>( + DataTypeISASelectorData{input1->data_type(), CPUInfo::get().get_isa()}); ARM_COMPUTE_ERROR_ON_NULLPTR(uk); ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr); @@ -146,7 +146,7 @@ void CpuAddMulAddKernel::configure(const ITensorInfo *input1, const ITensorInfo set_shape_if_empty(*final_output, input1->tensor_shape()); set_data_type_if_unknown(*final_output, input1->data_type()); - if(add_output != nullptr) + if (add_output != nullptr) { set_shape_if_empty(*add_output, input1->tensor_shape()); set_data_type_if_unknown(*add_output, input1->data_type()); @@ -158,14 +158,19 @@ void CpuAddMulAddKernel::configure(const ITensorInfo *input1, const ITensorInfo ICpuKernel::configure(win); } -Status CpuAddMulAddKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, - const ITensorInfo *bn_mul, const ITensorInfo *bn_add, - const ITensorInfo *add_output, const ITensorInfo *final_output, - ConvertPolicy policy, const ActivationLayerInfo &act_info) +Status CpuAddMulAddKernel::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *bn_mul, + const ITensorInfo *bn_add, + const ITensorInfo *add_output, + const ITensorInfo *final_output, + ConvertPolicy policy, + const 
ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, bn_mul, bn_add, final_output); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_arguments(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info)); return Status{}; } diff --git a/src/cpu/kernels/CpuAddMulAddKernel.h b/src/cpu/kernels/CpuAddMulAddKernel.h index 67ce6f029a..c5e31ec291 100644 --- a/src/cpu/kernels/CpuAddMulAddKernel.h +++ b/src/cpu/kernels/CpuAddMulAddKernel.h @@ -26,6 +26,7 @@ #define SRC_CPU_KERNELS_CPUADDMULADDKERNEL #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" @@ -39,8 +40,15 @@ namespace kernels class CpuAddMulAddKernel : public ICpuKernel<CpuAddMulAddKernel> { private: - using AddMulAddKernelPtr = - std::add_pointer<void(const ITensor *, const ITensor *, const ITensor *, const ITensor *, ITensor *, ITensor *, ConvertPolicy, const ActivationLayerInfo &, const Window &)>::type; + using AddMulAddKernelPtr = std::add_pointer<void(const ITensor *, + const ITensor *, + const ITensor *, + const ITensor *, + ITensor *, + ITensor *, + ConvertPolicy, + const ActivationLayerInfo &, + const Window &)>::type; public: struct AddMulAddKernel @@ -57,23 +65,31 @@ public: * Similar to @ref NEAddMulAdd::configure() * */ - void configure(const ITensorInfo *input1, const ITensorInfo *input2, - const ITensorInfo *bn_mul, const ITensorInfo *bn_add, - ITensorInfo *add_output, ITensorInfo *final_output, - ConvertPolicy policy, const ActivationLayerInfo &act_info); + void configure(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *bn_mul, + const ITensorInfo *bn_add, + ITensorInfo *add_output, + ITensorInfo *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuAddMulAddKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, - const ITensorInfo *bn_mul, const ITensorInfo *bn_add, - const ITensorInfo *add_output, const ITensorInfo *final_output, - ConvertPolicy policy, const ActivationLayerInfo &act_info); + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *bn_mul, + const ITensorInfo *bn_add, + const ITensorInfo *add_output, + const ITensorInfo *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; static const std::vector<AddMulAddKernel> &get_available_kernels(); @@ -81,7 +97,7 @@ public: private: ConvertPolicy _policy{}; ActivationLayerInfo _act_info{}; - AddMulAddKernelPtr _run_method{ nullptr }; + AddMulAddKernelPtr _run_method{nullptr}; std::string _name{}; }; } // namespace kernels diff --git a/src/cpu/kernels/CpuCastKernel.cpp b/src/cpu/kernels/CpuCastKernel.cpp index 764a1ec71c..05c7742b03 100644 --- a/src/cpu/kernels/CpuCastKernel.cpp +++ b/src/cpu/kernels/CpuCastKernel.cpp @@ -28,16 +28,16 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" + +#include "src/core/common/Registrars.h" 
#include "src/core/CPP/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "src/core/NEON/NEFixedPoint.h" #include "src/core/NEON/NEMath.h" #include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/common/Registrars.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "support/SaturateCast.h" - #include "src/cpu/kernels/cast/list.h" +#include "support/SaturateCast.h" namespace arm_compute { @@ -47,38 +47,30 @@ namespace kernels { namespace { -static const std::vector<CpuCastKernel::CastKernel> available_kernels = -{ - { - "neon_qs8_cast", - [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::QASYMM8_SIGNED && data.dst_dt == DataType::F16 && data.isa.fp16; }, - REGISTER_FP16_NEON(arm_compute::cpu::neon_qasymm8_signed_to_fp16_cast) - }, - { - "neon_qu8_cast", - [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::QASYMM8 && data.dst_dt == DataType::F16 && data.isa.fp16; }, - REGISTER_FP16_NEON(arm_compute::cpu::neon_u8_to_fp16_cast) - }, - { - "neon_u8_cast", - [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::U8 && data.dst_dt == DataType::F16 && data.isa.fp16; }, - REGISTER_FP16_NEON(arm_compute::cpu::neon_u8_to_fp16_cast) - }, - { - "neon_fp16_cast", - [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::F16 && data.isa.fp16; }, - REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_to_other_dt_cast) - }, - { - "neon_fp32_to_fp16_cast", - [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::F32 && data.dst_dt == DataType::F16 && data.isa.fp16; }, - REGISTER_FP16_NEON(arm_compute::cpu::neon_fp32_to_fp16_cast) - }, - { - "neon_s32_cast", - [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::S32 && data.dst_dt == DataType::F16 && data.isa.fp16; }, - REGISTER_FP16_NEON(arm_compute::cpu::neon_s32_to_fp16_cast) - }, +static const std::vector<CpuCastKernel::CastKernel> available_kernels = { + {"neon_qs8_cast", + [](const CastDataTypeISASelectorData &data) + { return data.src_dt == DataType::QASYMM8_SIGNED && data.dst_dt == DataType::F16 && data.isa.fp16; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_qasymm8_signed_to_fp16_cast)}, + {"neon_qu8_cast", + [](const CastDataTypeISASelectorData &data) + { return data.src_dt == DataType::QASYMM8 && data.dst_dt == DataType::F16 && data.isa.fp16; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_u8_to_fp16_cast)}, + {"neon_u8_cast", + [](const CastDataTypeISASelectorData &data) + { return data.src_dt == DataType::U8 && data.dst_dt == DataType::F16 && data.isa.fp16; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_u8_to_fp16_cast)}, + {"neon_fp16_cast", + [](const CastDataTypeISASelectorData &data) { return data.src_dt == DataType::F16 && data.isa.fp16; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_to_other_dt_cast)}, + {"neon_fp32_to_fp16_cast", + [](const CastDataTypeISASelectorData &data) + { return data.src_dt == DataType::F32 && data.dst_dt == DataType::F16 && data.isa.fp16; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_fp32_to_fp16_cast)}, + {"neon_s32_cast", + [](const CastDataTypeISASelectorData &data) + { return data.src_dt == DataType::S32 && data.dst_dt == DataType::F16 && data.isa.fp16; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_s32_to_fp16_cast)}, }; Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy) @@ -88,57 +80,67 @@ 
Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, Conver ARM_COMPUTE_UNUSED(policy); ARM_COMPUTE_RETURN_ERROR_ON(src == dst); #ifdef __aarch64__ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8, - DataType::S16, DataType::U16, DataType::F16, + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, + DataType::U8, DataType::S16, DataType::U16, DataType::F16, DataType::F32, DataType::S32, DataType::S64, DataType::U64); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8, - DataType::S16, DataType::U16, DataType::F16, + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, + DataType::U8, DataType::S16, DataType::U16, DataType::F16, DataType::U32, DataType::S32, DataType::F32, DataType::S64); #else // __aarch64__ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8, - DataType::S16, DataType::U16, DataType::F16, + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, + DataType::U8, DataType::S16, DataType::U16, DataType::F16, DataType::F32, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8, - DataType::S16, DataType::U16, DataType::F16, + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, + DataType::U8, DataType::S16, DataType::U16, DataType::F16, DataType::U32, DataType::S32, DataType::F32); #endif // __aarch64__ - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::QASYMM8_SIGNED && (dst->data_type() != DataType::S16 && dst->data_type() != DataType::S32 - && dst->data_type() != DataType::F16 && dst->data_type() != DataType::F32), + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::QASYMM8_SIGNED && + (dst->data_type() != DataType::S16 && dst->data_type() != DataType::S32 && + dst->data_type() != DataType::F16 && dst->data_type() != DataType::F32), "Only data_types supported [in] QASYMM8 -> [out] U16, S16, S32, F16, F32"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::QASYMM8 && (dst->data_type() != DataType::S16 && dst->data_type() != DataType::U16 - && dst->data_type() != DataType::S32 && dst->data_type() != DataType::F16 && dst->data_type() != DataType::F32), + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::QASYMM8 && + (dst->data_type() != DataType::S16 && dst->data_type() != DataType::U16 && + dst->data_type() != DataType::S32 && dst->data_type() != DataType::F16 && + dst->data_type() != DataType::F32), "Only data_types supported [in] QASYMM8 -> [out] U16, S16, S32, F16, F32"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::U8 && (dst->data_type() != DataType::S16 && dst->data_type() != DataType::U16 - && dst->data_type() != DataType::S32 && dst->data_type() != DataType::F16 && dst->data_type() != DataType::F32), + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::U8 && + (dst->data_type() != DataType::S16 && dst->data_type() != DataType::U16 && + dst->data_type() != DataType::S32 && dst->data_type() != DataType::F16 && + dst->data_type() != DataType::F32), "Only data_types supported [in] U8 -> [out] U16, S16, S32, F16, F32"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == 
DataType::U16 && (dst->data_type() != DataType::U8 && dst->data_type() != DataType::U32), + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::U16 && + (dst->data_type() != DataType::U8 && dst->data_type() != DataType::U32), "Only data_types supported [in] U16 -> [out] U8, U32"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S16 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::U8 && dst->data_type() != DataType::S32), + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S16 && + (dst->data_type() != DataType::QASYMM8_SIGNED && + dst->data_type() != DataType::U8 && dst->data_type() != DataType::S32), "Only data_types supported [in] S16 -> [out] U8, S32"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::F16 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::QASYMM8 - && dst->data_type() != DataType::U8 - && dst->data_type() != DataType::F32 && dst->data_type() != DataType::S32), + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::F16 && + (dst->data_type() != DataType::QASYMM8_SIGNED && + dst->data_type() != DataType::QASYMM8 && dst->data_type() != DataType::U8 && + dst->data_type() != DataType::F32 && dst->data_type() != DataType::S32), "Only data_types supported [in] F16 -> [out] QASYMM8, F32, S32, U8"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::F32 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::QASYMM8 - && dst->data_type() != DataType::F16 - && dst->data_type() != DataType::S32 && dst->data_type() != DataType::U8), + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::F32 && + (dst->data_type() != DataType::QASYMM8_SIGNED && + dst->data_type() != DataType::QASYMM8 && dst->data_type() != DataType::F16 && + dst->data_type() != DataType::S32 && dst->data_type() != DataType::U8), "Only data_types supported [in] F32 -> [out] QASYMM8, F16, S32, U8"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S32 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::QASYMM8 - && dst->data_type() != DataType::F16 - && dst->data_type() != DataType::F32 - && dst->data_type() != DataType::U8 - && dst->data_type() != DataType::S64), + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S32 && + (dst->data_type() != DataType::QASYMM8_SIGNED && + dst->data_type() != DataType::QASYMM8 && dst->data_type() != DataType::F16 && + dst->data_type() != DataType::F32 && dst->data_type() != DataType::U8 && + dst->data_type() != DataType::S64), "Only data_types supported [in] S32 -> [out] QASYMM8, F16, F32, U8, S64"); #ifdef __aarch64__ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S64 && dst->data_type() != DataType::F32, @@ -149,7 +151,7 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, Conver #endif // __aarch64__ // Validate in case of configured dst - if(dst->total_size() > 0) + if (dst->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); } @@ -193,15 +195,8 @@ inline void internal_neon_convert(const T1 *src_ptr, T2 *dst_ptr) template <> inline void internal_neon_convert<int32_t, int64_t>(const int32_t *src_ptr, int64_t *dst_ptr) { - const int32x4x4_t texels = - { - { - vld1q_s32(src_ptr), - vld1q_s32(src_ptr + 4), - vld1q_s32(src_ptr + 8), - vld1q_s32(src_ptr + 12) - } - }; + const int32x4x4_t texels = { + {vld1q_s32(src_ptr), vld1q_s32(src_ptr + 4), vld1q_s32(src_ptr + 8), vld1q_s32(src_ptr + 12)}}; 
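// Note (annotation on the int32_t -> int64_t specialisation above): each int32x4_t held in
// `texels` is split with vget_low_s32 / vget_high_s32 and sign-extended by vmovl_s32 into an
// int64x2_t, so every vst1q_s64 below writes two widened elements and the sixteen loaded
// lanes are stored out two at a time.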
vst1q_s64(dst_ptr, vmovl_s32(vget_low_s32(texels.val[0]))); vst1q_s64(dst_ptr + 2, vmovl_s32(vget_high_s32(texels.val[0]))); vst1q_s64(dst_ptr + 4, vmovl_s32(vget_low_s32(texels.val[1]))); @@ -215,33 +210,14 @@ inline void internal_neon_convert<int32_t, int64_t>(const int32_t *src_ptr, int6 template <> inline void internal_neon_convert<int64_t, float>(const int64_t *src_ptr, float *dst_ptr) { - const float64x2x4_t texels0 = - { - { - vcvtq_f64_s64(vld1q_s64(src_ptr)), - vcvtq_f64_s64(vld1q_s64(src_ptr + 2)), - vcvtq_f64_s64(vld1q_s64(src_ptr + 4)), - vcvtq_f64_s64(vld1q_s64(src_ptr + 6)) - } - }; - const float64x2x4_t texels1 = - { - { - vcvtq_f64_s64(vld1q_s64(src_ptr + 8)), - vcvtq_f64_s64(vld1q_s64(src_ptr + 10)), - vcvtq_f64_s64(vld1q_s64(src_ptr + 12)), - vcvtq_f64_s64(vld1q_s64(src_ptr + 14)) - } - }; - const float32x4x4_t texels = - { - { - vcombine_f32(vcvt_f32_f64(texels0.val[0]), vcvt_f32_f64(texels0.val[1])), - vcombine_f32(vcvt_f32_f64(texels0.val[2]), vcvt_f32_f64(texels0.val[3])), - vcombine_f32(vcvt_f32_f64(texels1.val[0]), vcvt_f32_f64(texels1.val[1])), - vcombine_f32(vcvt_f32_f64(texels1.val[2]), vcvt_f32_f64(texels1.val[3])) - } - }; + const float64x2x4_t texels0 = {{vcvtq_f64_s64(vld1q_s64(src_ptr)), vcvtq_f64_s64(vld1q_s64(src_ptr + 2)), + vcvtq_f64_s64(vld1q_s64(src_ptr + 4)), vcvtq_f64_s64(vld1q_s64(src_ptr + 6))}}; + const float64x2x4_t texels1 = {{vcvtq_f64_s64(vld1q_s64(src_ptr + 8)), vcvtq_f64_s64(vld1q_s64(src_ptr + 10)), + vcvtq_f64_s64(vld1q_s64(src_ptr + 12)), vcvtq_f64_s64(vld1q_s64(src_ptr + 14))}}; + const float32x4x4_t texels = {{vcombine_f32(vcvt_f32_f64(texels0.val[0]), vcvt_f32_f64(texels0.val[1])), + vcombine_f32(vcvt_f32_f64(texels0.val[2]), vcvt_f32_f64(texels0.val[3])), + vcombine_f32(vcvt_f32_f64(texels1.val[0]), vcvt_f32_f64(texels1.val[1])), + vcombine_f32(vcvt_f32_f64(texels1.val[2]), vcvt_f32_f64(texels1.val[3]))}}; vst1q_f32(dst_ptr, texels.val[0]); vst1q_f32(dst_ptr + 4, texels.val[1]); vst1q_f32(dst_ptr + 8, texels.val[2]); @@ -251,34 +227,15 @@ inline void internal_neon_convert<int64_t, float>(const int64_t *src_ptr, float template <> inline void internal_neon_convert<uint64_t, float>(const uint64_t *src_ptr, float *dst_ptr) { - const float64x2x4_t texels0 = - { - { - vcvtq_f64_u64(vld1q_u64(src_ptr)), - vcvtq_f64_u64(vld1q_u64(src_ptr + 2)), - vcvtq_f64_u64(vld1q_u64(src_ptr + 4)), - vcvtq_f64_u64(vld1q_u64(src_ptr + 6)) - } - }; - const float64x2x4_t texels1 = - { - { - vcvtq_f64_u64(vld1q_u64(src_ptr + 8)), - vcvtq_f64_u64(vld1q_u64(src_ptr + 10)), - vcvtq_f64_u64(vld1q_u64(src_ptr + 12)), - vcvtq_f64_u64(vld1q_u64(src_ptr + 14)) - } - }; + const float64x2x4_t texels0 = {{vcvtq_f64_u64(vld1q_u64(src_ptr)), vcvtq_f64_u64(vld1q_u64(src_ptr + 2)), + vcvtq_f64_u64(vld1q_u64(src_ptr + 4)), vcvtq_f64_u64(vld1q_u64(src_ptr + 6))}}; + const float64x2x4_t texels1 = {{vcvtq_f64_u64(vld1q_u64(src_ptr + 8)), vcvtq_f64_u64(vld1q_u64(src_ptr + 10)), + vcvtq_f64_u64(vld1q_u64(src_ptr + 12)), vcvtq_f64_u64(vld1q_u64(src_ptr + 14))}}; - const float32x4x4_t texels = - { - { - vcombine_f32(vcvt_f32_f64(texels0.val[0]), vcvt_f32_f64(texels0.val[1])), - vcombine_f32(vcvt_f32_f64(texels0.val[2]), vcvt_f32_f64(texels0.val[3])), - vcombine_f32(vcvt_f32_f64(texels1.val[0]), vcvt_f32_f64(texels1.val[1])), - vcombine_f32(vcvt_f32_f64(texels1.val[2]), vcvt_f32_f64(texels1.val[3])) - } - }; + const float32x4x4_t texels = {{vcombine_f32(vcvt_f32_f64(texels0.val[0]), vcvt_f32_f64(texels0.val[1])), + vcombine_f32(vcvt_f32_f64(texels0.val[2]), 
vcvt_f32_f64(texels0.val[3])), + vcombine_f32(vcvt_f32_f64(texels1.val[0]), vcvt_f32_f64(texels1.val[1])), + vcombine_f32(vcvt_f32_f64(texels1.val[2]), vcvt_f32_f64(texels1.val[3]))}}; vst1q_f32(dst_ptr, texels.val[0]); vst1q_f32(dst_ptr + 4, texels.val[1]); @@ -287,23 +244,26 @@ inline void internal_neon_convert<uint64_t, float>(const uint64_t *src_ptr, floa } template <typename T1, typename T2> -inline void convert64(Iterator &src, Iterator &dst, const Window &win, int window_start_x, int window_end_x, int window_step_x) +inline void +convert64(Iterator &src, Iterator &dst, const Window &win, int window_start_x, int window_end_x, int window_step_x) { - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const T1 *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<T2 *>(dst.ptr()); - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - internal_neon_convert<T1, T2>(src_ptr + x, dst_ptr + x); - } - for(; x < window_end_x; ++x) + execute_window_loop( + win, + [&](const Coordinates &) { - *(dst_ptr + x) = static_cast<T2>(*(src_ptr + x)); - } - }, - src, dst); + const auto src_ptr = reinterpret_cast<const T1 *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<T2 *>(dst.ptr()); + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + internal_neon_convert<T1, T2>(src_ptr + x, dst_ptr + x); + } + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<T2>(*(src_ptr + x)); + } + }, + src, dst); } } // namespace #endif // __aarch64__ @@ -325,21 +285,22 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); - Window win{ window }; + Window win{window}; win.set(Window::DimX, Window::Dimension(0, 1, 1)); Iterator src(_src, win); Iterator dst(_dst, win); /*ukernel runs only when using fp16, so we validate it isn't a nullptr only before using it */ - const auto *uk = CpuCastKernel::get_implementation(CastDataTypeISASelectorData{ _src->info()->data_type(), _dst->info()->data_type(), CPUInfo::get().get_isa() }); + const auto *uk = CpuCastKernel::get_implementation( + CastDataTypeISASelectorData{_src->info()->data_type(), _dst->info()->data_type(), CPUInfo::get().get_isa()}); - switch(_src->info()->data_type()) + switch (_src->info()->data_type()) { #ifdef __aarch64__ case DataType::U64: { - switch(_dst->info()->data_type()) + switch (_dst->info()->data_type()) { case DataType::F32: { @@ -353,7 +314,7 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr } case DataType::S64: { - switch(_dst->info()->data_type()) + switch (_dst->info()->data_type()) { case DataType::F32: { @@ -369,111 +330,102 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr case DataType::QASYMM8_SIGNED: { - switch(_dst->info()->data_type()) + switch (_dst->info()->data_type()) { case DataType::S16: { /* Up-conversion QASYMM8_SIGNED -> S16 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<int16_t *>(dst.ptr()); - int x = window_start_x; - - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const int8x16_t texels_s8 = vld1q_s8(src_ptr + x); + const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<int16_t *>(dst.ptr()); + int x = 
window_start_x; - const int16x8x2_t texels = + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vmovl_s8(vget_low_s8(texels_s8)), - vmovl_s8(vget_high_s8(texels_s8)) - } - }; + const int8x16_t texels_s8 = vld1q_s8(src_ptr + x); - vst1q_s16(dst_ptr + x, texels.val[0]); - vst1q_s16(dst_ptr + x + 8, texels.val[1]); - } + const int16x8x2_t texels = { + {vmovl_s8(vget_low_s8(texels_s8)), vmovl_s8(vget_high_s8(texels_s8))}}; - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast<int16_t>(*(src_ptr + x)); - } - }, - src, dst); + vst1q_s16(dst_ptr + x, texels.val[0]); + vst1q_s16(dst_ptr + x + 8, texels.val[1]); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<int16_t>(*(src_ptr + x)); + } + }, + src, dst); break; } case DataType::S32: { /* Up-conversion QASYMM8_SIGNED -> S32 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr()); - int x = window_start_x; - - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const int8x16_t texels_s8 = vld1q_s8(src_ptr + x); + const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr()); + int x = window_start_x; - const int16x8x2_t texels = + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vmovl_s8(vget_low_s8(texels_s8)), - vmovl_s8(vget_high_s8(texels_s8)) - } - }; + const int8x16_t texels_s8 = vld1q_s8(src_ptr + x); - vst1q_s32(dst_ptr + x, vmovl_s16(vget_low_s16(texels.val[0]))); - vst1q_s32(dst_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0]))); - vst1q_s32(dst_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1]))); - vst1q_s32(dst_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1]))); - } + const int16x8x2_t texels = { + {vmovl_s8(vget_low_s8(texels_s8)), vmovl_s8(vget_high_s8(texels_s8))}}; - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x)); - } - }, - src, dst); + vst1q_s32(dst_ptr + x, vmovl_s16(vget_low_s16(texels.val[0]))); + vst1q_s32(dst_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0]))); + vst1q_s32(dst_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1]))); + vst1q_s32(dst_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1]))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x)); + } + }, + src, dst); break; } case DataType::F32: { /* Up-conversion QASYMM8_SIGNED -> F32 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<float *>(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const int8x16_t texels_s8 = vld1q_s8(src_ptr + x); + const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<float *>(dst.ptr()); - const int16x8x2_t texels = + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vmovl_s8(vget_low_s8(texels_s8)), - vmovl_s8(vget_high_s8(texels_s8)) - } - }; - vst1q_f32(dst_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0])))); - vst1q_f32(dst_ptr 
+ x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0])))); - vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1])))); - vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1])))); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast<float>(*(src_ptr + x)); - } - }, - src, dst); + const int8x16_t texels_s8 = vld1q_s8(src_ptr + x); + + const int16x8x2_t texels = { + {vmovl_s8(vget_low_s8(texels_s8)), vmovl_s8(vget_high_s8(texels_s8))}}; + vst1q_f32(dst_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0])))); + vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0])))); + vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1])))); + vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1])))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<float>(*(src_ptr + x)); + } + }, + src, dst); break; } case DataType::F16: @@ -492,111 +444,102 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr case DataType::QASYMM8: case DataType::U8: { - switch(_dst->info()->data_type()) + switch (_dst->info()->data_type()) { case DataType::S16: { /* Up-conversion U8 -> S16 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<int16_t *>(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x); + const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<int16_t *>(dst.ptr()); - const int16x8x2_t texels = + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), - vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))) - } - }; + const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x); - vst1q_s16(dst_ptr + x, texels.val[0]); - vst1q_s16(dst_ptr + x + 8, texels.val[1]); - } + const int16x8x2_t texels = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))}}; - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x)); - } - }, - src, dst); + vst1q_s16(dst_ptr + x, texels.val[0]); + vst1q_s16(dst_ptr + x + 8, texels.val[1]); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x)); + } + }, + src, dst); break; } case DataType::S32: { /* Up-conversion U8 -> S32 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x); + const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr()); - const int16x8x2_t texels = + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - 
vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), - vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))) - } - }; + const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x); - vst1q_s32(dst_ptr + x, vmovl_s16(vget_low_s16(texels.val[0]))); - vst1q_s32(dst_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0]))); - vst1q_s32(dst_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1]))); - vst1q_s32(dst_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1]))); - } + const int16x8x2_t texels = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))}}; - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast<uint32_t>(*(src_ptr + x)); - } - }, - src, dst); + vst1q_s32(dst_ptr + x, vmovl_s16(vget_low_s16(texels.val[0]))); + vst1q_s32(dst_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0]))); + vst1q_s32(dst_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1]))); + vst1q_s32(dst_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1]))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<uint32_t>(*(src_ptr + x)); + } + }, + src, dst); break; } case DataType::F32: { /* Up-conversion U8 -> F32 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<float *>(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x); + const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<float *>(dst.ptr()); - const int16x8x2_t texels = + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), - vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))) - } - }; - vst1q_f32(dst_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0])))); - vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0])))); - vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1])))); - vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1])))); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast<uint32_t>(*(src_ptr + x)); - } - }, - src, dst); + const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x); + + const int16x8x2_t texels = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))}}; + vst1q_f32(dst_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0])))); + vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0])))); + vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1])))); + vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1])))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<uint32_t>(*(src_ptr + x)); + } + }, + src, dst); break; } case DataType::F16: @@ -609,35 +552,32 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr case DataType::U16: { /* Up-conversion U8 -> U16 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr()); - const auto dst_ptr = 
reinterpret_cast<uint16_t *>(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x); + const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<uint16_t *>(dst.ptr()); - const uint16x8x2_t texels = + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vmovl_u8(vget_low_u8(texels_u8)), - vmovl_u8(vget_high_u8(texels_u8)) - } - }; + const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x); - vst1q_u16(dst_ptr + x, texels.val[0]); - vst1q_u16(dst_ptr + x + 8, texels.val[1]); - } + const uint16x8x2_t texels = { + {vmovl_u8(vget_low_u8(texels_u8)), vmovl_u8(vget_high_u8(texels_u8))}}; - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast<uint16_t>(*(src_ptr + x)); - } - }, - src, dst); + vst1q_u16(dst_ptr + x, texels.val[0]); + vst1q_u16(dst_ptr + x + 8, texels.val[1]); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<uint16_t>(*(src_ptr + x)); + } + }, + src, dst); break; } default: @@ -647,177 +587,154 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr } case DataType::S16: { - switch(_dst->info()->data_type()) + switch (_dst->info()->data_type()) { case DataType::QASYMM8_SIGNED: { /* Down-conversion S16 -> QASYMM8_SIGNED */ - if(ConvertPolicy::SATURATE == _policy) + if (ConvertPolicy::SATURATE == _policy) { - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const int16x8x2_t texels = + const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_s16(src_ptr + x), - vld1q_s16(src_ptr + x + 8) - } - }; + const int16x8x2_t texels = {{vld1q_s16(src_ptr + x), vld1q_s16(src_ptr + x + 8)}}; - vst1q_s8(dst_ptr + x, vcombine_s8(vqmovn_s16(texels.val[0]), vqmovn_s16(texels.val[1]))); - } + vst1q_s8(dst_ptr + x, + vcombine_s8(vqmovn_s16(texels.val[0]), vqmovn_s16(texels.val[1]))); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x)); - } - }, - src, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x)); + } + }, + src, dst); } else { - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const int16x8x2_t texels = + const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_s16(src_ptr + x), - vld1q_s16(src_ptr + x + 8) - } - }; + const 
int16x8x2_t texels = {{vld1q_s16(src_ptr + x), vld1q_s16(src_ptr + x + 8)}}; - vst1q_s8(dst_ptr + x, vcombine_s8(vmovn_s16(texels.val[0]), vmovn_s16(texels.val[1]))); - } + vst1q_s8(dst_ptr + x, + vcombine_s8(vmovn_s16(texels.val[0]), vmovn_s16(texels.val[1]))); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast<int8_t>(*(src_ptr + x)); - } - }, - src, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<int8_t>(*(src_ptr + x)); + } + }, + src, dst); } break; } case DataType::U8: { /* Down-conversion S16 -> U8 */ - if(ConvertPolicy::SATURATE == _policy) + if (ConvertPolicy::SATURATE == _policy) { - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const int16x8x2_t texels = + const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_s16(src_ptr + x), - vld1q_s16(src_ptr + x + 8) - } - }; + const int16x8x2_t texels = {{vld1q_s16(src_ptr + x), vld1q_s16(src_ptr + x + 8)}}; - vst1q_u8(dst_ptr + x, vcombine_u8(vqmovun_s16(texels.val[0]), vqmovun_s16(texels.val[1]))); - } + vst1q_u8(dst_ptr + x, + vcombine_u8(vqmovun_s16(texels.val[0]), vqmovun_s16(texels.val[1]))); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x)); - } - }, - src, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x)); + } + }, + src, dst); } else { - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const int16x8x2_t texels = + const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_s16(src_ptr + x), - vld1q_s16(src_ptr + x + 8) - } - }; - - vst1q_u8(dst_ptr + x, vcombine_u8(vmovn_u16(vreinterpretq_u16_s16(texels.val[0])), - vmovn_u16(vreinterpretq_u16_s16(texels.val[1])))); - } + const int16x8x2_t texels = {{vld1q_s16(src_ptr + x), vld1q_s16(src_ptr + x + 8)}}; - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x)); - } - }, - src, dst); + vst1q_u8(dst_ptr + x, vcombine_u8(vmovn_u16(vreinterpretq_u16_s16(texels.val[0])), + vmovn_u16(vreinterpretq_u16_s16(texels.val[1])))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x)); + } + }, + src, dst); } break; } case DataType::S32: { /* Up-conversion S16 -> S32 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<int32_t 
*>(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const int16x8x2_t texels = - { - { - vld1q_s16(src_ptr + x), - vld1q_s16(src_ptr + x + 8) - } - }; + const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr()); - const int32x4x4_t texels_s32 = + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vmovl_s16(vget_low_s16(texels.val[0])), - vmovl_s16(vget_high_s16(texels.val[0])), - vmovl_s16(vget_low_s16(texels.val[1])), - vmovl_s16(vget_high_s16(texels.val[1])) - } - }; + const int16x8x2_t texels = {{vld1q_s16(src_ptr + x), vld1q_s16(src_ptr + x + 8)}}; - vst1q_s32(dst_ptr + x, texels_s32.val[0]); - vst1q_s32(dst_ptr + x + 4, texels_s32.val[1]); - vst1q_s32(dst_ptr + x + 8, texels_s32.val[2]); - vst1q_s32(dst_ptr + x + 12, texels_s32.val[3]); - } + const int32x4x4_t texels_s32 = { + {vmovl_s16(vget_low_s16(texels.val[0])), vmovl_s16(vget_high_s16(texels.val[0])), + vmovl_s16(vget_low_s16(texels.val[1])), vmovl_s16(vget_high_s16(texels.val[1]))}}; - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x)); - } - }, - src, dst); + vst1q_s32(dst_ptr + x, texels_s32.val[0]); + vst1q_s32(dst_ptr + x + 4, texels_s32.val[1]); + vst1q_s32(dst_ptr + x + 8, texels_s32.val[2]); + vst1q_s32(dst_ptr + x + 12, texels_s32.val[3]); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x)); + } + }, + src, dst); break; } default: @@ -828,104 +745,92 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr case DataType::U16: { - switch(_dst->info()->data_type()) + switch (_dst->info()->data_type()) { case DataType::U8: { /* Down-conversion U16 -> U8 */ - if(ConvertPolicy::SATURATE == _policy) + if (ConvertPolicy::SATURATE == _policy) { - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const uint16x8x2_t texels = + const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_u16(src_ptr + x), - vld1q_u16(src_ptr + x + 8) - } - }; + const uint16x8x2_t texels = {{vld1q_u16(src_ptr + x), vld1q_u16(src_ptr + x + 8)}}; - vst1q_u8(dst_ptr + x, vcombine_u8(vqmovn_u16(texels.val[0]), vqmovn_u16(texels.val[1]))); - } + vst1q_u8(dst_ptr + x, + vcombine_u8(vqmovn_u16(texels.val[0]), vqmovn_u16(texels.val[1]))); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x)); - } - }, - src, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x)); + } + }, + src, dst); } else { - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr()); - - int x = window_start_x; - for(; x <= 
(window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const uint16x8x2_t texels = - { - { - vld1q_u16(src_ptr + x), - vld1q_u16(src_ptr + x + 8) - } - }; + const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr()); - vst1q_u8(dst_ptr + x, vcombine_u8(vmovn_u16(texels.val[0]), vmovn_u16(texels.val[1]))); - } + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint16x8x2_t texels = {{vld1q_u16(src_ptr + x), vld1q_u16(src_ptr + x + 8)}}; - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x)); - } + vst1q_u8(dst_ptr + x, + vcombine_u8(vmovn_u16(texels.val[0]), vmovn_u16(texels.val[1]))); + } - }, - src, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x)); + } + }, + src, dst); } break; } case DataType::U32: { /* Up-conversion U16 -> U32 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<uint32_t *>(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const uint16x8x2_t texels = + const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<uint32_t *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_u16(src_ptr + x), - vld1q_u16(src_ptr + x + 8) - } - }; - - vst1q_u32(dst_ptr + x, vmovl_u16(vget_low_u16(texels.val[0]))); - vst1q_u32(dst_ptr + x + 4, vmovl_u16(vget_high_u16(texels.val[0]))); - vst1q_u32(dst_ptr + x + 8, vmovl_u16(vget_low_u16(texels.val[1]))); - vst1q_u32(dst_ptr + x + 12, vmovl_u16(vget_high_u16(texels.val[1]))); - } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast<uint32_t>(*(src_ptr + x)); - } + const uint16x8x2_t texels = {{vld1q_u16(src_ptr + x), vld1q_u16(src_ptr + x + 8)}}; - }, - src, dst); + vst1q_u32(dst_ptr + x, vmovl_u16(vget_low_u16(texels.val[0]))); + vst1q_u32(dst_ptr + x + 4, vmovl_u16(vget_high_u16(texels.val[0]))); + vst1q_u32(dst_ptr + x + 8, vmovl_u16(vget_low_u16(texels.val[1]))); + vst1q_u32(dst_ptr + x + 12, vmovl_u16(vget_high_u16(texels.val[1]))); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<uint32_t>(*(src_ptr + x)); + } + }, + src, dst); break; } default: @@ -941,7 +846,7 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr break; } case DataType::F32: - switch(_dst->info()->data_type()) + switch (_dst->info()->data_type()) { case DataType::F16: { @@ -953,105 +858,110 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr case DataType::S32: { /* Conversion F32 -> S32 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const float *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const float32x4x4_t texels = + const auto src_ptr = reinterpret_cast<const float *>(src.ptr()); + const auto dst_ptr = 
reinterpret_cast<int32_t *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { + const float32x4x4_t texels = {{ vld1q_f32(src_ptr + x), vld1q_f32(src_ptr + x + 4), vld1q_f32(src_ptr + x + 8), vld1q_f32(src_ptr + x + 12), - } - }; + }}; - vst1q_s32(dst_ptr + x, vcvtq_s32_f32(texels.val[0])); - vst1q_s32(dst_ptr + x + 4, vcvtq_s32_f32(texels.val[1])); - vst1q_s32(dst_ptr + x + 8, vcvtq_s32_f32(texels.val[2])); - vst1q_s32(dst_ptr + x + 12, vcvtq_s32_f32(texels.val[3])); - } + vst1q_s32(dst_ptr + x, vcvtq_s32_f32(texels.val[0])); + vst1q_s32(dst_ptr + x + 4, vcvtq_s32_f32(texels.val[1])); + vst1q_s32(dst_ptr + x + 8, vcvtq_s32_f32(texels.val[2])); + vst1q_s32(dst_ptr + x + 12, vcvtq_s32_f32(texels.val[3])); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x)); - } - }, - src, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x)); + } + }, + src, dst); break; } case DataType::QASYMM8: case DataType::U8: { /* Down-conversion F32 -> U8 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const float *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const float32x4x4_t texels = + const auto src_ptr = reinterpret_cast<const float *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { + const float32x4x4_t texels = {{ vld1q_f32(src_ptr + x), vld1q_f32(src_ptr + x + 4), vld1q_f32(src_ptr + x + 8), vld1q_f32(src_ptr + x + 12), - } - }; - - vst1_u8(dst_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[0])), vqmovun_s32(vcvtq_s32_f32(texels.val[1]))))); - vst1_u8(dst_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[2])), vqmovun_s32(vcvtq_s32_f32(texels.val[3]))))); - } + }}; + + vst1_u8(dst_ptr + x, + vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[0])), + vqmovun_s32(vcvtq_s32_f32(texels.val[1]))))); + vst1_u8(dst_ptr + x + 8, + vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[2])), + vqmovun_s32(vcvtq_s32_f32(texels.val[3]))))); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x)); - } - }, - src, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x)); + } + }, + src, dst); break; } case DataType::QASYMM8_SIGNED: { /* Down-conversion F32 -> QASYMM8_SIGNED */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const float *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const float32x4x4_t texels = + const auto src_ptr = reinterpret_cast<const float *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { + const float32x4x4_t texels = {{ vld1q_f32(src_ptr + x), 
vld1q_f32(src_ptr + x + 4), vld1q_f32(src_ptr + x + 8), vld1q_f32(src_ptr + x + 12), - } - }; - - vst1_s8(dst_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[0])), vqmovn_s32(vcvtq_s32_f32(texels.val[1]))))); - vst1_s8(dst_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[2])), vqmovn_s32(vcvtq_s32_f32(texels.val[3]))))); - } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x)); - } - }, - src, dst); + }}; + + vst1_s8(dst_ptr + x, + vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[0])), + vqmovn_s32(vcvtq_s32_f32(texels.val[1]))))); + vst1_s8(dst_ptr + x + 8, + vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[2])), + vqmovn_s32(vcvtq_s32_f32(texels.val[3]))))); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x)); + } + }, + src, dst); break; } @@ -1060,7 +970,7 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr } break; case DataType::S32: - switch(_dst->info()->data_type()) + switch (_dst->info()->data_type()) { #if __aarch64__ case DataType::S64: @@ -1079,104 +989,102 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr case DataType::F32: { /* Conversion S32 -> F32 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<float *>(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const int32x4x4_t texels = + const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<float *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { + const int32x4x4_t texels = {{ vld1q_s32(src_ptr + x), vld1q_s32(src_ptr + x + 4), vld1q_s32(src_ptr + x + 8), vld1q_s32(src_ptr + x + 12), - } - }; + }}; - vst1q_f32(dst_ptr + x, vcvtq_f32_s32(texels.val[0])); - vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(texels.val[1])); - vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(texels.val[2])); - vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(texels.val[3])); - } + vst1q_f32(dst_ptr + x, vcvtq_f32_s32(texels.val[0])); + vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(texels.val[1])); + vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(texels.val[2])); + vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(texels.val[3])); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast<float>(*(src_ptr + x)); - } - }, - src, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<float>(*(src_ptr + x)); + } + }, + src, dst); break; } case DataType::QASYMM8_SIGNED: { /* Down-conversion S32 -> QASYMM8_SIGNED */ - if(ConvertPolicy::SATURATE == _policy) + if (ConvertPolicy::SATURATE == _policy) { - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const int32x4x4_t texels = + const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr()); + const auto dst_ptr = 
reinterpret_cast<int8_t *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { + const int32x4x4_t texels = {{ vld1q_s32(src_ptr + x), vld1q_s32(src_ptr + x + 4), vld1q_s32(src_ptr + x + 8), vld1q_s32(src_ptr + x + 12), - } - }; - vst1_s8(dst_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[0]), vqmovn_s32(texels.val[1])))); - vst1_s8(dst_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[2]), vqmovn_s32(texels.val[3])))); - } + }}; + vst1_s8(dst_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[0]), + vqmovn_s32(texels.val[1])))); + vst1_s8(dst_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[2]), + vqmovn_s32(texels.val[3])))); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x)); - } - }, - src, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x)); + } + }, + src, dst); } else { - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const int32x4x4_t texels = - { - { - vld1q_s32(src_ptr + x), - vld1q_s32(src_ptr + x + 4), - vld1q_s32(src_ptr + x + 8), - vld1q_s32(src_ptr + x + 12) - } - }; + const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr()); - vst1_s8(dst_ptr + x, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[0]), vmovn_s32(texels.val[1])))); - vst1_s8(dst_ptr + x + 8, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[2]), vmovn_s32(texels.val[3])))); - } + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int32x4x4_t texels = {{vld1q_s32(src_ptr + x), vld1q_s32(src_ptr + x + 4), + vld1q_s32(src_ptr + x + 8), + vld1q_s32(src_ptr + x + 12)}}; + + vst1_s8(dst_ptr + x, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[0]), + vmovn_s32(texels.val[1])))); + vst1_s8(dst_ptr + x + 8, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[2]), + vmovn_s32(texels.val[3])))); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast<int8_t>(*(src_ptr + x)); - } - }, - src, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<int8_t>(*(src_ptr + x)); + } + }, + src, dst); } break; } @@ -1184,68 +1092,66 @@ void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const Thr case DataType::U8: { /* Down-conversion S32 -> U8 */ - if(ConvertPolicy::SATURATE == _policy) + if (ConvertPolicy::SATURATE == _policy) { - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const int32x4x4_t texels = + const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_s32(src_ptr + x), - vld1q_s32(src_ptr + 
x + 4), - vld1q_s32(src_ptr + x + 8), - vld1q_s32(src_ptr + x + 12) - } - }; - vst1_u8(dst_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[0]), vqmovun_s32(texels.val[1])))); - vst1_u8(dst_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[2]), vqmovun_s32(texels.val[3])))); - } + const int32x4x4_t texels = {{vld1q_s32(src_ptr + x), vld1q_s32(src_ptr + x + 4), + vld1q_s32(src_ptr + x + 8), + vld1q_s32(src_ptr + x + 12)}}; + vst1_u8(dst_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[0]), + vqmovun_s32(texels.val[1])))); + vst1_u8(dst_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[2]), + vqmovun_s32(texels.val[3])))); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x)); - } - }, - src, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x)); + } + }, + src, dst); } else { - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const int32x4x4_t texels = - { - { - vld1q_s32(src_ptr + x), - vld1q_s32(src_ptr + x + 4), - vld1q_s32(src_ptr + x + 8), - vld1q_s32(src_ptr + x + 12) - } - }; + const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr()); - vst1_u8(dst_ptr + x, vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[0])), vmovn_u32(vreinterpretq_u32_s32(texels.val[1]))))); - vst1_u8(dst_ptr + x + 8, vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[2])), vmovn_u32(vreinterpretq_u32_s32(texels.val[3]))))); - } + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int32x4x4_t texels = {{vld1q_s32(src_ptr + x), vld1q_s32(src_ptr + x + 4), + vld1q_s32(src_ptr + x + 8), + vld1q_s32(src_ptr + x + 12)}}; + + vst1_u8(dst_ptr + x, + vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[0])), + vmovn_u32(vreinterpretq_u32_s32(texels.val[1]))))); + vst1_u8(dst_ptr + x + 8, + vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[2])), + vmovn_u32(vreinterpretq_u32_s32(texels.val[3]))))); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x)); - } - }, - src, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x)); + } + }, + src, dst); } break; } diff --git a/src/cpu/kernels/CpuCastKernel.h b/src/cpu/kernels/CpuCastKernel.h index a7e6417ff2..ddbfe1f034 100644 --- a/src/cpu/kernels/CpuCastKernel.h +++ b/src/cpu/kernels/CpuCastKernel.h @@ -40,7 +40,8 @@ namespace kernels class CpuCastKernel : public ICpuKernel<CpuCastKernel> { private: - using CastKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, const ThreadInfo &, ConvertPolicy, const Window &)>::type; + using CastKernelPtr = + std::add_pointer<void(const ITensor *, ITensor *, const ThreadInfo &, ConvertPolicy, const Window &)>::type; public: CpuCastKernel() = default; @@ -76,7 +77,7 @@ public: static Status validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy); // Inherited methods overridden: - void 
run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; struct CastKernel @@ -89,7 +90,7 @@ public: static const std::vector<CastKernel> &get_available_kernels(); private: - ConvertPolicy _policy{ ConvertPolicy::SATURATE }; + ConvertPolicy _policy{ConvertPolicy::SATURATE}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuCol2ImKernel.cpp b/src/cpu/kernels/CpuCol2ImKernel.cpp index bf5a44d78b..a52a1f58ea 100644 --- a/src/cpu/kernels/CpuCol2ImKernel.cpp +++ b/src/cpu/kernels/CpuCol2ImKernel.cpp @@ -29,8 +29,9 @@ #include "arm_compute/core/Size2D.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -49,9 +50,10 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); // Validate configured output - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), compute_col2im_shape(*src, convolved_dims, false)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), + compute_col2im_shape(*src, convolved_dims, false)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); } @@ -106,13 +108,16 @@ void CpuCol2ImKernel::run_op(ITensorPack &tensors, const Window &window, const T Iterator in(src, window); Iterator out(dst, window_out); - execute_window_loop(window, [&](const Coordinates & id) - { - const int hidx = id.y(); - const int idx = id.x() * output_stride_z + (hidx / _convolved_dims.width) * output_stride_y + (hidx % _convolved_dims.width) * output_stride_x; - std::memcpy(out.ptr() + idx, in.ptr(), el_size); - }, - in, out); + execute_window_loop( + window, + [&](const Coordinates &id) + { + const int hidx = id.y(); + const int idx = id.x() * output_stride_z + (hidx / _convolved_dims.width) * output_stride_y + + (hidx % _convolved_dims.width) * output_stride_x; + std::memcpy(out.ptr() + idx, in.ptr(), el_size); + }, + in, out); } const char *CpuCol2ImKernel::name() const @@ -121,4 +126,4 @@ const char *CpuCol2ImKernel::name() const } } // namespace kernels } // namespace cpu -} // namespace arm_compute
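// Note (annotation on CpuCol2ImKernel::run_op above): the destination offset `idx` maps each
// im2col column back to its spatial location. id.x() scaled by output_stride_z selects the
// destination plane, while hidx / _convolved_dims.width and hidx % _convolved_dims.width
// recover the output row and column (scaled by output_stride_y and output_stride_x) before
// the element is copied with std::memcpy.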
\ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuCol2ImKernel.h b/src/cpu/kernels/CpuCol2ImKernel.h index deafcc14df..3e394ac914 100644 --- a/src/cpu/kernels/CpuCol2ImKernel.h +++ b/src/cpu/kernels/CpuCol2ImKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_COL2IM_KERNEL_H #include "arm_compute/core/Size2D.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" @@ -75,7 +76,7 @@ public: static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &convolved_dims); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; private: diff --git a/src/cpu/kernels/CpuConcatenateBatchKernel.cpp b/src/cpu/kernels/CpuConcatenateBatchKernel.cpp index 29d40f0e52..8c290173e8 100644 --- a/src/cpu/kernels/CpuConcatenateBatchKernel.cpp +++ b/src/cpu/kernels/CpuConcatenateBatchKernel.cpp @@ -30,10 +30,11 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "src/core/NEON/NEAsymm.h" -#include "src/core/NEON/wrapper/wrapper.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute { @@ -50,13 +51,14 @@ void batch_concat(const ITensor *src, ITensor *dst, unsigned int batch_offset, c uint8_t *src_ptr = src->buffer() + src->info()->offset_first_element_in_bytes(); // Offset dst - uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() + batch_offset * dst->info()->strides_in_bytes()[3]; + uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() + + batch_offset * dst->info()->strides_in_bytes()[3]; const auto window_start_x = static_cast<int>(window.x().start()); const auto window_end_x = static_cast<int>(window.x().end()); const int window_step_x = 16 / dst->info()->element_size(); - Window win{ window }; + Window win{window}; win.set(Window::DimX, Window::Dimension(0, 1, 1)); win.set(3, Window::Dimension(0, src->info()->tensor_shape()[3], 1)); @@ -66,66 +68,74 @@ void batch_concat(const ITensor *src, ITensor *dst, unsigned int batch_offset, c const DataType dt = src->info()->data_type(); const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform(); const UniformQuantizationInfo dst_qinfo = dst->info()->quantization_info().uniform(); - if(dt == DataType::QASYMM8 && src_qinfo != dst_qinfo) + if (dt == DataType::QASYMM8 && src_qinfo != dst_qinfo) { - execute_window_loop(win, [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast<const uint8_t *>(src_ptr + src_it.offset()); - const auto out_ptr = reinterpret_cast<uint8_t *>(dst_ptr + dst_it.offset()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - wrapper::vstore(out_ptr, vquantize(vdequantize(wrapper::vloadq(in_ptr), src_qinfo), dst_qinfo)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(out_ptr + x) = quantize_qasymm8(dequantize_qasymm8(*(in_ptr + x), src_qinfo), dst_qinfo); - } - }, - src_it, dst_it); + const auto in_ptr = reinterpret_cast<const uint8_t *>(src_ptr + src_it.offset()); + const auto out_ptr = reinterpret_cast<uint8_t *>(dst_ptr + dst_it.offset()); + + 
int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + wrapper::vstore(out_ptr, vquantize(vdequantize(wrapper::vloadq(in_ptr), src_qinfo), dst_qinfo)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(out_ptr + x) = quantize_qasymm8(dequantize_qasymm8(*(in_ptr + x), src_qinfo), dst_qinfo); + } + }, + src_it, dst_it); } - else if(dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo) + else if (dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo) { - execute_window_loop(win, [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast<const int8_t *>(src_ptr + src_it.offset()); - const auto out_ptr = reinterpret_cast<int8_t *>(dst_ptr + dst_it.offset()); - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - wrapper::vstore(out_ptr, vquantize_signed(vdequantize(wrapper::vloadq(in_ptr), src_qinfo), dst_qinfo)); - } - // Compute left-over elements - for(; x < window_end_x; ++x) + execute_window_loop( + win, + [&](const Coordinates &) { - *(out_ptr + x) = quantize_qasymm8_signed(dequantize_qasymm8_signed(*(in_ptr + x), src_qinfo), dst_qinfo); - } - }, - src_it, dst_it); + const auto in_ptr = reinterpret_cast<const int8_t *>(src_ptr + src_it.offset()); + const auto out_ptr = reinterpret_cast<int8_t *>(dst_ptr + dst_it.offset()); + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + wrapper::vstore(out_ptr, + vquantize_signed(vdequantize(wrapper::vloadq(in_ptr), src_qinfo), dst_qinfo)); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(out_ptr + x) = + quantize_qasymm8_signed(dequantize_qasymm8_signed(*(in_ptr + x), src_qinfo), dst_qinfo); + } + }, + src_it, dst_it); } else { - execute_window_loop(win, [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast<const T *>(src_ptr + src_it.offset()); - const auto out_ptr = reinterpret_cast<T *>(dst_ptr + dst_it.offset()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) + execute_window_loop( + win, + [&](const Coordinates &) { - *(out_ptr + x) = *(in_ptr + x); - } - }, - src_it, dst_it); + const auto in_ptr = reinterpret_cast<const T *>(src_ptr + src_it.offset()); + const auto out_ptr = reinterpret_cast<T *>(dst_ptr + dst_it.offset()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(out_ptr + x) = *(in_ptr + x); + } + }, + src_it, dst_it); } } @@ -154,7 +164,7 @@ void CpuConcatenateBatchKernel::configure(const ITensorInfo *src, unsigned int b _func = nullptr; _batch_offset = batch_offset; - switch(src->data_type()) + switch (src->data_type()) { case DataType::S8: case DataType::U8: @@ -196,9 +206,7 @@ void CpuConcatenateBatchKernel::run_op(ITensorPack &tensors, const Window &windo ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); ARM_COMPUTE_ERROR_ON(_func == nullptr); - (*_func)(tensors.get_const_tensor(TensorType::ACL_SRC), - tensors.get_tensor(TensorType::ACL_DST), - _batch_offset, + (*_func)(tensors.get_const_tensor(TensorType::ACL_SRC), tensors.get_tensor(TensorType::ACL_DST), _batch_offset, window); } diff --git a/src/cpu/kernels/CpuConcatenateBatchKernel.h 
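// A minimal standalone sketch (not ACL code) of the processing pattern the concatenation
// kernels above follow: a vectorized main loop that advances by window_step_x elements,
// then a scalar tail loop for the leftover elements that do not fill a whole vector.
// The plain memcpy "vector" here only stands in for wrapper::vloadq / wrapper::vstore.
#include <cstring>

void copy_with_tail(const float *src, float *dst, int len)
{
    constexpr int step = 4;            // stands in for 16 / element_size
    int x = 0;
    for (; x <= len - step; x += step) // main loop: whole "vectors"
    {
        std::memcpy(dst + x, src + x, step * sizeof(float));
    }
    for (; x < len; ++x)               // leftover elements
    {
        dst[x] = src[x];
    }
}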
b/src/cpu/kernels/CpuConcatenateBatchKernel.h index 0de68a5d64..52ea553a7d 100644 --- a/src/cpu/kernels/CpuConcatenateBatchKernel.h +++ b/src/cpu/kernels/CpuConcatenateBatchKernel.h @@ -57,15 +57,15 @@ public: static Status validate(const ITensorInfo *src, unsigned int batch_offset, const ITensorInfo *dst); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; private: using BatchConcatFunction = void(const ITensor *, ITensor *, unsigned int, const Window &); private: - BatchConcatFunction *_func{ nullptr }; - unsigned int _batch_offset{ 0 }; + BatchConcatFunction *_func{nullptr}; + unsigned int _batch_offset{0}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuConcatenateDepthKernel.cpp b/src/cpu/kernels/CpuConcatenateDepthKernel.cpp index ebc5322aee..c75e1e4477 100644 --- a/src/cpu/kernels/CpuConcatenateDepthKernel.cpp +++ b/src/cpu/kernels/CpuConcatenateDepthKernel.cpp @@ -30,11 +30,12 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" + +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "src/core/NEON/NEAsymm.h" #include "src/core/NEON/NEFixedPoint.h" #include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" #include <cstdint> @@ -53,13 +54,14 @@ void depth_concat(const ITensor *src, ITensor *dst, unsigned int depth_offset, c uint8_t *src_ptr = src->buffer() + src->info()->offset_first_element_in_bytes(); // Offset destination - uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() + depth_offset * dst->info()->strides_in_bytes()[2]; + uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() + + depth_offset * dst->info()->strides_in_bytes()[2]; const auto window_start_x = static_cast<int>(window.x().start()); const auto window_end_x = static_cast<int>(window.x().end()); const int window_step_x = 16 / dst->info()->element_size(); - Window win{ window }; + Window win{window}; win.set(Window::DimX, Window::Dimension(0, 1, 1)); win.set(Window::DimZ, Window::Dimension(0, src->info()->tensor_shape().z(), 1)); @@ -69,64 +71,73 @@ void depth_concat(const ITensor *src, ITensor *dst, unsigned int depth_offset, c const DataType dt = src->info()->data_type(); const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform(); const UniformQuantizationInfo dst_qinfo = dst->info()->quantization_info().uniform(); - if(dt == DataType::QASYMM8 && src_qinfo != dst_qinfo) + if (dt == DataType::QASYMM8 && src_qinfo != dst_qinfo) { - execute_window_loop(win, [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast<const uint8_t *>(src_ptr + src_it.offset()); - const auto out_ptr = reinterpret_cast<uint8_t *>(dst_ptr + dst_it.offset()); - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - wrapper::vstore(out_ptr + x, vquantize(vdequantize(wrapper::vloadq(in_ptr + x), src_qinfo), dst_qinfo)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(out_ptr + x) = quantize_qasymm8(dequantize_qasymm8(*(in_ptr + x), src_qinfo), dst_qinfo); - } - }, - src_it, dst_it); + const auto in_ptr = 
reinterpret_cast<const uint8_t *>(src_ptr + src_it.offset()); + const auto out_ptr = reinterpret_cast<uint8_t *>(dst_ptr + dst_it.offset()); + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + wrapper::vstore(out_ptr + x, + vquantize(vdequantize(wrapper::vloadq(in_ptr + x), src_qinfo), dst_qinfo)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(out_ptr + x) = quantize_qasymm8(dequantize_qasymm8(*(in_ptr + x), src_qinfo), dst_qinfo); + } + }, + src_it, dst_it); } - else if(dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo) + else if (dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo) { - execute_window_loop(win, [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast<const int8_t *>(src_ptr + src_it.offset()); - const auto out_ptr = reinterpret_cast<int8_t *>(dst_ptr + dst_it.offset()); - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - wrapper::vstore(out_ptr + x, vquantize_signed(vdequantize(wrapper::vloadq(in_ptr + x), src_qinfo), dst_qinfo)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(out_ptr + x) = quantize_qasymm8_signed(dequantize_qasymm8_signed(*(in_ptr + x), src_qinfo), dst_qinfo); - } - }, - src_it, dst_it); + const auto in_ptr = reinterpret_cast<const int8_t *>(src_ptr + src_it.offset()); + const auto out_ptr = reinterpret_cast<int8_t *>(dst_ptr + dst_it.offset()); + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + wrapper::vstore(out_ptr + x, + vquantize_signed(vdequantize(wrapper::vloadq(in_ptr + x), src_qinfo), dst_qinfo)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(out_ptr + x) = + quantize_qasymm8_signed(dequantize_qasymm8_signed(*(in_ptr + x), src_qinfo), dst_qinfo); + } + }, + src_it, dst_it); } else { - execute_window_loop(win, [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast<const T *>(src_ptr + src_it.offset()); - const auto out_ptr = reinterpret_cast<T *>(dst_ptr + dst_it.offset()); - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x)); - } - // Compute left-over elements - for(; x < window_end_x; ++x) + execute_window_loop( + win, + [&](const Coordinates &) { - *(out_ptr + x) = *(in_ptr + x); - } - }, - src_it, dst_it); + const auto in_ptr = reinterpret_cast<const T *>(src_ptr + src_it.offset()); + const auto out_ptr = reinterpret_cast<T *>(dst_ptr + dst_it.offset()); + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x)); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(out_ptr + x) = *(in_ptr + x); + } + }, + src_it, dst_it); } } @@ -134,7 +145,8 @@ Status validate_arguments(const ITensorInfo *input, unsigned int depth_offset, c { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use CPU FP16 instructions. 
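// A minimal standalone sketch (not ACL code) of the requantization step that the
// batch/depth/height/width concatenation kernels above perform when source and
// destination QASYMM8 tensors carry different UniformQuantizationInfo: each value is
// dequantized with the source scale/offset and re-quantized with the destination
// scale/offset. Names (QuantInfo, requantize_u8) and the rounding mode are illustrative.
#include <algorithm>
#include <cmath>
#include <cstdint>

struct QuantInfo
{
    float   scale;
    int32_t offset;
};

static inline uint8_t requantize_u8(uint8_t in, const QuantInfo &src, const QuantInfo &dst)
{
    const float real = (static_cast<int32_t>(in) - src.offset) * src.scale;            // dequantize with src qinfo
    const int   q    = static_cast<int>(std::lround(real / dst.scale)) + dst.offset;   // re-quantize with dst qinfo
    return static_cast<uint8_t>(std::min(std::max(q, 0), 255));                        // clamp to the QASYMM8 range
}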
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(Window::DimX) != output->dimension(Window::DimX)); @@ -154,7 +166,7 @@ void CpuConcatenateDepthKernel::configure(const ITensorInfo *src, unsigned int d _func = nullptr; _depth_offset = depth_offset; - switch(src->data_type()) + switch (src->data_type()) { case DataType::QASYMM8: _func = &depth_concat<uint8_t>; @@ -192,9 +204,7 @@ void CpuConcatenateDepthKernel::run_op(ITensorPack &tensors, const Window &windo ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); ARM_COMPUTE_ERROR_ON(_func == nullptr); - (*_func)(tensors.get_const_tensor(TensorType::ACL_SRC), - tensors.get_tensor(TensorType::ACL_DST), - _depth_offset, + (*_func)(tensors.get_const_tensor(TensorType::ACL_SRC), tensors.get_tensor(TensorType::ACL_DST), _depth_offset, window); } diff --git a/src/cpu/kernels/CpuConcatenateDepthKernel.h b/src/cpu/kernels/CpuConcatenateDepthKernel.h index 5a0edb95bb..54de9aff46 100644 --- a/src/cpu/kernels/CpuConcatenateDepthKernel.h +++ b/src/cpu/kernels/CpuConcatenateDepthKernel.h @@ -65,15 +65,15 @@ public: static Status validate(const ITensorInfo *src, unsigned int depth_offset, const ITensorInfo *dst); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; private: using DepthConcatFunction = void(const ITensor *, ITensor *, unsigned int, const Window &); private: - DepthConcatFunction *_func{ nullptr }; - unsigned int _depth_offset{ 0 }; + DepthConcatFunction *_func{nullptr}; + unsigned int _depth_offset{0}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuConcatenateHeightKernel.cpp b/src/cpu/kernels/CpuConcatenateHeightKernel.cpp index 47a2b44443..b6c11d948b 100644 --- a/src/cpu/kernels/CpuConcatenateHeightKernel.cpp +++ b/src/cpu/kernels/CpuConcatenateHeightKernel.cpp @@ -30,10 +30,11 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "src/core/NEON/NEAsymm.h" -#include "src/core/NEON/wrapper/wrapper.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/wrapper/wrapper.h" #include <cstdint> @@ -53,7 +54,7 @@ Status validate_arguments(const ITensorInfo *src, unsigned int height_offset, co ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimX) != dst->dimension(Window::DimX)); ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimY) + height_offset > dst->dimension(Window::DimY)); - for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i) + for (size_t i = 2; i < Coordinates::num_max_dimensions; ++i) { ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(i) != dst->dimension(i)); } @@ -91,13 +92,14 @@ void CpuConcatenateHeightKernel::run_op(ITensorPack &tensors, const Window &wind auto dst = tensors.get_tensor(TensorType::ACL_DST); // Offset destination pointer to the correct position - uint8_t *dst_ptr = dst->buffer() + 
dst->info()->offset_first_element_in_bytes() + _height_offset * dst->info()->strides_in_bytes()[Window::DimY]; + uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() + + _height_offset * dst->info()->strides_in_bytes()[Window::DimY]; const auto window_start_x = static_cast<int>(window.x().start()); const auto window_end_x = static_cast<int>(window.x().end()) * static_cast<int>(dst->info()->element_size()); const int window_step_x = 16; - Window win{ window }; + Window win{window}; win.set(Window::DimX, Window::Dimension(0, 1, 1)); win.set(Window::DimY, Window::Dimension(0, src->info()->tensor_shape().y(), 1)); @@ -108,64 +110,74 @@ void CpuConcatenateHeightKernel::run_op(ITensorPack &tensors, const Window &wind const DataType dt = src->info()->data_type(); const UniformQuantizationInfo &src_qinfo = src->info()->quantization_info().uniform(); const UniformQuantizationInfo &dst_qinfo = dst->info()->quantization_info().uniform(); - if(dt == DataType::QASYMM8 && src_qinfo != dst_qinfo) + if (dt == DataType::QASYMM8 && src_qinfo != dst_qinfo) { - execute_window_loop(win, [&](const Coordinates &) - { - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - vst1q_u8(dst_ptr + dst_it.offset() + x, vquantize(vdequantize(vld1q_u8(src_it.ptr() + x), src_qinfo), dst_qinfo)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) + execute_window_loop( + win, + [&](const Coordinates &) { - *(dst_ptr + dst_it.offset() + x) = quantize_qasymm8(dequantize_qasymm8(*(src_it.ptr() + x), src_qinfo), dst_qinfo); - } - - }, - src_it, dst_it); + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + vst1q_u8(dst_ptr + dst_it.offset() + x, + vquantize(vdequantize(vld1q_u8(src_it.ptr() + x), src_qinfo), dst_qinfo)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + dst_it.offset() + x) = + quantize_qasymm8(dequantize_qasymm8(*(src_it.ptr() + x), src_qinfo), dst_qinfo); + } + }, + src_it, dst_it); } - else if(dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo) + else if (dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo) { - execute_window_loop(win, [&](const Coordinates &) - { - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - vst1q_s8(reinterpret_cast<int8_t *>(dst_ptr + dst_it.offset() + x), - vquantize_signed(vdequantize(vld1q_s8(reinterpret_cast<int8_t *>(src_it.ptr()) + x), src_qinfo), dst_qinfo)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) + execute_window_loop( + win, + [&](const Coordinates &) { - *(dst_ptr + dst_it.offset() + x) = quantize_qasymm8_signed(dequantize_qasymm8_signed(*(src_it.ptr() + x), src_qinfo), dst_qinfo); - } - }, - src_it, dst_it); + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + vst1q_s8( + reinterpret_cast<int8_t *>(dst_ptr + dst_it.offset() + x), + vquantize_signed(vdequantize(vld1q_s8(reinterpret_cast<int8_t *>(src_it.ptr()) + x), src_qinfo), + dst_qinfo)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + dst_it.offset() + x) = + quantize_qasymm8_signed(dequantize_qasymm8_signed(*(src_it.ptr() + x), src_qinfo), dst_qinfo); + } + }, + src_it, dst_it); } else { - execute_window_loop(win, [&](const Coordinates &) - { - const auto in_ptr = src_it.ptr(); - const auto out_ptr = dst_ptr + dst_it.offset(); - - int x = window_start_x; - for(; x <= (window_end_x - 
window_step_x); x += window_step_x) - { - wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) + execute_window_loop( + win, + [&](const Coordinates &) { - *(out_ptr + x) = *(in_ptr + x); - } - }, - src_it, dst_it); + const auto in_ptr = src_it.ptr(); + const auto out_ptr = dst_ptr + dst_it.offset(); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(out_ptr + x) = *(in_ptr + x); + } + }, + src_it, dst_it); } } diff --git a/src/cpu/kernels/CpuConcatenateHeightKernel.h b/src/cpu/kernels/CpuConcatenateHeightKernel.h index 74d5d0c2c3..df880c4878 100644 --- a/src/cpu/kernels/CpuConcatenateHeightKernel.h +++ b/src/cpu/kernels/CpuConcatenateHeightKernel.h @@ -58,11 +58,11 @@ public: static Status validate(const ITensorInfo *src, unsigned int height_offset, const ITensorInfo *dst); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; private: - unsigned int _height_offset{ 0 }; + unsigned int _height_offset{0}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuConcatenateWidthKernel.cpp b/src/cpu/kernels/CpuConcatenateWidthKernel.cpp index f00b37a01b..f6100cccca 100644 --- a/src/cpu/kernels/CpuConcatenateWidthKernel.cpp +++ b/src/cpu/kernels/CpuConcatenateWidthKernel.cpp @@ -24,12 +24,12 @@ #include "src/cpu/kernels/CpuConcatenateWidthKernel.h" #include "arm_compute/core/Error.h" -#include "arm_compute/core/Steps.h" #include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Steps.h" #include "arm_compute/core/Validate.h" -#include "src/core/NEON/NEAsymm.h" -#include "src/core/helpers/WindowHelpers.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NEAsymm.h" namespace arm_compute { @@ -47,7 +47,7 @@ Status validate_arguments(const ITensorInfo *src, unsigned int width_offset, con ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) + width_offset > dst->dimension(0)); - for(size_t i = 1; i < Coordinates::num_max_dimensions; ++i) + for (size_t i = 1; i < Coordinates::num_max_dimensions; ++i) { ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(i) != dst->dimension(i)); } @@ -86,13 +86,14 @@ void CpuConcatenateWidthKernel::run_op(ITensorPack &tensors, const Window &windo auto dst = tensors.get_tensor(TensorType::ACL_DST); // Offset output pointer to the correct position - uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() + _width_offset * dst->info()->strides_in_bytes()[0]; + uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() + + _width_offset * dst->info()->strides_in_bytes()[0]; const auto window_start_x = static_cast<int>(window.x().start()); const auto window_end_x = static_cast<int>(window.x().end()) * static_cast<int>(dst->info()->element_size()); constexpr int window_step_x = 16; - Window win{ window }; + Window win{window}; win.set(Window::DimX, Window::Dimension(0, 1, 1)); // Create iterators @@ -101,62 +102,73 @@ void CpuConcatenateWidthKernel::run_op(ITensorPack &tensors, const Window &windo const DataType dt = src->info()->data_type(); const UniformQuantizationInfo &src_qinfo = 
src->info()->quantization_info().uniform(); const UniformQuantizationInfo &dst_qinfo = dst->info()->quantization_info().uniform(); - if(dt == DataType::QASYMM8 && src_qinfo != dst_qinfo) + if (dt == DataType::QASYMM8 && src_qinfo != dst_qinfo) { - execute_window_loop(win, [&](const Coordinates &) - { - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - vst1q_u8(dst_ptr + dst_it.offset() + x, vquantize(vdequantize(vld1q_u8(src_it.ptr() + x), src_qinfo), dst_qinfo)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + dst_it.offset() + x) = quantize_qasymm8(dequantize_qasymm8(*(src_it.ptr() + x), src_qinfo), dst_qinfo); - } - }, - src_it, dst_it); + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + vst1q_u8(dst_ptr + dst_it.offset() + x, + vquantize(vdequantize(vld1q_u8(src_it.ptr() + x), src_qinfo), dst_qinfo)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + dst_it.offset() + x) = + quantize_qasymm8(dequantize_qasymm8(*(src_it.ptr() + x), src_qinfo), dst_qinfo); + } + }, + src_it, dst_it); } - else if(dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo) + else if (dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo) { - execute_window_loop(win, [&](const Coordinates &) - { - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - vst1q_s8(reinterpret_cast<int8_t *>(dst_ptr + dst_it.offset() + x), - vquantize_signed(vdequantize(vld1q_s8(reinterpret_cast<int8_t *>(src_it.ptr() + x)), src_qinfo), dst_qinfo)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) + execute_window_loop( + win, + [&](const Coordinates &) { - *(dst_ptr + dst_it.offset() + x) = quantize_qasymm8_signed(dequantize_qasymm8_signed(*(src_it.ptr() + x), src_qinfo), dst_qinfo); - } - }, - src_it, dst_it); + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + vst1q_s8( + reinterpret_cast<int8_t *>(dst_ptr + dst_it.offset() + x), + vquantize_signed(vdequantize(vld1q_s8(reinterpret_cast<int8_t *>(src_it.ptr() + x)), src_qinfo), + dst_qinfo)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + dst_it.offset() + x) = + quantize_qasymm8_signed(dequantize_qasymm8_signed(*(src_it.ptr() + x), src_qinfo), dst_qinfo); + } + }, + src_it, dst_it); } else { - execute_window_loop(win, [&](const Coordinates &) - { - const auto in_ptr = src_it.ptr(); - const auto out_ptr = dst_ptr + dst_it.offset(); - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) + execute_window_loop( + win, + [&](const Coordinates &) { - *(out_ptr + x) = *(in_ptr + x); - } - }, - src_it, dst_it); + const auto in_ptr = src_it.ptr(); + const auto out_ptr = dst_ptr + dst_it.offset(); + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(out_ptr + x) = *(in_ptr + x); + } + }, + src_it, dst_it); } } diff --git a/src/cpu/kernels/CpuConcatenateWidthKernel.h b/src/cpu/kernels/CpuConcatenateWidthKernel.h index 418bc51b33..560e44e35a 100644 --- 
a/src/cpu/kernels/CpuConcatenateWidthKernel.h +++ b/src/cpu/kernels/CpuConcatenateWidthKernel.h @@ -58,11 +58,11 @@ public: static Status validate(const ITensorInfo *src, unsigned int width_offset, const ITensorInfo *dst); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; private: - unsigned int _width_offset{ 0 }; + unsigned int _width_offset{0}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp b/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp index 08b39deef2..87703ec631 100644 --- a/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp +++ b/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Types.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -34,8 +35,10 @@ namespace cpu { namespace kernels { -void CpuConvertFullyConnectedWeightsKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_input_shape, - DataLayout data_layout) +void CpuConvertFullyConnectedWeightsKernel::configure(const ITensorInfo *src, + ITensorInfo *dst, + const TensorShape &original_input_shape, + DataLayout data_layout) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); @@ -43,7 +46,8 @@ void CpuConvertFullyConnectedWeightsKernel::configure(const ITensorInfo *src, IT // Output tensor auto initialisation if not yet initialized auto_init_if_empty(*dst, *src->clone()); - ARM_COMPUTE_ERROR_THROW_ON(CpuConvertFullyConnectedWeightsKernel::validate(src, dst, original_input_shape, data_layout)); + ARM_COMPUTE_ERROR_THROW_ON( + CpuConvertFullyConnectedWeightsKernel::validate(src, dst, original_input_shape, data_layout)); const DataLayout input_data_layout = (data_layout == DataLayout::NCHW) ? 
DataLayout::NHWC : DataLayout::NCHW; @@ -62,8 +66,10 @@ void CpuConvertFullyConnectedWeightsKernel::configure(const ITensorInfo *src, IT ICpuKernel::configure(win); } -Status CpuConvertFullyConnectedWeightsKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_input_shape, - DataLayout data_layout) +Status CpuConvertFullyConnectedWeightsKernel::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const TensorShape &original_input_shape, + DataLayout data_layout) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); @@ -72,7 +78,7 @@ Status CpuConvertFullyConnectedWeightsKernel::validate(const ITensorInfo *src, c ARM_COMPUTE_RETURN_ERROR_ON(data_layout == DataLayout::UNKNOWN); // Checks performed when dst is configured - if((dst != nullptr) && (dst->total_size() != 0)) + if ((dst != nullptr) && (dst->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); @@ -97,11 +103,15 @@ void CpuConvertFullyConnectedWeightsKernel::run_op(ITensorPack &tensors, const W Iterator input(src, window); Iterator output(dst, window); - execute_window_loop(window, [&](const Coordinates & id) - { - memcpy(output.ptr() + id.x() * dst_stride_x + (id.y() % _factor1 * _factor2 + id.y() / _factor1) * dst_stride_y, input.ptr(), element_size); - }, - input); + execute_window_loop( + window, + [&](const Coordinates &id) + { + memcpy(output.ptr() + id.x() * dst_stride_x + + (id.y() % _factor1 * _factor2 + id.y() / _factor1) * dst_stride_y, + input.ptr(), element_size); + }, + input); } const char *CpuConvertFullyConnectedWeightsKernel::name() const diff --git a/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h b/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h index 9a1393323b..2253889e69 100644 --- a/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h +++ b/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h @@ -53,24 +53,32 @@ public: * @param[in] original_input_shape Shape of the original src tensor (the one entering fully connected layer). * @param[in] data_layout The data layout the weights have been trained in. 
*/ - void configure(const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_input_shape, DataLayout data_layout); + void configure(const ITensorInfo *src, + ITensorInfo *dst, + const TensorShape &original_input_shape, + DataLayout data_layout); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref CpuConvertFullyConnectedWeightsKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_input_shape, DataLayout data_layout); + static Status validate(const ITensorInfo *src, + const ITensorInfo *dst, + const TensorShape &original_input_shape, + DataLayout data_layout); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; private: - unsigned int _factor1{ 0 }; /* equals to the number of elements per original src plane if @p data_layout == NCHW; its number of channels otherwise */ - unsigned int _factor2{ 0 }; /* equals to the number of elements per original src plane if @p data_layout == NHWC; its number of channels otherwise */ + unsigned int _factor1{ + 0}; /* equals to the number of elements per original src plane if @p data_layout == NCHW; its number of channels otherwise */ + unsigned int _factor2{ + 0}; /* equals to the number of elements per original src plane if @p data_layout == NHWC; its number of channels otherwise */ }; } // namespace kernels } // namespace cpu } // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_CONVERT_FULLYCONNECTED_WEIGHTS_KERNEL_H */
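// A minimal sketch (not ACL code) of the row permutation that
// CpuConvertFullyConnectedWeightsKernel::run_op applies above: flattened weight row y is
// moved to row (y % factor1) * factor2 + y / factor1, matching the destination offset in
// the memcpy and the member comments on _factor1/_factor2. The vector-of-rows
// representation and function name are illustrative; src.size() == factor1 * factor2 is assumed.
#include <cstddef>
#include <vector>

std::vector<std::vector<float>> permute_fc_weight_rows(const std::vector<std::vector<float>> &src,
                                                       std::size_t factor1,
                                                       std::size_t factor2)
{
    std::vector<std::vector<float>> dst(src.size());
    for (std::size_t y = 0; y < src.size(); ++y)
    {
        dst[(y % factor1) * factor2 + y / factor1] = src[y]; // same remapping as the memcpy offset above
    }
    return dst;
}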
\ No newline at end of file +#endif /* ARM_COMPUTE_CPU_CONVERT_FULLYCONNECTED_WEIGHTS_KERNEL_H */ diff --git a/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.cpp b/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.cpp index 1005d001ab..745b1566c2 100644 --- a/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.cpp +++ b/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.cpp @@ -29,9 +29,10 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "src/core/NEON/wrapper/wrapper.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute { @@ -47,7 +48,7 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); // Validate output if initialized - if(dst->total_size() != 0) + if (dst->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(src->tensor_shape(), dst->tensor_shape()); @@ -60,11 +61,11 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src, { // Output auto inizialitation if not yet initialized { - const bool is_input_signed = src->data_type() == DataType::QASYMM8_SIGNED; - const DataType dt = is_input_signed ? DataType::QASYMM8 : DataType::QASYMM8_SIGNED; - const UniformQuantizationInfo qinfo = src->quantization_info().uniform(); + const bool is_input_signed = src->data_type() == DataType::QASYMM8_SIGNED; + const DataType dt = is_input_signed ? DataType::QASYMM8 : DataType::QASYMM8_SIGNED; + const UniformQuantizationInfo qinfo = src->quantization_info().uniform(); const int offset_correction = is_input_signed ? 
-128 : 128; - const QuantizationInfo corrected_qinfo = QuantizationInfo(qinfo.scale, qinfo.offset + offset_correction); + const QuantizationInfo corrected_qinfo = QuantizationInfo(qinfo.scale, qinfo.offset + offset_correction); auto_init_if_empty(*dst, src->clone()->set_data_type(dt).set_quantization_info(corrected_qinfo)); } @@ -110,27 +111,29 @@ void CpuConvertQuantizedSignednessKernel::run_op(ITensorPack &tensors, const Win const uint8_t mask = 128; const auto vmask = wrapper::vdup_n(mask, wrapper::traits::vector_128_tag{}); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); - const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto vin = wrapper::vloadq(input_ptr + x); - wrapper::vstore(output_ptr + x, wrapper::veor(vin, vmask)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { - const uint8_t in = *(reinterpret_cast<const uint8_t *>(input_ptr + x)); - *(output_ptr + x) = in ^ mask; - } - }, - input, output); + const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr()); + + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vin = wrapper::vloadq(input_ptr + x); + wrapper::vstore(output_ptr + x, wrapper::veor(vin, vmask)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const uint8_t in = *(reinterpret_cast<const uint8_t *>(input_ptr + x)); + *(output_ptr + x) = in ^ mask; + } + }, + input, output); } const char *CpuConvertQuantizedSignednessKernel::name() const diff --git a/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h b/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h index b5eaf65487..e94d3d5ef2 100644 --- a/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h +++ b/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h @@ -54,7 +54,7 @@ public: static Status validate(const ITensorInfo *src, const ITensorInfo *dst); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; }; } // namespace kernels diff --git a/src/cpu/kernels/CpuCopyKernel.cpp b/src/cpu/kernels/CpuCopyKernel.cpp index 3f0f3fe422..1b693d7a3a 100644 --- a/src/cpu/kernels/CpuCopyKernel.cpp +++ b/src/cpu/kernels/CpuCopyKernel.cpp @@ -27,9 +27,10 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -48,9 +49,10 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const ARM_COMPUTE_RETURN_ERROR_ON(padding.size() > 4); // Validate destination if initialized - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - 
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(misc::shape_calculator::compute_padded_shape(src->tensor_shape(), padding), dst->tensor_shape()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS( + misc::shape_calculator::compute_padded_shape(src->tensor_shape(), padding), dst->tensor_shape()); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); } @@ -64,7 +66,8 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src, return std::make_pair(Status{}, calculate_max_window(*dst)); } -std::pair<Status, Window> validate_and_configure_window_with_padding(const ITensorInfo *src, ITensorInfo *dst, const PaddingList &padding) +std::pair<Status, Window> +validate_and_configure_window_with_padding(const ITensorInfo *src, ITensorInfo *dst, const PaddingList &padding) { const TensorShape src_shape = src->tensor_shape(); const TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(src_shape, padding); @@ -84,7 +87,7 @@ void CpuCopyKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const Pa _padding = padding; std::pair<Status, Window> win_config; - if(padding.empty()) + if (padding.empty()) { win_config = validate_and_configure_window(src, dst); } @@ -97,17 +100,20 @@ void CpuCopyKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const Pa ICpuKernel::configure(win_config.second); } -Status CpuCopyKernel::validate(const arm_compute::ITensorInfo *src, const arm_compute::ITensorInfo *dst, const PaddingList &padding) +Status CpuCopyKernel::validate(const arm_compute::ITensorInfo *src, + const arm_compute::ITensorInfo *dst, + const PaddingList &padding) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, padding)); - if(padding.empty()) + if (padding.empty()) { ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get()).first); } else { - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_with_padding(src->clone().get(), dst->clone().get(), padding).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window_with_padding(src->clone().get(), dst->clone().get(), padding).first); } return Status{}; @@ -122,38 +128,41 @@ void CpuCopyKernel::run_op(ITensorPack &tensors, const Window &window, const Thr const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); auto dst = tensors.get_tensor(TensorType::ACL_DST); - if(_padding.empty()) + if (_padding.empty()) { - Window dst_window{ window }; - dst_window.set(Window::DimX, Window::Dimension(dst_window.x().start(), dst_window.x().end(), src->info()->dimension(0))); + Window dst_window{window}; + dst_window.set(Window::DimX, + Window::Dimension(dst_window.x().start(), dst_window.x().end(), src->info()->dimension(0))); Window out_slice = dst_window.first_slice_window_1D(); do { Iterator src_it(src, out_slice); Iterator dst_it(dst, out_slice); - execute_window_loop(out_slice, [&](const Coordinates &) - { - memcpy(dst_it.ptr(), src_it.ptr(), dst->info()->dimension(0) * dst->info()->element_size()); - }, - src_it, dst_it); - } - while(dst_window.slide_window_slice_1D(out_slice)); + execute_window_loop( + out_slice, + [&](const Coordinates &) + { memcpy(dst_it.ptr(), src_it.ptr(), dst->info()->dimension(0) * dst->info()->element_size()); }, + src_it, dst_it); + } while (dst_window.slide_window_slice_1D(out_slice)); } else { - Window src_window{ window }; - src_window.set(Window::DimX, Window::Dimension(0, window.x().end() - _padding[0].first, src->info()->dimension(0))); + Window src_window{window}; + src_window.set(Window::DimX, 
+ Window::Dimension(0, window.x().end() - _padding[0].first, src->info()->dimension(0))); Iterator src_it(src, src_window); Iterator dst_it(dst, window); const size_t row_size_in_bytes = src->info()->dimension(0) * src->info()->element_size(); - execute_window_loop(window, [&](const Coordinates &) - { - auto dst_ptr = dst_it.ptr() + _padding[0].first * dst->info()->element_size(); - std::memcpy(dst_ptr, src_it.ptr(), row_size_in_bytes); - }, - src_it, dst_it); + execute_window_loop( + window, + [&](const Coordinates &) + { + auto dst_ptr = dst_it.ptr() + _padding[0].first * dst->info()->element_size(); + std::memcpy(dst_ptr, src_it.ptr(), row_size_in_bytes); + }, + src_it, dst_it); } } diff --git a/src/cpu/kernels/CpuCopyKernel.h b/src/cpu/kernels/CpuCopyKernel.h index c9ef8eba76..a05053f07e 100644 --- a/src/cpu/kernels/CpuCopyKernel.h +++ b/src/cpu/kernels/CpuCopyKernel.h @@ -55,7 +55,7 @@ public: static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PaddingList &padding = PaddingList()); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; private: diff --git a/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp b/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp index d6c56d2012..82e3a5ce00 100644 --- a/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp +++ b/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp @@ -26,11 +26,12 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/ITensorInfo.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/CPP/Validate.h" -#include "src/core/NEON/wrapper/traits.h" + #include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/traits.h" #include "src/cpu/kernels/depthwiseconv2d/list.h" namespace arm_compute @@ -41,72 +42,53 @@ namespace kernels { namespace { -static const std::vector<CpuDepthwiseConv2dNativeKernel::DepthwiseConv2dNativeKernel> available_kernels = -{ - { - "neon_qu8_deptwiseconv2dnative", - [](const DepthwiseConv2dNativeDataTypeISASelectorData & data) - { - return (data.weights_dt == DataType::QASYMM8); - }, - REGISTER_QASYMM8_NEON(neon_qu8_deptwiseconv2dnative) - }, - { - "neon_qs8_deptwiseconv2dnative", - [](const DepthwiseConv2dNativeDataTypeISASelectorData & data) - { - return (data.weights_dt == DataType::QASYMM8_SIGNED); - }, - REGISTER_QASYMM8_SIGNED_NEON(neon_qs8_deptwiseconv2dnative) - }, - { - "neon_fp16_deptwiseconv2dnative", - [](const DepthwiseConv2dNativeDataTypeISASelectorData & data) - { - return (data.weights_dt == DataType::F16 && data.isa.fp16); - }, - REGISTER_FP16_NEON(neon_fp16_deptwiseconv2dnative) - }, - { - "neon_fp32_deptwiseconv2dnative", - [](const DepthwiseConv2dNativeDataTypeISASelectorData & data) - { - return (data.weights_dt == DataType::F32); - }, - REGISTER_FP32_NEON(neon_fp32_deptwiseconv2dnative) - }, - { - "neon_qp8_qu8_deptwiseconv2dnative", - [](const DepthwiseConv2dNativeDataTypeISASelectorData & data) - { - return (data.weights_dt == DataType::QSYMM8_PER_CHANNEL && data.source_dt == DataType::QASYMM8); - }, - REGISTER_QASYMM8_NEON(neon_qp8_qu8_deptwiseconv2dnative) - }, - { - "neon_qp8_qs8_deptwiseconv2dnative", - [](const DepthwiseConv2dNativeDataTypeISASelectorData & data) - { - return 
(data.weights_dt == DataType::QSYMM8_PER_CHANNEL && data.source_dt != DataType::QASYMM8); - }, - REGISTER_QASYMM8_SIGNED_NEON(neon_qp8_qs8_deptwiseconv2dnative) - }, +static const std::vector<CpuDepthwiseConv2dNativeKernel::DepthwiseConv2dNativeKernel> available_kernels = { + {"neon_qu8_deptwiseconv2dnative", + [](const DepthwiseConv2dNativeDataTypeISASelectorData &data) { return (data.weights_dt == DataType::QASYMM8); }, + REGISTER_QASYMM8_NEON(neon_qu8_deptwiseconv2dnative)}, + {"neon_qs8_deptwiseconv2dnative", + [](const DepthwiseConv2dNativeDataTypeISASelectorData &data) + { return (data.weights_dt == DataType::QASYMM8_SIGNED); }, + REGISTER_QASYMM8_SIGNED_NEON(neon_qs8_deptwiseconv2dnative)}, + {"neon_fp16_deptwiseconv2dnative", + [](const DepthwiseConv2dNativeDataTypeISASelectorData &data) + { return (data.weights_dt == DataType::F16 && data.isa.fp16); }, + REGISTER_FP16_NEON(neon_fp16_deptwiseconv2dnative)}, + {"neon_fp32_deptwiseconv2dnative", + [](const DepthwiseConv2dNativeDataTypeISASelectorData &data) { return (data.weights_dt == DataType::F32); }, + REGISTER_FP32_NEON(neon_fp32_deptwiseconv2dnative)}, + {"neon_qp8_qu8_deptwiseconv2dnative", + [](const DepthwiseConv2dNativeDataTypeISASelectorData &data) + { return (data.weights_dt == DataType::QSYMM8_PER_CHANNEL && data.source_dt == DataType::QASYMM8); }, + REGISTER_QASYMM8_NEON(neon_qp8_qu8_deptwiseconv2dnative)}, + {"neon_qp8_qs8_deptwiseconv2dnative", + [](const DepthwiseConv2dNativeDataTypeISASelectorData &data) + { return (data.weights_dt == DataType::QSYMM8_PER_CHANNEL && data.source_dt != DataType::QASYMM8); }, + REGISTER_QASYMM8_SIGNED_NEON(neon_qp8_qs8_deptwiseconv2dnative)}, }; -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info) +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const ConvolutionInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(info.depth_multiplier == 0); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(1) + (weights->dimension(1) - 1) * (info.dilation.x() - 1) > src->dimension(1) + info.pad_stride_info.pad_left() + info.pad_stride_info.pad_right()); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) + (weights->dimension(2) - 1) * (info.dilation.y() - 1) > src->dimension(2) + info.pad_stride_info.pad_top() + info.pad_stride_info.pad_bottom()); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(1) + (weights->dimension(1) - 1) * (info.dilation.x() - 1) > + src->dimension(1) + info.pad_stride_info.pad_left() + info.pad_stride_info.pad_right()); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) + (weights->dimension(2) - 1) * (info.dilation.y() - 1) > + src->dimension(2) + info.pad_stride_info.pad_top() + info.pad_stride_info.pad_bottom()); ARM_COMPUTE_RETURN_ERROR_ON((src->dimension(0) * info.depth_multiplier) != weights->dimension(0)); ARM_COMPUTE_RETURN_ERROR_ON((info.dilation.x() < 1) || (info.dilation.y() < 1)); - 
ARM_COMPUTE_RETURN_ERROR_ON((info.pad_stride_info.stride().first < 1) || (info.pad_stride_info.stride().second < 1)); + ARM_COMPUTE_RETURN_ERROR_ON((info.pad_stride_info.stride().first < 1) || + (info.pad_stride_info.stride().second < 1)); - if(is_data_type_quantized_per_channel(weights->data_type())) + if (is_data_type_quantized_per_channel(weights->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->quantization_info().scale().size()); @@ -116,12 +98,12 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); } - if(biases != nullptr) + if (biases != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(0)); - if(is_data_type_quantized_asymmetric(src->data_type())) + if (is_data_type_quantized_asymmetric(src->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); } @@ -131,9 +113,10 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co } } - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info); + const TensorShape output_shape = + misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); } @@ -142,7 +125,11 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co } } // namespace -void CpuDepthwiseConv2dNativeKernel::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info) +void CpuDepthwiseConv2dNativeKernel::configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ConvolutionInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, (biases != nullptr) ? 
biases : nullptr, dst, info)); @@ -151,18 +138,26 @@ void CpuDepthwiseConv2dNativeKernel::configure(const ITensorInfo *src, const ITe _conv_info = info; const auto uk = CpuDepthwiseConv2dNativeKernel::get_implementation( - DepthwiseConv2dNativeDataTypeISASelectorData{ weights->data_type(), src->data_type(), CPUInfo::get().get_isa() }); + DepthwiseConv2dNativeDataTypeISASelectorData{weights->data_type(), src->data_type(), CPUInfo::get().get_isa()}); ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); _func = uk->ukernel; const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info); - auto_init_if_empty(*dst, src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_quantization_info(dst->quantization_info())); + auto_init_if_empty(*dst, src->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(output_shape) + .set_quantization_info(dst->quantization_info())); Window win = calculate_max_window(*dst, Steps()); ICpuKernel::configure(win); } -Status CpuDepthwiseConv2dNativeKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info) +Status CpuDepthwiseConv2dNativeKernel::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const ConvolutionInfo &info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, info)); return Status{}; @@ -187,7 +182,8 @@ const char *CpuDepthwiseConv2dNativeKernel::name() const return "CpuDepthwiseConv2dNativeKernel"; } -const std::vector<CpuDepthwiseConv2dNativeKernel::DepthwiseConv2dNativeKernel> &CpuDepthwiseConv2dNativeKernel::get_available_kernels() +const std::vector<CpuDepthwiseConv2dNativeKernel::DepthwiseConv2dNativeKernel> & +CpuDepthwiseConv2dNativeKernel::get_available_kernels() { return available_kernels; } diff --git a/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h b/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h index 9fabd0b01c..7e78f52e13 100644 --- a/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h +++ b/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h @@ -26,6 +26,7 @@ #include "arm_compute/core/utils/misc/Traits.h" #include "arm_compute/function_info/ConvolutionInfo.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" #include "support/AclRequires.h" @@ -44,8 +45,9 @@ namespace kernels class CpuDepthwiseConv2dNativeKernel : public ICpuKernel<CpuDepthwiseConv2dNativeKernel> { private: - using DepthwiseConv2dNativeKernelPtr = - std::add_pointer<void(const ITensor *, const ITensor *, const ITensor *, ITensor *, const Window &, bool, const ConvolutionInfo &)>::type; + using DepthwiseConv2dNativeKernelPtr = std::add_pointer<void( + const ITensor *, const ITensor *, const ITensor *, ITensor *, const Window &, bool, const ConvolutionInfo &)>:: + type; public: CpuDepthwiseConv2dNativeKernel() = default; @@ -64,17 +66,25 @@ public: * @param[in] info Depthwise convolution meta-data. 
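// A minimal sketch (not ACL code) of the selector-table dispatch shown in available_kernels
// above: each entry pairs a kernel name with a predicate over the weights/source data types,
// and the first matching entry supplies the micro-kernel pointer. The SelectorData fields,
// enum values, and kernel signature are simplified placeholders.
#include <functional>
#include <string>
#include <vector>

enum class DT { F32, F16, QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL };

struct SelectorData
{
    DT weights_dt;
    DT source_dt;
};

struct KernelEntry
{
    std::string                               name;
    std::function<bool(const SelectorData &)> is_selected;
    void (*ukernel)(/* tensors, window, ... */);
};

const KernelEntry *select_kernel(const std::vector<KernelEntry> &table, const SelectorData &data)
{
    for (const auto &entry : table)
    {
        if (entry.is_selected(data) && entry.ukernel != nullptr)
        {
            return &entry; // first match wins, mirroring get_implementation() in the kernels above
        }
    }
    return nullptr;
}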
* */ - void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info); + void configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ConvolutionInfo &info); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuDepthwiseConv2dNativeKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info); + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const ConvolutionInfo &info); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; struct DepthwiseConv2dNativeKernel { @@ -89,9 +99,9 @@ private: * * @param[in] window Region on which to execute the kernel. */ - DepthwiseConv2dNativeKernelPtr _func{ nullptr }; + DepthwiseConv2dNativeKernelPtr _func{nullptr}; ConvolutionInfo _conv_info{}; - bool _has_biases{ false }; + bool _has_biases{false}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuDequantizeKernel.cpp b/src/cpu/kernels/CpuDequantizeKernel.cpp index a2d24f9243..d17128b5ac 100644 --- a/src/cpu/kernels/CpuDequantizeKernel.cpp +++ b/src/cpu/kernels/CpuDequantizeKernel.cpp @@ -28,12 +28,13 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" + #include "src/core/CPP/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "src/core/NEON/NEAsymm.h" #include "src/core/NEON/NESymm.h" #include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" #include <arm_neon.h> @@ -48,9 +49,11 @@ namespace Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::QSYMM8, DataType::QSYMM16); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::QSYMM8_PER_CHANNEL, DataType::QSYMM8, + DataType::QSYMM16); - if(dst->tensor_shape().total_size() > 0) + if (dst->tensor_shape().total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(dst); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::F16, DataType::F32); @@ -124,28 +127,30 @@ void run_dequantization_qasymm8(const ITensor *input, ITensor *output, const Win Iterator in(input, win_collapsed); Iterator out(output, win_collapsed); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast<const TIn *>(in.ptr()); - const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { - const auto vin = wrapper::vloadq(in_ptr + x); - const auto vdeq = vdequantize(vin, scale, offset); + const auto in_ptr = reinterpret_cast<const TIn *>(in.ptr()); + const 
auto out_ptr = reinterpret_cast<TOut *>(out.ptr()); - store_result(reinterpret_cast<TOut *>(out_ptr + x), vdeq); - } + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vin = wrapper::vloadq(in_ptr + x); + const auto vdeq = vdequantize(vin, scale, offset); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - auto val = *(in_ptr + x); - *(out_ptr + x) = static_cast<TOut>(Qasymm8QuantizationHelper<TIn>::dequantize(val, qinfo)); - } - }, - in, out); + store_result(reinterpret_cast<TOut *>(out_ptr + x), vdeq); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + auto val = *(in_ptr + x); + *(out_ptr + x) = static_cast<TOut>(Qasymm8QuantizationHelper<TIn>::dequantize(val, qinfo)); + } + }, + in, out); } template <typename T> @@ -165,28 +170,30 @@ void run_dequantization_qsymm8_per_channel_nchw(const ITensor *input, ITensor *o Iterator in(input, win); Iterator out(output, win); - execute_window_loop(win, [&](const Coordinates & id) - { - const auto in_ptr = reinterpret_cast<const int8_t *>(in.ptr()); - const auto out_ptr = reinterpret_cast<T *>(out.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &id) { - const auto vin = wrapper::vloadq(in_ptr + x); - const auto vdeq = vdequantize(vin, scale[id.z()]); + const auto in_ptr = reinterpret_cast<const int8_t *>(in.ptr()); + const auto out_ptr = reinterpret_cast<T *>(out.ptr()); - store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq); - } + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vin = wrapper::vloadq(in_ptr + x); + const auto vdeq = vdequantize(vin, scale[id.z()]); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - int8_t val = *(in_ptr + x); - *(out_ptr + x) = static_cast<T>(dequantize(val, scale[id.z()])); - } - }, - in, out); + store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + int8_t val = *(in_ptr + x); + *(out_ptr + x) = static_cast<T>(dequantize(val, scale[id.z()])); + } + }, + in, out); } template <typename T> @@ -206,37 +213,34 @@ void run_dequantization_qsymm8_per_channel_nhwc(const ITensor *input, ITensor *o Iterator in(input, win); Iterator out(output, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast<const int8_t *>(in.ptr()); - const auto out_ptr = reinterpret_cast<T *>(out.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const float32x4x4_t vscale = + const auto in_ptr = reinterpret_cast<const int8_t *>(in.ptr()); + const auto out_ptr = reinterpret_cast<T *>(out.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - scale[x + 0], scale[x + 1], scale[x + 2], scale[x + 3], - scale[x + 4], scale[x + 5], scale[x + 6], scale[x + 7], - scale[x + 8], scale[x + 9], scale[x + 10], scale[x + 11], - scale[x + 12], scale[x + 13], scale[x + 14], scale[x + 15] - } - }; - const auto vin = wrapper::vloadq(in_ptr + x); - const auto vdeq = vdequantize(vin, vscale); - - store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - int8_t val = *(in_ptr + x); - *(out_ptr + x) = 
static_cast<T>(dequantize(val, scale[x])); - } - }, - in, out); + const float32x4x4_t vscale = {{scale[x + 0], scale[x + 1], scale[x + 2], scale[x + 3], scale[x + 4], + scale[x + 5], scale[x + 6], scale[x + 7], scale[x + 8], scale[x + 9], + scale[x + 10], scale[x + 11], scale[x + 12], scale[x + 13], + scale[x + 14], scale[x + 15]}}; + const auto vin = wrapper::vloadq(in_ptr + x); + const auto vdeq = vdequantize(vin, vscale); + + store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + int8_t val = *(in_ptr + x); + *(out_ptr + x) = static_cast<T>(dequantize(val, scale[x])); + } + }, + in, out); } template <typename T> @@ -257,28 +261,30 @@ void run_dequantization_qsymm8(const ITensor *input, ITensor *output, const Wind Iterator in(input, win_collapsed); Iterator out(output, win_collapsed); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast<const int8_t *>(in.ptr()); - const auto out_ptr = reinterpret_cast<T *>(out.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { - const auto vin = wrapper::vloadq(in_ptr + x); - const auto vdeq = vdequantize(vin, scale); + const auto in_ptr = reinterpret_cast<const int8_t *>(in.ptr()); + const auto out_ptr = reinterpret_cast<T *>(out.ptr()); - store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq); - } + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vin = wrapper::vloadq(in_ptr + x); + const auto vdeq = vdequantize(vin, scale); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - int8_t val = *(in_ptr + x); - *(out_ptr + x) = static_cast<T>(dequantize(val, scale)); - } - }, - in, out); + store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + int8_t val = *(in_ptr + x); + *(out_ptr + x) = static_cast<T>(dequantize(val, scale)); + } + }, + in, out); } template <typename T> @@ -299,34 +305,36 @@ void run_dequantization_qsymm16(const ITensor *input, ITensor *output, const Win Iterator in(input, win_collapsed); Iterator out(output, win_collapsed); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast<const int16_t *>(in.ptr()); - const auto out_ptr = reinterpret_cast<T *>(out.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { - const auto vin = wrapper::vloadq(in_ptr + x); - const auto vdeq = vdequantize_int16(vin, scale); + const auto in_ptr = reinterpret_cast<const int16_t *>(in.ptr()); + const auto out_ptr = reinterpret_cast<T *>(out.ptr()); - store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq); - } + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vin = wrapper::vloadq(in_ptr + x); + const auto vdeq = vdequantize_int16(vin, scale); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - int16_t val = *(in_ptr + x); - *(out_ptr + x) = static_cast<T>(dequantize_qsymm16(val, scale)); - } - }, - in, out); + store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + int16_t val = *(in_ptr + x); + *(out_ptr + x) = 
static_cast<T>(dequantize_qsymm16(val, scale)); + } + }, + in, out); } template <typename T> void run_dequantization_core(const ITensor *input, ITensor *output, const Window &window) { - switch(input->info()->data_type()) + switch (input->info()->data_type()) { case DataType::QASYMM8: run_dequantization_qasymm8<T, uint8_t>(input, output, window); @@ -335,7 +343,9 @@ void run_dequantization_core(const ITensor *input, ITensor *output, const Window run_dequantization_qasymm8<T, int8_t>(input, output, window); break; case DataType::QSYMM8_PER_CHANNEL: - input->info()->data_layout() == DataLayout::NHWC ? run_dequantization_qsymm8_per_channel_nhwc<T>(input, output, window) : run_dequantization_qsymm8_per_channel_nchw<T>(input, output, window); + input->info()->data_layout() == DataLayout::NHWC + ? run_dequantization_qsymm8_per_channel_nhwc<T>(input, output, window) + : run_dequantization_qsymm8_per_channel_nchw<T>(input, output, window); break; case DataType::QSYMM8: run_dequantization_qsymm8<T>(input, output, window); @@ -377,7 +387,7 @@ void CpuDequantizeKernel::run_op(ITensorPack &tensors, const Window &window, con const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); auto dst = tensors.get_tensor(TensorType::ACL_DST); - switch(dst->info()->data_type()) + switch (dst->info()->data_type()) { case DataType::F32: run_dequantization_core<float>(src, dst, window); diff --git a/src/cpu/kernels/CpuDequantizeKernel.h b/src/cpu/kernels/CpuDequantizeKernel.h index cfa991dc74..6ed58587c9 100644 --- a/src/cpu/kernels/CpuDequantizeKernel.h +++ b/src/cpu/kernels/CpuDequantizeKernel.h @@ -54,7 +54,7 @@ public: static Status validate(const ITensorInfo *src, const ITensorInfo *dst); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; }; } // namespace kernels diff --git a/src/cpu/kernels/CpuDirectConv2dKernel.cpp b/src/cpu/kernels/CpuDirectConv2dKernel.cpp index a4cdddee5e..4cb0fb1c40 100644 --- a/src/cpu/kernels/CpuDirectConv2dKernel.cpp +++ b/src/cpu/kernels/CpuDirectConv2dKernel.cpp @@ -22,13 +22,14 @@ * SOFTWARE. 
*/ #include "src/cpu/kernels/CpuDirectConv2dKernel.h" -#include "src/cpu/kernels/directconv2d/list.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" + #include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/cpu/kernels/directconv2d/list.h" using namespace arm_compute::detail; @@ -38,26 +39,25 @@ namespace cpu { namespace kernels { -static const std::vector<CpuDirectConv2dKernel::DirectConv2dKernel> available_kernels = -{ - { - "neon_fp32_nhwc_directconv2d", - [](const DataTypeDataLayoutISASelectorData & data) { return data.dt == DataType::F32 && data.dl == DataLayout::NHWC; }, - REGISTER_FP32_NEON(arm_compute::cpu::kernels::neon_fp32_nhwc_directconv2d) - }, - { - "neon_fp32_nchw_directconv2d", - [](const DataTypeDataLayoutISASelectorData & data) { return data.dt == DataType::F32 && data.dl == DataLayout::NCHW; }, - REGISTER_FP32_NEON(arm_compute::cpu::kernels::neon_fp32_nchw_directconv2d) - }, - { - "neon_fp16_nchw_directconv2d", - [](const DataTypeDataLayoutISASelectorData & data) { return data.dt == DataType::F16 && data.dl == DataLayout::NCHW && data.isa.fp16; }, - REGISTER_FP16_NEON(arm_compute::cpu::kernels::neon_fp16_nchw_directconv2d) - }, +static const std::vector<CpuDirectConv2dKernel::DirectConv2dKernel> available_kernels = { + {"neon_fp32_nhwc_directconv2d", + [](const DataTypeDataLayoutISASelectorData &data) + { return data.dt == DataType::F32 && data.dl == DataLayout::NHWC; }, + REGISTER_FP32_NEON(arm_compute::cpu::kernels::neon_fp32_nhwc_directconv2d)}, + {"neon_fp32_nchw_directconv2d", + [](const DataTypeDataLayoutISASelectorData &data) + { return data.dt == DataType::F32 && data.dl == DataLayout::NCHW; }, + REGISTER_FP32_NEON(arm_compute::cpu::kernels::neon_fp32_nchw_directconv2d)}, + {"neon_fp16_nchw_directconv2d", + [](const DataTypeDataLayoutISASelectorData &data) + { return data.dt == DataType::F16 && data.dl == DataLayout::NCHW && data.isa.fp16; }, + REGISTER_FP16_NEON(arm_compute::cpu::kernels::neon_fp16_nchw_directconv2d)}, }; -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info) +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const PadStrideInfo &conv_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN); @@ -76,7 +76,7 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co ARM_COMPUTE_RETURN_ERROR_ON(data_layout == DataLayout::NHWC && src->data_type() != DataType::F32); ARM_COMPUTE_UNUSED(width_idx); // Checks performed when output is configured - if(dst->total_size() != 0) + if (dst->total_size() != 0) { TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info); @@ -100,11 +100,15 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITenso // Configure window without any padding win = calculate_max_window(*dst, Steps()); - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, win); } -void CpuDirectConv2dKernel::configure(ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, const PadStrideInfo &conv_info) +void CpuDirectConv2dKernel::configure(ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *dst, + const PadStrideInfo &conv_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); @@ -129,12 +133,13 @@ void CpuDirectConv2dKernel::configure(ITensorInfo *src, ITensorInfo *weights, IT ICpuKernel::configure(win_config.second); } -Status CpuDirectConv2dKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info) +Status CpuDirectConv2dKernel::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const PadStrideInfo &conv_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, dst, conv_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), - dst->clone().get()) - .first); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get()).first); return Status{}; } @@ -149,7 +154,8 @@ void CpuDirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, c auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); auto dst = tensors.get_tensor(TensorType::ACL_DST); - const auto *uk = CpuDirectConv2dKernel::get_implementation(DataTypeDataLayoutISASelectorData{ src->info()->data_type(), _data_layout, CPUInfo::get().get_isa() }); + const auto *uk = CpuDirectConv2dKernel::get_implementation( + DataTypeDataLayoutISASelectorData{src->info()->data_type(), _data_layout, CPUInfo::get().get_isa()}); ARM_COMPUTE_ERROR_ON(uk == nullptr); uk->ukernel(window, src, weights, dst, _conv_info); diff --git a/src/cpu/kernels/CpuDirectConv2dKernel.h b/src/cpu/kernels/CpuDirectConv2dKernel.h index b9265dc630..ad4caea193 100644 --- a/src/cpu/kernels/CpuDirectConv2dKernel.h +++ b/src/cpu/kernels/CpuDirectConv2dKernel.h @@ -37,7 +37,8 @@ namespace kernels class CpuDirectConv2dKernel : public ICpuKernel<CpuDirectConv2dKernel> { private: - using DirectConv2dKernel_Ptr = std::add_pointer<void(const Window &, const ITensor *, const ITensor *, ITensor *, const PadStrideInfo &)>::type; + using DirectConv2dKernel_Ptr = std::add_pointer<void( + const Window &, const ITensor *, const ITensor *, ITensor *, const PadStrideInfo &)>::type; public: CpuDirectConv2dKernel() = default; @@ -64,10 +65,13 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info); + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const PadStrideInfo &conv_info); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; struct DirectConv2dKernel @@ -81,8 +85,8 @@ public: private: PadStrideInfo _conv_info{}; - unsigned int _kernel_size{ 0 }; - DataLayout _data_layout{ DataLayout::UNKNOWN }; + unsigned int _kernel_size{0}; + DataLayout _data_layout{DataLayout::UNKNOWN}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp b/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp index 93ad5e5eba..d4af8bedaf 
100644 --- a/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp +++ b/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp @@ -27,15 +27,16 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/Traits.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/Traits.h" + #include "src/core/CPP/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "src/core/NEON/NEAsymm.h" #include "src/core/NEON/NEFixedPoint.h" #include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" #include <arm_neon.h> #include <cstddef> @@ -49,7 +50,9 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, const DirectConvolutionLayerOutputStageKernelInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); @@ -57,22 +60,23 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::S32, DataType::F32); - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); - ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != src->dimension(get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL))); + ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != src->dimension(get_data_layout_dimension_index( + src->data_layout(), DataLayoutDimension::CHANNEL))); ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); } - if(src->data_type() == DataType::S32) + if (src->data_type() == DataType::S32) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst == nullptr, "In-place computation not allowed for quantized output"); } // Checks performed when output is configured - if((dst != nullptr) && (dst->total_size() != 0)) + if ((dst != nullptr) && (dst->total_size() != 0)) { - if(is_data_type_float(src->data_type())) + if (is_data_type_float(src->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); } @@ -82,10 +86,11 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const } ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); } - else if(src->data_type() == DataType::S32) + else if (src->data_type() == DataType::S32) { // In case of quantized computation and unconfigured output, the output data type must be provided through DirectConvolutionLayerOutputStageKernelInfo - ARM_COMPUTE_RETURN_ERROR_ON((info.output_data_type != DataType::QASYMM8) && (info.output_data_type != DataType::QASYMM8_SIGNED)); + ARM_COMPUTE_RETURN_ERROR_ON((info.output_data_type != DataType::QASYMM8) && + (info.output_data_type != DataType::QASYMM8_SIGNED)); } return Status{}; @@ -93,8 +98,13 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const template <typename T> typename std::enable_if<arm_compute::utils::traits::is_floating_point<T>::value, void>::type -output_stage_nchw(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift) +output_stage_nchw(ITensor *src, + const ITensor 
*bias, + const Window &window, + ITensor *dst, + int result_fixedpoint_multiplier, + int result_shift, + int result_offset_after_shift) { const bool has_bias = bias != nullptr; /** SIMD vector tag type. */ @@ -113,50 +123,57 @@ output_stage_nchw(ITensor *src, const ITensor *bias, const Window &window, ITens Iterator in(src, win); Iterator out(dst, win); - execute_window_loop(win, [&](const Coordinates & id) - { - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &id) { - // Get bias and pointer to input - const auto in_ptr = reinterpret_cast<const T *>(in.ptr()) + x; - auto v_in = wrapper::vloadq(in_ptr); - - // Accumulate bias - if(has_bias) + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - const auto vb = wrapper::vdup_n(*reinterpret_cast<const T *>(bias->ptr_to_element(Coordinates(id.z()))), ExactTagType{}); - v_in = wrapper::vadd(v_in, vb); - } + // Get bias and pointer to input + const auto in_ptr = reinterpret_cast<const T *>(in.ptr()) + x; + auto v_in = wrapper::vloadq(in_ptr); - const auto out_ptr = reinterpret_cast<T *>(out.ptr()) + x; - wrapper::vstore(out_ptr, v_in); - } + // Accumulate bias + if (has_bias) + { + const auto vb = wrapper::vdup_n( + *reinterpret_cast<const T *>(bias->ptr_to_element(Coordinates(id.z()))), ExactTagType{}); + v_in = wrapper::vadd(v_in, vb); + } - // Left-overs loop - for(; x < window_end_x; ++x) - { - // Get bias and pointer to input - auto s_in = *(reinterpret_cast<const T *>(in.ptr()) + x); + const auto out_ptr = reinterpret_cast<T *>(out.ptr()) + x; + wrapper::vstore(out_ptr, v_in); + } - // Accumulate bias - if(has_bias) + // Left-overs loop + for (; x < window_end_x; ++x) { - const auto b = *reinterpret_cast<const T *>(bias->ptr_to_element(Coordinates(id.z()))); - s_in += b; - } + // Get bias and pointer to input + auto s_in = *(reinterpret_cast<const T *>(in.ptr()) + x); - *(reinterpret_cast<T *>(out.ptr()) + x) = s_in; - } + // Accumulate bias + if (has_bias) + { + const auto b = *reinterpret_cast<const T *>(bias->ptr_to_element(Coordinates(id.z()))); + s_in += b; + } - }, - in, out); + *(reinterpret_cast<T *>(out.ptr()) + x) = s_in; + } + }, + in, out); } template <typename T> typename std::enable_if<arm_compute::utils::traits::is_floating_point<T>::value, void>::type -output_stage_nhwc(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift) +output_stage_nhwc(ITensor *src, + const ITensor *bias, + const Window &window, + ITensor *dst, + int result_fixedpoint_multiplier, + int result_shift, + int result_offset_after_shift) { const bool has_bias = bias != nullptr; ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier); @@ -179,50 +196,59 @@ output_stage_nhwc(ITensor *src, const ITensor *bias, const Window &window, ITens Iterator bi(bias, window_bias); Iterator out(dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - // Get bias and pointer to input - const auto in_ptr = reinterpret_cast<const T *>(in.ptr()); - auto v_in = wrapper::vloadq(in_ptr + x); - - // Accumulate bias - if(has_bias) + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - const auto bias_ptr = reinterpret_cast<T *>(bi.ptr()) + x; - v_in = 
wrapper::vadd(v_in, wrapper::vloadq(bias_ptr)); - } + // Get bias and pointer to input + const auto in_ptr = reinterpret_cast<const T *>(in.ptr()); + auto v_in = wrapper::vloadq(in_ptr + x); - const auto out_ptr = reinterpret_cast<T *>(out.ptr()); - wrapper::vstore(out_ptr + x, v_in); - } + // Accumulate bias + if (has_bias) + { + const auto bias_ptr = reinterpret_cast<T *>(bi.ptr()) + x; + v_in = wrapper::vadd(v_in, wrapper::vloadq(bias_ptr)); + } - // Left-overs loop - for(; x < window_end_x; ++x) - { - // Get bias and pointer to input - auto s_in = *(reinterpret_cast<const T *>(in.ptr()) + x); + const auto out_ptr = reinterpret_cast<T *>(out.ptr()); + wrapper::vstore(out_ptr + x, v_in); + } - // Accumulate bias - if(has_bias) + // Left-overs loop + for (; x < window_end_x; ++x) { - const auto bias_ptr = reinterpret_cast<T *>(bi.ptr()) + x; - s_in += *bias_ptr; - } + // Get bias and pointer to input + auto s_in = *(reinterpret_cast<const T *>(in.ptr()) + x); - const auto out_ptr = reinterpret_cast<T *>(out.ptr()); - *(out_ptr + x) = s_in; - } - }, - in, bi, out); + // Accumulate bias + if (has_bias) + { + const auto bias_ptr = reinterpret_cast<T *>(bi.ptr()) + x; + s_in += *bias_ptr; + } + + const auto out_ptr = reinterpret_cast<T *>(out.ptr()); + *(out_ptr + x) = s_in; + } + }, + in, bi, out); } // Quantized case -template < typename TOut, typename std::enable_if < std::is_same<TOut, uint8_t>::value || std::is_same<TOut, int8_t>::value, int >::type = 0 > -void output_stage_nchw(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift) +template < + typename TOut, + typename std::enable_if<std::is_same<TOut, uint8_t>::value || std::is_same<TOut, int8_t>::value, int>::type = 0> +void output_stage_nchw(ITensor *src, + const ITensor *bias, + const Window &window, + ITensor *dst, + int result_fixedpoint_multiplier, + int result_shift, + int result_offset_after_shift) { const bool has_bias = bias != nullptr; using VectorType = typename wrapper::traits::neon_bitvector_t<TOut, wrapper::traits::BitWidth::W128>; @@ -242,67 +268,63 @@ void output_stage_nchw(ITensor *src, const ITensor *bias, const Window &window, Iterator in(src, win); Iterator out(dst, win); - execute_window_loop(win, [&](const Coordinates & id) - { - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &id) { - // Get bias and pointer to input - const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr()) + x; - int32x4x4_t v_in = + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { + // Get bias and pointer to input + const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr()) + x; + int32x4x4_t v_in = {{wrapper::vloadq(in_ptr), wrapper::vloadq(in_ptr + 4), wrapper::vloadq(in_ptr + 8), + wrapper::vloadq(in_ptr + 12)}}; + + // Accumulate bias + if (has_bias) { - wrapper::vloadq(in_ptr), - wrapper::vloadq(in_ptr + 4), - wrapper::vloadq(in_ptr + 8), - wrapper::vloadq(in_ptr + 12) + const auto vb = wrapper::vdup_n( + *reinterpret_cast<const int32_t *>(bias->ptr_to_element(Coordinates(id.z()))), TagType{}); + v_in = {{wrapper::vadd(v_in.val[0], vb), wrapper::vadd(v_in.val[1], vb), + wrapper::vadd(v_in.val[2], vb), wrapper::vadd(v_in.val[3], vb)}}; } - }; - // Accumulate bias - if(has_bias) - { - const auto vb = wrapper::vdup_n(*reinterpret_cast<const int32_t *>(bias->ptr_to_element(Coordinates(id.z()))), 
TagType{}); - v_in = - { - { - wrapper::vadd(v_in.val[0], vb), - wrapper::vadd(v_in.val[1], vb), - wrapper::vadd(v_in.val[2], vb), - wrapper::vadd(v_in.val[3], vb) - } - }; + const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x; + wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, + result_offset_after_shift_s32, min, max, false)); } - const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x; - wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, - min, max, false)); - } + // Left-overs loop + for (; x < window_end_x; ++x) + { + // Get bias and pointer to input + int32_t s_in = *(reinterpret_cast<const int32_t *>(in.ptr()) + x); - // Left-overs loop - for(; x < window_end_x; ++x) - { - // Get bias and pointer to input - int32_t s_in = *(reinterpret_cast<const int32_t *>(in.ptr()) + x); + // Accumulate bias + if (has_bias) + { + const auto b = *reinterpret_cast<const int32_t *>(bias->ptr_to_element(Coordinates(id.z()))); + s_in += b; + } - // Accumulate bias - if(has_bias) - { - const auto b = *reinterpret_cast<const int32_t *>(bias->ptr_to_element(Coordinates(id.z()))); - s_in += b; + const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x; + *out_ptr = + finalize_quantization(s_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, + std::numeric_limits<TOut>::lowest(), std::numeric_limits<TOut>::max(), false); } - - const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x; - *out_ptr = finalize_quantization(s_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, - std::numeric_limits<TOut>::lowest(), std::numeric_limits<TOut>::max(), false); - } - }, - in, out); + }, + in, out); } -template < typename TOut, typename std::enable_if < std::is_same<TOut, uint8_t>::value || std::is_same<TOut, int8_t>::value, int >::type = 0 > -void output_stage_nhwc(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift) +template < + typename TOut, + typename std::enable_if<std::is_same<TOut, uint8_t>::value || std::is_same<TOut, int8_t>::value, int>::type = 0> +void output_stage_nhwc(ITensor *src, + const ITensor *bias, + const Window &window, + ITensor *dst, + int result_fixedpoint_multiplier, + int result_shift, + int result_offset_after_shift) { const bool has_bias = bias != nullptr; using VectorType = typename wrapper::traits::neon_bitvector_t<TOut, wrapper::traits::BitWidth::W128>; @@ -329,62 +351,65 @@ void output_stage_nhwc(ITensor *src, const ITensor *bias, const Window &window, Iterator bi(bias, window_bias); Iterator out(dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - // Get bias and pointer to input - const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr()) + x; - int32x4x4_t v_in = + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { + // Get bias and pointer to input + const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr()) + x; + int32x4x4_t v_in = {{ + wrapper::vloadq(in_ptr), + wrapper::vloadq(in_ptr + 4), + wrapper::vloadq(in_ptr + 8), + wrapper::vloadq(in_ptr + 12), + }}; + + // Accumulate bias + if (has_bias) { - wrapper::vloadq(in_ptr), - wrapper::vloadq(in_ptr + 4), - wrapper::vloadq(in_ptr + 8), 
- wrapper::vloadq(in_ptr + 12), - } - }; + const auto bias_ptr = reinterpret_cast<int32_t *>(bi.ptr()) + x; - // Accumulate bias - if(has_bias) - { - const auto bias_ptr = reinterpret_cast<int32_t *>(bi.ptr()) + x; + wrapper::vadd(v_in.val[0], wrapper::vloadq(bias_ptr)); + wrapper::vadd(v_in.val[1], wrapper::vloadq(bias_ptr + 4)); + wrapper::vadd(v_in.val[2], wrapper::vloadq(bias_ptr + 8)); + wrapper::vadd(v_in.val[3], wrapper::vloadq(bias_ptr + 12)); + } - wrapper::vadd(v_in.val[0], wrapper::vloadq(bias_ptr)); - wrapper::vadd(v_in.val[1], wrapper::vloadq(bias_ptr + 4)); - wrapper::vadd(v_in.val[2], wrapper::vloadq(bias_ptr + 8)); - wrapper::vadd(v_in.val[3], wrapper::vloadq(bias_ptr + 12)); + const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x; + wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, + result_offset_after_shift_s32, min, max, false)); } - const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x; - wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max, false)); - } + // Left-overs loop + for (; x < window_end_x; ++x) + { + // Get bias and pointer to input + const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr()) + x; + int32_t s_in = *in_ptr; - // Left-overs loop - for(; x < window_end_x; ++x) - { - // Get bias and pointer to input - const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr()) + x; - int32_t s_in = *in_ptr; + // Accumulate bias + if (has_bias) + { + const auto bias_ptr = reinterpret_cast<int32_t *>(bi.ptr()) + x; + s_in += *bias_ptr; + } - // Accumulate bias - if(has_bias) - { - const auto bias_ptr = reinterpret_cast<int32_t *>(bi.ptr()) + x; - s_in += *bias_ptr; + const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x; + *out_ptr = + finalize_quantization(s_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, + std::numeric_limits<TOut>::lowest(), std::numeric_limits<TOut>::max(), false); } - - const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x; - *out_ptr = finalize_quantization(s_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, - std::numeric_limits<TOut>::lowest(), std::numeric_limits<TOut>::max(), false); - } - }, - in, bi, out); + }, + in, bi, out); } } // namespace -void CpuDirectConv2dOutputStageKernel::configure(ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, +void CpuDirectConv2dOutputStageKernel::configure(ITensorInfo *src, + const ITensorInfo *bias, + ITensorInfo *dst, const DirectConvolutionLayerOutputStageKernelInfo &info) { ARM_COMPUTE_UNUSED(bias); @@ -398,7 +423,7 @@ void CpuDirectConv2dOutputStageKernel::configure(ITensorInfo *src, const ITensor _result_offset_after_shift = info.result_offset_after_shift; // Auto-initialize output output if required - if(dst != nullptr) + if (dst != nullptr) { // Work out expected output data type const DataType output_dt = (src->data_type() == DataType::S32) ? info.output_data_type : DataType::S32; @@ -410,16 +435,17 @@ void CpuDirectConv2dOutputStageKernel::configure(ITensorInfo *src, const ITensor ICpuKernel::configure(win); - const bool is_qasymm8_signed = (dst != nullptr) ? is_data_type_quantized_asymmetric_signed(dst->data_type()) : false; + const bool is_qasymm8_signed = + (dst != nullptr) ? 
is_data_type_quantized_asymmetric_signed(dst->data_type()) : false; // Set appropriate function - if(src->data_layout() == DataLayout::NCHW) + if (src->data_layout() == DataLayout::NCHW) { - switch(src->data_type()) + switch (src->data_type()) { case DataType::S32: { - if(is_qasymm8_signed) + if (is_qasymm8_signed) { _func = &output_stage_nchw<int8_t>; } @@ -449,11 +475,11 @@ void CpuDirectConv2dOutputStageKernel::configure(ITensorInfo *src, const ITensor } else { - switch(src->data_type()) + switch (src->data_type()) { case DataType::S32: { - if(is_qasymm8_signed) + if (is_qasymm8_signed) { _func = &output_stage_nhwc<int8_t>; } @@ -483,7 +509,9 @@ void CpuDirectConv2dOutputStageKernel::configure(ITensorInfo *src, const ITensor } } -Status CpuDirectConv2dOutputStageKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, +Status CpuDirectConv2dOutputStageKernel::validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, const DirectConvolutionLayerOutputStageKernelInfo &info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, info)); diff --git a/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.h b/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.h index d3ef17b7c9..ce84f49cf6 100644 --- a/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.h +++ b/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_DIRECT_CONV2D_OUTPUT_STAGE_KERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" @@ -55,29 +56,40 @@ public: * Data type supported: F16/F32 or QASYMM8/QASYMM8_SIGNED if @p src is S32 * @param[in] info (Optional) DirectConvolutionLayerOutputStageKernel descriptor metadata */ - void configure(ITensorInfo *src, const ITensorInfo *bias = nullptr, ITensorInfo *dst = nullptr, - const DirectConvolutionLayerOutputStageKernelInfo &info = DirectConvolutionLayerOutputStageKernelInfo()); + void + configure(ITensorInfo *src, + const ITensorInfo *bias = nullptr, + ITensorInfo *dst = nullptr, + const DirectConvolutionLayerOutputStageKernelInfo &info = DirectConvolutionLayerOutputStageKernelInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuDirectConv2dOutputStageKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *bias = nullptr, const ITensorInfo *dst = nullptr, - const DirectConvolutionLayerOutputStageKernelInfo &info = DirectConvolutionLayerOutputStageKernelInfo()); + static Status + validate(const ITensorInfo *src, + const ITensorInfo *bias = nullptr, + const ITensorInfo *dst = nullptr, + const DirectConvolutionLayerOutputStageKernelInfo &info = DirectConvolutionLayerOutputStageKernelInfo()); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; private: - using OutputStageKernel = void(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift); + using OutputStageKernel = void(ITensor *src, + const ITensor *bias, + const Window &window, + ITensor *dst, + int result_fixedpoint_multiplier, + int result_shift, + int result_offset_after_shift); - OutputStageKernel *_func{ nullptr }; - int _result_fixedpoint_multiplier{ 0 }; - 
int _result_shift{ 0 }; - int _result_offset_after_shift{ 0 }; + OutputStageKernel *_func{nullptr}; + int _result_fixedpoint_multiplier{0}; + int _result_shift{0}; + int _result_offset_after_shift{0}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuDirectConv3dKernel.cpp b/src/cpu/kernels/CpuDirectConv3dKernel.cpp index 22c60cd994..b5b2aed1ba 100644 --- a/src/cpu/kernels/CpuDirectConv3dKernel.cpp +++ b/src/cpu/kernels/CpuDirectConv3dKernel.cpp @@ -29,12 +29,13 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/CPP/Validate.h" -#include "src/core/NEON/wrapper/wrapper.h" +#include "arm_compute/core/Validate.h" + #include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" +#include "src/core/NEON/wrapper/wrapper.h" #include "src/cpu/kernels/conv3d/neon/list.h" #include <algorithm> @@ -49,43 +50,37 @@ namespace kernels { namespace { -static const std::vector<CpuDirectConv3dKernel::DirectConv3dKernel> available_kernels = -{ +static const std::vector<CpuDirectConv3dKernel::DirectConv3dKernel> available_kernels = { #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - { - "neon_fp16_directconv3d", - [](const DataTypeISASelectorData & data) { return data.dt == DataType::F16 && data.isa.fp16; }, - REGISTER_FP16_NEON(arm_compute::cpu::directconv3d_float_neon_ndhwc<float16_t>) - }, + {"neon_fp16_directconv3d", + [](const DataTypeISASelectorData &data) { return data.dt == DataType::F16 && data.isa.fp16; }, + REGISTER_FP16_NEON(arm_compute::cpu::directconv3d_float_neon_ndhwc<float16_t>)}, #endif /* !defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ - { - "neon_fp32_directconv3d", - [](const DataTypeISASelectorData & data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON(arm_compute::cpu::directconv3d_float_neon_ndhwc<float>) - }, - { - "neon_qasymm8_directconv3d", - [](const DataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8; }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::directconv3d_quantized_neon_ndhwc<uint8_t>) - }, - { - "neon_qasymm8_signed_directconv3d", - [](const DataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::directconv3d_quantized_neon_ndhwc<int8_t>) - } -}; - -Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo &conv_info) + {"neon_fp32_directconv3d", [](const DataTypeISASelectorData &data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(arm_compute::cpu::directconv3d_float_neon_ndhwc<float>)}, + {"neon_qasymm8_directconv3d", [](const DataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8; }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::directconv3d_quantized_neon_ndhwc<uint8_t>)}, + {"neon_qasymm8_signed_directconv3d", + [](const DataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8_SIGNED; }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::directconv3d_quantized_neon_ndhwc<int8_t>)}}; + +Status validate_arguments(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const Conv3dInfo &conv_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); ARM_COMPUTE_RETURN_ERROR_ON(src0->data_layout() != DataLayout::NDHWC); 
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src0, src1, dst); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src0); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32, DataType::QASYMM8, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32, DataType::QASYMM8, + DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); ARM_COMPUTE_RETURN_ERROR_ON(conv_info.dilation != Size3D(1U, 1U, 1U)); - const auto *uk = CpuDirectConv3dKernel::get_implementation(DataTypeISASelectorData{ src0->data_type(), CPUInfo::get().get_isa() }); + const auto *uk = + CpuDirectConv3dKernel::get_implementation(DataTypeISASelectorData{src0->data_type(), CPUInfo::get().get_isa()}); ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); @@ -96,9 +91,9 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons ARM_COMPUTE_RETURN_ERROR_ON(src1->num_dimensions() > 5); ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(1) != src0->dimension(channel_idx)); - if(src2 != nullptr) + if (src2 != nullptr) { - if(is_data_type_quantized(src0->data_type())) + if (is_data_type_quantized(src0->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 1, DataType::S32); } @@ -106,14 +101,16 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2); } - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src2->dimension(0) != src1->dimension(0), "Biases size and number of dst feature maps should match"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src2->dimension(0) != src1->dimension(0), + "Biases size and number of dst feature maps should match"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(src2->num_dimensions() > 1, "Biases should be one dimensional"); } // Checks performed when output is configured - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - TensorShape output_shape = misc::shape_calculator::compute_conv3d_shape(src0->tensor_shape(), src1->tensor_shape(), conv_info); + TensorShape output_shape = + misc::shape_calculator::compute_conv3d_shape(src0->tensor_shape(), src1->tensor_shape(), conv_info); DataType data_type = src0->data_type(); @@ -125,12 +122,17 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons } } // namespace -void CpuDirectConv3dKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo &conv_info) +void CpuDirectConv3dKernel::configure(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + ITensorInfo *dst, + const Conv3dInfo &conv_info) { ARM_COMPUTE_UNUSED(src2); ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); - const auto *uk = CpuDirectConv3dKernel::get_implementation(DataTypeISASelectorData{ src0->data_type(), CPUInfo::get().get_isa() }); + const auto *uk = + CpuDirectConv3dKernel::get_implementation(DataTypeISASelectorData{src0->data_type(), CPUInfo::get().get_isa()}); ARM_COMPUTE_ERROR_ON_NULLPTR(uk); @@ -139,7 +141,8 @@ void CpuDirectConv3dKernel::configure(const ITensorInfo *src0, const ITensorInfo _name = std::string("CpuDirectConv3dKernel").append("/").append(uk->name); // Get convolved dimensions - TensorShape output_shape = misc::shape_calculator::compute_conv3d_shape(src0->tensor_shape(), src1->tensor_shape(), conv_info); + TensorShape output_shape = + misc::shape_calculator::compute_conv3d_shape(src0->tensor_shape(), 
src1->tensor_shape(), conv_info); DataType data_type = src0->data_type(); @@ -154,7 +157,11 @@ void CpuDirectConv3dKernel::configure(const ITensorInfo *src0, const ITensorInfo ICpuKernel::configure(win); } -Status CpuDirectConv3dKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo &conv_info) +Status CpuDirectConv3dKernel::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const Conv3dInfo &conv_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, conv_info)); @@ -188,4 +195,4 @@ const std::vector<CpuDirectConv3dKernel::DirectConv3dKernel> &CpuDirectConv3dKer } // namespace kernels } // namespace cpu -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuDirectConv3dKernel.h b/src/cpu/kernels/CpuDirectConv3dKernel.h index 688f368b9f..8e6f564679 100644 --- a/src/cpu/kernels/CpuDirectConv3dKernel.h +++ b/src/cpu/kernels/CpuDirectConv3dKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_DIRECT_CONV3D_KERNEL_H #include "arm_compute/runtime/FunctionDescriptors.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" @@ -39,7 +40,8 @@ class CpuDirectConv3dKernel : public ICpuKernel<CpuDirectConv3dKernel> { private: /* Template function for convolution 3d NDHWC */ - using DirectConv3dKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, const ITensor *, ITensor *, const Conv3dInfo &, const Window &)>::type; + using DirectConv3dKernelPtr = std::add_pointer<void( + const ITensor *, const ITensor *, const ITensor *, ITensor *, const Conv3dInfo &, const Window &)>::type; public: CpuDirectConv3dKernel() = default; @@ -63,17 +65,25 @@ public: * @param[in] conv_info Contains padding, stride, acitvation information. * */ - void configure(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo &conv_info); + void configure(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + ITensorInfo *dst, + const Conv3dInfo &conv_info); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuDirectConv3dKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo &conv_info); + static Status validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const Conv3dInfo &conv_info); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; struct DirectConv3dKernel @@ -87,7 +97,7 @@ public: private: Conv3dInfo _conv_info{}; - DirectConv3dKernelPtr _run_method{ nullptr }; + DirectConv3dKernelPtr _run_method{nullptr}; std::string _name{}; }; diff --git a/src/cpu/kernels/CpuElementwiseKernel.cpp b/src/cpu/kernels/CpuElementwiseKernel.cpp index a045855b1a..57a3f39822 100644 --- a/src/cpu/kernels/CpuElementwiseKernel.cpp +++ b/src/cpu/kernels/CpuElementwiseKernel.cpp @@ -24,8 +24,9 @@ #include "src/cpu/kernels/CpuElementwiseKernel.h" #include "arm_compute/core/Helpers.h" -#include "src/core/CPP/Validate.h" + #include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "src/cpu/kernels/elementwise_binary/list.h" @@ -35,11 +36,11 @@ #if defined(ENABLE_FP32_KERNELS) namespace { - static constexpr size_t default_min_max_mws_N1_fp32_neon = 25308; - static constexpr size_t default_min_max_mws_V1_fp32_neon = 34772; - static constexpr size_t default_div_mws_N1_fp32_neon = 19043; - static constexpr size_t default_div_mws_V1_fp32_neon = 25511; -} +static constexpr size_t default_min_max_mws_N1_fp32_neon = 25308; +static constexpr size_t default_min_max_mws_V1_fp32_neon = 34772; +static constexpr size_t default_div_mws_N1_fp32_neon = 19043; +static constexpr size_t default_div_mws_V1_fp32_neon = 25511; +} // namespace #endif /* ENABLE_FP32_KERNELS */ namespace arm_compute 
@@ -50,255 +51,178 @@ namespace kernels { namespace { -template <ArithmeticOperation op> -const std::vector<CpuElementwiseKernel<CpuArithmeticKernel>::ElementwiseKernel> available_kernels_arithmetic = -{ - { - "sve2_qu8_arithmetic", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::QASYMM8 && data.isa.sve2 && static_cast<ArithmeticOperation>(data.op) == op; - }, - REGISTER_QASYMM8_SVE2(sve2_qasymm8_elementwise_binary<op>) - }, - { - "sve2_qs8_arithmetic", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::QASYMM8_SIGNED && data.isa.sve2 && static_cast<ArithmeticOperation>(data.op) == op; - }, - REGISTER_QASYMM8_SIGNED_SVE2(sve2_qasymm8_signed_elementwise_binary<op>) - }, - { - "sve_fp32_arithmetic", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::F32 && data.isa.sve && static_cast<ArithmeticOperation>(data.op) == op; - }, - REGISTER_FP32_SVE(sve_fp32_elementwise_binary<op>) - }, - { - "sve_s32_arithmetic", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::S32 && data.isa.sve && static_cast<ArithmeticOperation>(data.op) == op; - }, - REGISTER_INTEGER_SVE(sve_s32_elementwise_binary<op>) - }, - { - "sve_s16_arithmetic", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::S16 && data.isa.sve && static_cast<ArithmeticOperation>(data.op) == op; - }, - REGISTER_INTEGER_SVE(sve_s16_elementwise_binary<op>) - }, - { - "sve_fp16_arithmetic", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::F16 && data.isa.sve && data.isa.fp16 && static_cast<ArithmeticOperation>(data.op) == op; - }, - REGISTER_FP16_SVE(sve_fp16_elementwise_binary<op>) - }, - { - "neon_fp32_arithmetic", - - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::F32 && static_cast<ArithmeticOperation>(data.op) == op; - }, - REGISTER_FP32_NEON(neon_fp32_elementwise_binary<op>) - }, - { - "neon_s32_arithmetic", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::S32 && static_cast<ArithmeticOperation>(data.op) == op; - }, - REGISTER_INTEGER_NEON(neon_s32_elementwise_binary<op>) - }, - { - "neon_fp16_arithmetic", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::F16 && data.isa.fp16 && static_cast<ArithmeticOperation>(data.op) == op; - }, - REGISTER_FP16_NEON(neon_fp16_elementwise_binary<op>) - }, - { - "neon_s16_arithmetic", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::S16 && static_cast<ArithmeticOperation>(data.op) == op; - }, - REGISTER_INTEGER_NEON(neon_s16_elementwise_binary<op>) - }, - { - "neon_qu8_arithmetic", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::QASYMM8 && static_cast<ArithmeticOperation>(data.op) == op; - }, - REGISTER_QASYMM8_NEON(neon_qasymm8_elementwise_binary<op>) - }, - { - "neon_qs8_arithmetic", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::QASYMM8_SIGNED && static_cast<ArithmeticOperation>(data.op) == op; - }, - REGISTER_QASYMM8_SIGNED_NEON(neon_qasymm8_signed_elementwise_binary<op>) - }, +template <ArithmeticOperation op> +const std::vector<CpuElementwiseKernel<CpuArithmeticKernel>::ElementwiseKernel> available_kernels_arithmetic = { + {"sve2_qu8_arithmetic", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::QASYMM8 && 
data.isa.sve2 && static_cast<ArithmeticOperation>(data.op) == op; }, + REGISTER_QASYMM8_SVE2(sve2_qasymm8_elementwise_binary<op>)}, + {"sve2_qs8_arithmetic", + [](const ElementwiseDataTypeISASelectorData &data) { + return data.dt == DataType::QASYMM8_SIGNED && data.isa.sve2 && static_cast<ArithmeticOperation>(data.op) == op; + }, + REGISTER_QASYMM8_SIGNED_SVE2(sve2_qasymm8_signed_elementwise_binary<op>)}, + {"sve_fp32_arithmetic", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::F32 && data.isa.sve && static_cast<ArithmeticOperation>(data.op) == op; }, + REGISTER_FP32_SVE(sve_fp32_elementwise_binary<op>)}, + {"sve_s32_arithmetic", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::S32 && data.isa.sve && static_cast<ArithmeticOperation>(data.op) == op; }, + REGISTER_INTEGER_SVE(sve_s32_elementwise_binary<op>)}, + {"sve_s16_arithmetic", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::S16 && data.isa.sve && static_cast<ArithmeticOperation>(data.op) == op; }, + REGISTER_INTEGER_SVE(sve_s16_elementwise_binary<op>)}, + {"sve_fp16_arithmetic", + [](const ElementwiseDataTypeISASelectorData &data) + { + return data.dt == DataType::F16 && data.isa.sve && data.isa.fp16 && + static_cast<ArithmeticOperation>(data.op) == op; + }, + REGISTER_FP16_SVE(sve_fp16_elementwise_binary<op>)}, + {"neon_fp32_arithmetic", + + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::F32 && static_cast<ArithmeticOperation>(data.op) == op; }, + REGISTER_FP32_NEON(neon_fp32_elementwise_binary<op>)}, + {"neon_s32_arithmetic", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::S32 && static_cast<ArithmeticOperation>(data.op) == op; }, + REGISTER_INTEGER_NEON(neon_s32_elementwise_binary<op>)}, + {"neon_fp16_arithmetic", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::F16 && data.isa.fp16 && static_cast<ArithmeticOperation>(data.op) == op; }, + REGISTER_FP16_NEON(neon_fp16_elementwise_binary<op>)}, + {"neon_s16_arithmetic", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::S16 && static_cast<ArithmeticOperation>(data.op) == op; }, + REGISTER_INTEGER_NEON(neon_s16_elementwise_binary<op>)}, + {"neon_qu8_arithmetic", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::QASYMM8 && static_cast<ArithmeticOperation>(data.op) == op; }, + REGISTER_QASYMM8_NEON(neon_qasymm8_elementwise_binary<op>)}, + {"neon_qs8_arithmetic", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::QASYMM8_SIGNED && static_cast<ArithmeticOperation>(data.op) == op; }, + REGISTER_QASYMM8_SIGNED_NEON(neon_qasymm8_signed_elementwise_binary<op>)}, }; -template <ComparisonOperation op> -const std::vector<CpuElementwiseKernel<CpuComparisonKernel>::ElementwiseKernel> available_kernels_comperison = -{ - { - "sve2_qu8_comparison", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::QASYMM8 && data.isa.sve2 && static_cast<ComparisonOperation>(data.op) == op; - }, - REGISTER_QASYMM8_SVE2(sve2_qasymm8_comparison_elementwise_binary<op>) - }, - { - "sve2_qs8_comparison", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::QASYMM8_SIGNED && data.isa.sve2 && static_cast<ComparisonOperation>(data.op) == op; - }, - REGISTER_QASYMM8_SIGNED_SVE2(sve2_qasymm8_signed_comparison_elementwise_binary<op>) - }, - 
{ - "sve_u8_comparison", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::U8 && data.isa.sve && static_cast<ComparisonOperation>(data.op) == op; - }, - REGISTER_INTEGER_SVE(sve_u8_comparison_elementwise_binary<op>) - }, - { - "sve_fp32_comparison", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::F32 && data.isa.sve && static_cast<ComparisonOperation>(data.op) == op; - }, - REGISTER_FP32_SVE(sve_fp32_comparison_elementwise_binary<op>) - }, - { - "sve_s16_comparison", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::S16 && data.isa.sve && static_cast<ComparisonOperation>(data.op) == op; - }, - REGISTER_INTEGER_SVE(sve_s16_comparison_elementwise_binary<op>) - }, - { - "sve_s32_comparison", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::S32 && data.isa.sve && static_cast<ComparisonOperation>(data.op) == op; - }, - REGISTER_INTEGER_SVE(sve_s32_comparison_elementwise_binary<op>) - }, - { - "sve_fp16_comparison", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::F16 && data.isa.sve && data.isa.fp16 && static_cast<ComparisonOperation>(data.op) == op; - }, - REGISTER_FP16_SVE(sve_fp16_comparison_elementwise_binary<op>) - }, - { - "neon_u8_comparison", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::U8 && static_cast<ComparisonOperation>(data.op) == op; - }, - REGISTER_INTEGER_NEON(neon_u8_comparison_elementwise_binary<op>) - }, - { - "neon_fp32_comparison", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::F32 && static_cast<ComparisonOperation>(data.op) == op; - }, - REGISTER_FP32_NEON(neon_fp32_comparison_elementwise_binary<op>) - }, - { - "neon_s16_comparison", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::S16 && static_cast<ComparisonOperation>(data.op) == op; - }, - REGISTER_INTEGER_NEON(neon_s16_comparison_elementwise_binary<op>) - }, - { - "neon_s32_comparison", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::S32 && static_cast<ComparisonOperation>(data.op) == op; - }, - REGISTER_INTEGER_NEON(neon_s32_comparison_elementwise_binary<op>) - }, - { - "neon_qu8_comparison", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::QASYMM8 && static_cast<ComparisonOperation>(data.op) == op; - }, - REGISTER_QASYMM8_NEON(neon_qasymm8_comparison_elementwise_binary<op>) - }, - { - "neon_qs8_comparison", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::QASYMM8_SIGNED && static_cast<ComparisonOperation>(data.op) == op; - }, - REGISTER_QASYMM8_SIGNED_NEON(neon_qasymm8_signed_comparison_elementwise_binary<op>) - }, - { - "neon_fp16_comparison", - [](const ElementwiseDataTypeISASelectorData & data) - { - return data.dt == DataType::F16 && data.isa.fp16 && static_cast<ComparisonOperation>(data.op) == op; - }, - REGISTER_FP16_NEON(neon_fp16_comparison_elementwise_binary<op>) - }, +template <ComparisonOperation op> +const std::vector<CpuElementwiseKernel<CpuComparisonKernel>::ElementwiseKernel> available_kernels_comperison = { + {"sve2_qu8_comparison", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::QASYMM8 && data.isa.sve2 && static_cast<ComparisonOperation>(data.op) == op; }, + REGISTER_QASYMM8_SVE2(sve2_qasymm8_comparison_elementwise_binary<op>)}, + 
{"sve2_qs8_comparison", + [](const ElementwiseDataTypeISASelectorData &data) { + return data.dt == DataType::QASYMM8_SIGNED && data.isa.sve2 && static_cast<ComparisonOperation>(data.op) == op; + }, + REGISTER_QASYMM8_SIGNED_SVE2(sve2_qasymm8_signed_comparison_elementwise_binary<op>)}, + {"sve_u8_comparison", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::U8 && data.isa.sve && static_cast<ComparisonOperation>(data.op) == op; }, + REGISTER_INTEGER_SVE(sve_u8_comparison_elementwise_binary<op>)}, + {"sve_fp32_comparison", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::F32 && data.isa.sve && static_cast<ComparisonOperation>(data.op) == op; }, + REGISTER_FP32_SVE(sve_fp32_comparison_elementwise_binary<op>)}, + {"sve_s16_comparison", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::S16 && data.isa.sve && static_cast<ComparisonOperation>(data.op) == op; }, + REGISTER_INTEGER_SVE(sve_s16_comparison_elementwise_binary<op>)}, + {"sve_s32_comparison", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::S32 && data.isa.sve && static_cast<ComparisonOperation>(data.op) == op; }, + REGISTER_INTEGER_SVE(sve_s32_comparison_elementwise_binary<op>)}, + {"sve_fp16_comparison", + [](const ElementwiseDataTypeISASelectorData &data) + { + return data.dt == DataType::F16 && data.isa.sve && data.isa.fp16 && + static_cast<ComparisonOperation>(data.op) == op; + }, + REGISTER_FP16_SVE(sve_fp16_comparison_elementwise_binary<op>)}, + {"neon_u8_comparison", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::U8 && static_cast<ComparisonOperation>(data.op) == op; }, + REGISTER_INTEGER_NEON(neon_u8_comparison_elementwise_binary<op>)}, + {"neon_fp32_comparison", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::F32 && static_cast<ComparisonOperation>(data.op) == op; }, + REGISTER_FP32_NEON(neon_fp32_comparison_elementwise_binary<op>)}, + {"neon_s16_comparison", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::S16 && static_cast<ComparisonOperation>(data.op) == op; }, + REGISTER_INTEGER_NEON(neon_s16_comparison_elementwise_binary<op>)}, + {"neon_s32_comparison", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::S32 && static_cast<ComparisonOperation>(data.op) == op; }, + REGISTER_INTEGER_NEON(neon_s32_comparison_elementwise_binary<op>)}, + {"neon_qu8_comparison", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::QASYMM8 && static_cast<ComparisonOperation>(data.op) == op; }, + REGISTER_QASYMM8_NEON(neon_qasymm8_comparison_elementwise_binary<op>)}, + {"neon_qs8_comparison", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::QASYMM8_SIGNED && static_cast<ComparisonOperation>(data.op) == op; }, + REGISTER_QASYMM8_SIGNED_NEON(neon_qasymm8_signed_comparison_elementwise_binary<op>)}, + {"neon_fp16_comparison", + [](const ElementwiseDataTypeISASelectorData &data) + { return data.dt == DataType::F16 && data.isa.fp16 && static_cast<ComparisonOperation>(data.op) == op; }, + REGISTER_FP16_NEON(neon_fp16_comparison_elementwise_binary<op>)}, }; } // namespace -const std::vector<CpuElementwiseKernel<CpuArithmeticKernel>::ElementwiseKernel> &CpuArithmeticKernel::get_available_kernels() +const std::vector<CpuElementwiseKernel<CpuArithmeticKernel>::ElementwiseKernel> & 
+CpuArithmeticKernel::get_available_kernels() { static std::vector<CpuElementwiseKernel<CpuArithmeticKernel>::ElementwiseKernel> available_kernels; - std::move(available_kernels_arithmetic<ArithmeticOperation::ADD>.begin(), available_kernels_arithmetic<ArithmeticOperation::ADD>.end(), std::back_inserter(available_kernels)); - std::move(available_kernels_arithmetic<ArithmeticOperation::SUB>.begin(), available_kernels_arithmetic<ArithmeticOperation::SUB>.end(), std::back_inserter(available_kernels)); - std::move(available_kernels_arithmetic<ArithmeticOperation::DIV>.begin(), available_kernels_arithmetic<ArithmeticOperation::DIV>.end(), std::back_inserter(available_kernels)); - std::move(available_kernels_arithmetic<ArithmeticOperation::MIN>.begin(), available_kernels_arithmetic<ArithmeticOperation::MIN>.end(), std::back_inserter(available_kernels)); - std::move(available_kernels_arithmetic<ArithmeticOperation::MAX>.begin(), available_kernels_arithmetic<ArithmeticOperation::MAX>.end(), std::back_inserter(available_kernels)); - std::move(available_kernels_arithmetic<ArithmeticOperation::SQUARED_DIFF>.begin(), available_kernels_arithmetic<ArithmeticOperation::SQUARED_DIFF>.end(), std::back_inserter(available_kernels)); - std::move(available_kernels_arithmetic<ArithmeticOperation::POWER>.begin(), available_kernels_arithmetic<ArithmeticOperation::POWER>.end(), std::back_inserter(available_kernels)); - std::move(available_kernels_arithmetic<ArithmeticOperation::PRELU>.begin(), available_kernels_arithmetic<ArithmeticOperation::PRELU>.end(), std::back_inserter(available_kernels)); + std::move(available_kernels_arithmetic<ArithmeticOperation::ADD>.begin(), + available_kernels_arithmetic<ArithmeticOperation::ADD>.end(), std::back_inserter(available_kernels)); + std::move(available_kernels_arithmetic<ArithmeticOperation::SUB>.begin(), + available_kernels_arithmetic<ArithmeticOperation::SUB>.end(), std::back_inserter(available_kernels)); + std::move(available_kernels_arithmetic<ArithmeticOperation::DIV>.begin(), + available_kernels_arithmetic<ArithmeticOperation::DIV>.end(), std::back_inserter(available_kernels)); + std::move(available_kernels_arithmetic<ArithmeticOperation::MIN>.begin(), + available_kernels_arithmetic<ArithmeticOperation::MIN>.end(), std::back_inserter(available_kernels)); + std::move(available_kernels_arithmetic<ArithmeticOperation::MAX>.begin(), + available_kernels_arithmetic<ArithmeticOperation::MAX>.end(), std::back_inserter(available_kernels)); + std::move(available_kernels_arithmetic<ArithmeticOperation::SQUARED_DIFF>.begin(), + available_kernels_arithmetic<ArithmeticOperation::SQUARED_DIFF>.end(), + std::back_inserter(available_kernels)); + std::move(available_kernels_arithmetic<ArithmeticOperation::POWER>.begin(), + available_kernels_arithmetic<ArithmeticOperation::POWER>.end(), std::back_inserter(available_kernels)); + std::move(available_kernels_arithmetic<ArithmeticOperation::PRELU>.begin(), + available_kernels_arithmetic<ArithmeticOperation::PRELU>.end(), std::back_inserter(available_kernels)); return available_kernels; } -const std::vector<CpuElementwiseKernel<CpuComparisonKernel>::ElementwiseKernel> &CpuComparisonKernel::get_available_kernels() +const std::vector<CpuElementwiseKernel<CpuComparisonKernel>::ElementwiseKernel> & +CpuComparisonKernel::get_available_kernels() { static std::vector<CpuElementwiseKernel<CpuComparisonKernel>::ElementwiseKernel> available_kernels; - std::move(available_kernels_comperison<ComparisonOperation::Equal>.begin(), 
available_kernels_comperison<ComparisonOperation::Equal>.end(), std::back_inserter(available_kernels)); - std::move(available_kernels_comperison<ComparisonOperation::NotEqual>.begin(), available_kernels_comperison<ComparisonOperation::NotEqual>.end(), std::back_inserter(available_kernels)); - std::move(available_kernels_comperison<ComparisonOperation::Greater>.begin(), available_kernels_comperison<ComparisonOperation::Greater>.end(), std::back_inserter(available_kernels)); - std::move(available_kernels_comperison<ComparisonOperation::GreaterEqual>.begin(), available_kernels_comperison<ComparisonOperation::GreaterEqual>.end(), std::back_inserter(available_kernels)); - std::move(available_kernels_comperison<ComparisonOperation::Less>.begin(), available_kernels_comperison<ComparisonOperation::Less>.end(), std::back_inserter(available_kernels)); - std::move(available_kernels_comperison<ComparisonOperation::LessEqual>.begin(), available_kernels_comperison<ComparisonOperation::LessEqual>.end(), std::back_inserter(available_kernels)); + std::move(available_kernels_comperison<ComparisonOperation::Equal>.begin(), + available_kernels_comperison<ComparisonOperation::Equal>.end(), std::back_inserter(available_kernels)); + std::move(available_kernels_comperison<ComparisonOperation::NotEqual>.begin(), + available_kernels_comperison<ComparisonOperation::NotEqual>.end(), std::back_inserter(available_kernels)); + std::move(available_kernels_comperison<ComparisonOperation::Greater>.begin(), + available_kernels_comperison<ComparisonOperation::Greater>.end(), std::back_inserter(available_kernels)); + std::move(available_kernels_comperison<ComparisonOperation::GreaterEqual>.begin(), + available_kernels_comperison<ComparisonOperation::GreaterEqual>.end(), + std::back_inserter(available_kernels)); + std::move(available_kernels_comperison<ComparisonOperation::Less>.begin(), + available_kernels_comperison<ComparisonOperation::Less>.end(), std::back_inserter(available_kernels)); + std::move(available_kernels_comperison<ComparisonOperation::LessEqual>.begin(), + available_kernels_comperison<ComparisonOperation::LessEqual>.end(), + std::back_inserter(available_kernels)); return available_kernels; } template <class Derived> -Status CpuElementwiseKernel<Derived>::validate_arguments_common(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst) +Status CpuElementwiseKernel<Derived>::validate_arguments_common(const ITensorInfo &src0, + const ITensorInfo &src1, + const ITensorInfo &dst) { ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src0); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &src1); @@ -308,7 +232,7 @@ Status CpuElementwiseKernel<Derived>::validate_arguments_common(const ITensorInf ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); // Validate in case of configured dst - if(dst.total_size() > 0) + if (dst.total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0), "Wrong shape for output"); @@ -321,7 +245,8 @@ void CpuArithmeticKernel::configure_common(const ITensorInfo *src0, const ITenso { ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); - const auto *uk = CpuArithmeticKernel::get_implementation(ElementwiseDataTypeISASelectorData{ src0->data_type(), CPUInfo::get().get_isa(), static_cast<int>(_op) }); + const auto *uk = CpuArithmeticKernel::get_implementation( + ElementwiseDataTypeISASelectorData{src0->data_type(), CPUInfo::get().get_isa(), static_cast<int>(_op)}); 
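
Reading note on the selection above: get_available_kernels() concatenates the per-operation instantiations of the kernel table, and get_implementation() then resolves the entry whose selector predicate accepts the tensor data type, the runtime ISA flags and the requested operation. A minimal sketch of that first-match lookup, assuming entries are ordered by preference (SVE2, then SVE, then Neon); KernelEntry, SelectorData and pick_kernel are illustrative names, not the library's API.

#include <vector>

// Illustrative only: first-match micro-kernel selection over a static registry.
struct SelectorData
{
    int  dt;   // data type tag
    bool sve;
    bool sve2;
    bool fp16;
    int  op;   // arithmetic/comparison operation tag
};

struct KernelEntry
{
    const char *name;
    bool (*is_selected)(const SelectorData &);
    void (*ukernel)();
};

const KernelEntry *pick_kernel(const std::vector<KernelEntry> &entries, const SelectorData &data)
{
    for (const auto &entry : entries)
    {
        if (entry.is_selected != nullptr && entry.is_selected(data) && entry.ukernel != nullptr)
        {
            return &entry; // first match wins; caller asserts on nullptr, mirroring ARM_COMPUTE_ERROR_ON(uk == nullptr)
        }
    }
    return nullptr;
}
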
ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); @@ -329,7 +254,7 @@ void CpuArithmeticKernel::configure_common(const ITensorInfo *src0, const ITenso _name = std::string("CpuArithmeticKernel").append("/").append(uk->name); // If any of shapes is dynamic, expect a configured window and dst at run-time. - if(src0->is_dynamic() || src1->is_dynamic()) + if (src0->is_dynamic() || src1->is_dynamic()) { return; } @@ -343,7 +268,8 @@ void CpuComparisonKernel::configure_common(const ITensorInfo *src0, const ITenso { ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); - const auto *uk = CpuComparisonKernel::get_implementation(ElementwiseDataTypeISASelectorData{ src0->data_type(), CPUInfo::get().get_isa(), static_cast<int>(_op) }); + const auto *uk = CpuComparisonKernel::get_implementation( + ElementwiseDataTypeISASelectorData{src0->data_type(), CPUInfo::get().get_isa(), static_cast<int>(_op)}); ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); @@ -351,7 +277,7 @@ void CpuComparisonKernel::configure_common(const ITensorInfo *src0, const ITenso _name = std::string("CpuComparisonKernel").append("/").append(uk->name); // If any of shapes is dynamic, expect a configured window and dst at run-time. - if(src0->is_dynamic() || src1->is_dynamic()) + if (src0->is_dynamic() || src1->is_dynamic()) { return; } @@ -373,8 +299,10 @@ void CpuElementwiseKernel<Derived>::run_op(ITensorPack &tensors, const Window &w _run_method(src0, src1, dst, window); } -template void CpuElementwiseKernel<CpuArithmeticKernel>::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info); -template void CpuElementwiseKernel<CpuComparisonKernel>::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info); +template void +CpuElementwiseKernel<CpuArithmeticKernel>::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info); +template void +CpuElementwiseKernel<CpuComparisonKernel>::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info); template <class Derived> const char *CpuElementwiseKernel<Derived>::name() const @@ -385,7 +313,10 @@ template const char *CpuElementwiseKernel<CpuArithmeticKernel>::name() const; template const char *CpuElementwiseKernel<CpuComparisonKernel>::name() const; /** Arithmetic operators (min, max, squared_diff) */ -void CpuArithmeticKernel::configure(ArithmeticOperation op, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) +void CpuArithmeticKernel::configure(ArithmeticOperation op, + const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst) { ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst)); _op = op; @@ -394,16 +325,20 @@ void CpuArithmeticKernel::configure(ArithmeticOperation op, const ITensorInfo *s Status CpuArithmeticKernel::validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::F16, DataType::S32, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::S16, DataType::F16, DataType::S32, DataType::F32); // Validate in case of configured dst - if(dst.total_size() > 0) + if (dst.total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &dst); } return validate_arguments_common(src0, src1, dst); } -Status CpuArithmeticKernel::validate(ArithmeticOperation op, const ITensorInfo *src0, const 
ITensorInfo *src1, const ITensorInfo *dst) +Status CpuArithmeticKernel::validate(ArithmeticOperation op, + const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst) { ARM_COMPUTE_UNUSED(op); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); @@ -416,15 +351,15 @@ size_t CpuArithmeticKernel::get_mws(const CPUInfo &platform, size_t thread_count ARM_COMPUTE_UNUSED(thread_count); #if defined(ENABLE_FP32_KERNELS) - if(this->_run_method == &neon_fp32_elementwise_binary<ArithmeticOperation::MIN> - || this->_run_method == &neon_fp32_elementwise_binary<ArithmeticOperation::MAX>) + if (this->_run_method == &neon_fp32_elementwise_binary<ArithmeticOperation::MIN> || + this->_run_method == &neon_fp32_elementwise_binary<ArithmeticOperation::MAX>) { size_t mws = ICPPKernel::default_mws; - if(platform.get_cpu_model() == CPUModel::N1) + if (platform.get_cpu_model() == CPUModel::N1) { mws = default_min_max_mws_N1_fp32_neon; } - else if(platform.get_cpu_model() == CPUModel::V1) + else if (platform.get_cpu_model() == CPUModel::V1) { mws = default_min_max_mws_V1_fp32_neon; } @@ -434,7 +369,7 @@ size_t CpuArithmeticKernel::get_mws(const CPUInfo &platform, size_t thread_count } // tensor is 1D or was re-interpreted as 1D - if(this->window().shape().num_dimensions() == 1) + if (this->window().shape().num_dimensions() == 1) { return mws; } @@ -447,7 +382,7 @@ size_t CpuArithmeticKernel::get_mws(const CPUInfo &platform, size_t thread_count return std::max(static_cast<size_t>(1), mws); } } -#else /* ENABLE_FP32_KERNELS */ +#else /* ENABLE_FP32_KERNELS */ ARM_COMPUTE_UNUSED(platform); #endif /* ENABLE_FP32_KERNELS */ return ICPPKernel::default_mws; @@ -467,14 +402,14 @@ size_t CpuDivisionKernel::get_mws(const CPUInfo &platform, size_t thread_count) ARM_COMPUTE_UNUSED(thread_count); #if defined(ENABLE_FP32_KERNELS) - if(this->_run_method == &neon_fp32_elementwise_binary<ArithmeticOperation::DIV>) + if (this->_run_method == &neon_fp32_elementwise_binary<ArithmeticOperation::DIV>) { size_t mws = ICPPKernel::default_mws; - if(platform.get_cpu_model() == CPUModel::N1) + if (platform.get_cpu_model() == CPUModel::N1) { mws = default_div_mws_N1_fp32_neon; } - else if(platform.get_cpu_model() == CPUModel::V1) + else if (platform.get_cpu_model() == CPUModel::V1) { mws = default_div_mws_V1_fp32_neon; } @@ -484,7 +419,7 @@ size_t CpuDivisionKernel::get_mws(const CPUInfo &platform, size_t thread_count) } // tensor is 1D or was re-interpreted as 1D - if(this->window().shape().num_dimensions() == 1) + if (this->window().shape().num_dimensions() == 1) { return mws; } @@ -497,7 +432,7 @@ size_t CpuDivisionKernel::get_mws(const CPUInfo &platform, size_t thread_count) return std::max(static_cast<size_t>(1), mws); } } -#else /* ENABLE_FP32_KERNELS */ +#else /* ENABLE_FP32_KERNELS */ ARM_COMPUTE_UNUSED(platform); #endif /* ENABLE_FP32_KERNELS */ return ICPPKernel::default_mws; @@ -538,7 +473,10 @@ Status CpuPowerKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1 } /** Comparison operators (equal, not equal, less than, greater than, less than or equal, greater than or equal) */ -void CpuComparisonKernel::configure(ComparisonOperation op, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) +void CpuComparisonKernel::configure(ComparisonOperation op, + const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst) { ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst)); _op = op; @@ -547,16 +485,21 @@ void CpuComparisonKernel::configure(ComparisonOperation op, const 
ITensorInfo *s Status CpuComparisonKernel::validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::F16, DataType::S32, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::U8, DataType::QASYMM8, + DataType::QASYMM8_SIGNED, DataType::S16, DataType::F16, + DataType::S32, DataType::F32); // Validate in case of configured dst - if(dst.total_size() > 0) + if (dst.total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&dst, 1, DataType::U8); } return validate_arguments_common(src0, src1, dst); } -Status CpuComparisonKernel::validate(ComparisonOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst) +Status CpuComparisonKernel::validate(ComparisonOperation op, + const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst) { ARM_COMPUTE_UNUSED(op); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); diff --git a/src/cpu/kernels/CpuElementwiseKernel.h b/src/cpu/kernels/CpuElementwiseKernel.h index 634e38bf9f..1f3e613b80 100644 --- a/src/cpu/kernels/CpuElementwiseKernel.h +++ b/src/cpu/kernels/CpuElementwiseKernel.h @@ -43,7 +43,8 @@ template <class Derived> class CpuElementwiseKernel : public ICpuKernel<Derived> { private: - using ElementwiseKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, ITensor *, const Window &)>::type; + using ElementwiseKernelPtr = + std::add_pointer<void(const ITensor *, const ITensor *, ITensor *, const Window &)>::type; public: CpuElementwiseKernel() = default; @@ -72,7 +73,7 @@ protected: static Status validate_arguments_common(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst); protected: - ElementwiseKernelPtr _run_method{ nullptr }; + ElementwiseKernelPtr _run_method{nullptr}; std::string _name{}; }; @@ -96,7 +97,8 @@ public: * * @return a status */ - static Status validate(ArithmeticOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst); + static Status + validate(ArithmeticOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst); static const std::vector<CpuElementwiseKernel<CpuArithmeticKernel>::ElementwiseKernel> &get_available_kernels(); @@ -200,7 +202,8 @@ public: * * @return a status */ - static Status validate(ComparisonOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst); + static Status + validate(ComparisonOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst); static const std::vector<CpuElementwiseKernel<CpuComparisonKernel>::ElementwiseKernel> &get_available_kernels(); @@ -226,4 +229,4 @@ private: } // namespace kernels } // namespace cpu } // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_KERNEL_H */
\ No newline at end of file +#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_KERNEL_H */ diff --git a/src/cpu/kernels/CpuElementwiseUnaryKernel.cpp b/src/cpu/kernels/CpuElementwiseUnaryKernel.cpp index 04a7f15715..88545ee756 100644 --- a/src/cpu/kernels/CpuElementwiseUnaryKernel.cpp +++ b/src/cpu/kernels/CpuElementwiseUnaryKernel.cpp @@ -28,8 +28,9 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" -#include "src/core/CPP/Validate.h" + #include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "src/cpu/kernels/elementwise_unary/list.h" @@ -59,12 +60,13 @@ std::unique_ptr<uint8_t[]> q8_prepare_lut(ElementWiseUnary op, const ITensorInfo const auto dst_min_fp = (((is_signed) ? -128 : 0) - dst_qi.offset) * dst_qi.scale; const auto dst_max_fp = (((is_signed) ? 127 : 255) - dst_qi.offset) * dst_qi.scale; - for(int i = 0; i < 256; ++i) + for (int i = 0; i < 256; ++i) { - const auto in = (is_signed) ? dequantize_qasymm8_signed(static_cast<int8_t>(i), src_qi) : dequantize_qasymm8(i, src_qi); - float result = 0; + const auto in = + (is_signed) ? dequantize_qasymm8_signed(static_cast<int8_t>(i), src_qi) : dequantize_qasymm8(i, src_qi); + float result = 0; - switch(op) + switch (op) { case ElementWiseUnary::RSQRT: result = 1 / sqrt(in); @@ -100,7 +102,8 @@ std::unique_ptr<uint8_t[]> q8_prepare_lut(ElementWiseUnary op, const ITensorInfo result = utility::clamp(result, dst_min_fp, dst_max_fp); - const auto out = (is_signed) ? static_cast<uint8_t>(quantize_qasymm8_signed(result, dst_qi)) : quantize_qasymm8(result, dst_qi); + const auto out = (is_signed) ? static_cast<uint8_t>(quantize_qasymm8_signed(result, dst_qi)) + : quantize_qasymm8(result, dst_qi); lut[i] = out; } @@ -109,97 +112,68 @@ std::unique_ptr<uint8_t[]> q8_prepare_lut(ElementWiseUnary op, const ITensorInfo #endif // __aarch64__ -static const std::vector<CpuElementwiseUnaryKernel::ElementwiseUnaryKernel> available_kernels = -{ +static const std::vector<CpuElementwiseUnaryKernel::ElementwiseUnaryKernel> available_kernels = { { "sve_fp32_elementwise_unary", - [](const DataTypeISASelectorData & data) - { - return (data.dt == DataType::F32 && data.isa.sve); - }, + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32 && data.isa.sve); }, REGISTER_FP32_SVE(sve_fp32_elementwise_unary), nullptr, }, { "sve_fp16_elementwise_unary", - [](const DataTypeISASelectorData & data) - { - return (data.dt == DataType::F16 && data.isa.sve && data.isa.fp16); - }, + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16 && data.isa.sve && data.isa.fp16); }, REGISTER_FP16_SVE(sve_fp16_elementwise_unary), nullptr, }, { "sve_s32_elementwise_unary", - [](const DataTypeISASelectorData & data) - { - return (data.dt == DataType::S32 && data.isa.sve); - }, + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::S32 && data.isa.sve); }, REGISTER_INTEGER_SVE(sve_s32_elementwise_unary), nullptr, }, { "neon_fp32_elementwise_unary", - [](const DataTypeISASelectorData & data) - { - return data.dt == DataType::F32; - }, + [](const DataTypeISASelectorData &data) { return data.dt == DataType::F32; }, REGISTER_FP32_NEON(neon_fp32_elementwise_unary), nullptr, }, { "neon_fp16_elementwise_unary", - [](const DataTypeISASelectorData & data) - { - return data.dt == DataType::F16 && data.isa.fp16; - }, + [](const DataTypeISASelectorData &data) { return data.dt 
== DataType::F16 && data.isa.fp16; }, REGISTER_FP16_NEON(neon_fp16_elementwise_unary), nullptr, }, { "neon_s32_elementwise_unary", - [](const DataTypeISASelectorData & data) - { - return data.dt == DataType::S32; - }, + [](const DataTypeISASelectorData &data) { return data.dt == DataType::S32; }, REGISTER_INTEGER_NEON(neon_s32_elementwise_unary), nullptr, }, #ifdef __aarch64__ { "sve2_q8_elementwise_unary", - [](const DataTypeISASelectorData & data) - { - return (data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve2; - }, + [](const DataTypeISASelectorData &data) + { return (data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve2; }, REGISTER_QASYMM8_SVE2(sve2_q8_elementwise_unary), &q8_prepare_lut, }, { "neon_q8_elementwise_unary", - [](const DataTypeISASelectorData & data) - { - return data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED; - }, + [](const DataTypeISASelectorData &data) + { return data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED; }, REGISTER_QASYMM8_NEON(neon_q8_elementwise_unary), &q8_prepare_lut, }, #else // __aarch64__ { "neon_qasymm8_signed_elementwise_unary", - [](const DataTypeISASelectorData & data) - { - return data.dt == DataType::QASYMM8_SIGNED; - }, + [](const DataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8_SIGNED; }, REGISTER_QASYMM8_SIGNED_NEON(neon_qasymm8_signed_elementwise_unary), nullptr, }, { "neon_qasymm8_elementwise_unary", - [](const DataTypeISASelectorData & data) - { - return data.dt == DataType::QASYMM8; - }, + [](const DataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8; }, REGISTER_QASYMM8_NEON(neon_qasymm8_elementwise_unary), nullptr, }, @@ -211,7 +185,8 @@ static const std::vector<CpuElementwiseUnaryKernel::ElementwiseUnaryKernel> avai void CpuElementwiseUnaryKernel::configure(ElementWiseUnary op, const ITensorInfo &src, ITensorInfo &dst) { ARM_COMPUTE_ERROR_THROW_ON(validate(op, src, dst)); - const auto uk = CpuElementwiseUnaryKernel::get_implementation(DataTypeISASelectorData{ src.data_type(), CPUInfo::get().get_isa() }); + const auto uk = CpuElementwiseUnaryKernel::get_implementation( + DataTypeISASelectorData{src.data_type(), CPUInfo::get().get_isa()}); ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); _op = op; @@ -219,12 +194,12 @@ void CpuElementwiseUnaryKernel::configure(ElementWiseUnary op, const ITensorInfo _name = std::string("CpuElementwiseUnaryKernel").append("/").append(uk->name); // If input shape is dynamic, expect a configured window and dst at run-time. 
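
Reading note on the q8 path above: for 8-bit quantized types, q8_prepare_lut precomputes the whole unary operation, dequantizing each of the 256 input codes, applying the op in float, clamping and requantizing, so the run-time micro-kernel reduces to a table lookup. A minimal sketch of that idea for the RSQRT case; the scale/offset parameters and build_q8_rsqrt_lut are illustrative, not the library's QuantizationInfo API.

#include <algorithm>
#include <array>
#include <cmath>
#include <cstdint>

// Illustrative 256-entry LUT construction mirroring dequantize -> op -> clamp -> requantize.
std::array<uint8_t, 256> build_q8_rsqrt_lut(float src_scale, int src_offset, float dst_scale, int dst_offset)
{
    std::array<uint8_t, 256> lut{};
    for (int i = 0; i < 256; ++i)
    {
        const float in     = (i - src_offset) * src_scale;           // dequantize the input code
        const float result = 1.0f / std::sqrt(std::max(in, 1e-12f)); // apply the unary op (RSQRT)
        const int   q      = dst_offset + static_cast<int>(std::lround(result / dst_scale));
        lut[i]             = static_cast<uint8_t>(std::clamp(q, 0, 255)); // requantize and clamp to the U8 range
    }
    return lut;
}
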
- if(src.is_dynamic()) + if (src.is_dynamic()) { return; } - if(uk->prepare_func != nullptr) + if (uk->prepare_func != nullptr) { _lut = uk->prepare_func(op, &src, &dst); } @@ -238,28 +213,31 @@ Status CpuElementwiseUnaryKernel::validate(ElementWiseUnary op, const ITensorInf { ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src); - const auto *uk = CpuElementwiseUnaryKernel::get_implementation(DataTypeISASelectorData{ src.data_type(), CPUInfo::get().get_isa() }); + const auto *uk = CpuElementwiseUnaryKernel::get_implementation( + DataTypeISASelectorData{src.data_type(), CPUInfo::get().get_isa()}); ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); - switch(op) + switch (op) { case ElementWiseUnary::EXP: case ElementWiseUnary::RSQRT: case ElementWiseUnary::LOG: case ElementWiseUnary::ROUND: case ElementWiseUnary::SIN: - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32, DataType::QASYMM8, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32, + DataType::QASYMM8, DataType::QASYMM8_SIGNED); break; case ElementWiseUnary::NEG: case ElementWiseUnary::ABS: - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32, DataType::S32, DataType::QASYMM8, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32, DataType::S32, + DataType::QASYMM8, DataType::QASYMM8_SIGNED); break; default: ARM_COMPUTE_ERROR("ElementWiseUnary operation not supported"); } // Validate in case of configured dst - if(dst.total_size() > 0) + if (dst.total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst); } diff --git a/src/cpu/kernels/CpuElementwiseUnaryKernel.h b/src/cpu/kernels/CpuElementwiseUnaryKernel.h index 00188f0d49..249909854e 100644 --- a/src/cpu/kernels/CpuElementwiseUnaryKernel.h +++ b/src/cpu/kernels/CpuElementwiseUnaryKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_KERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" @@ -42,8 +43,10 @@ namespace kernels class CpuElementwiseUnaryKernel : public ICpuKernel<CpuElementwiseUnaryKernel> { private: - using ElementwiseUnaryUkernelPtr = std::add_pointer<void(const ITensor *, ITensor *, const Window &, ElementWiseUnary, const uint8_t *)>::type; - using ElementwiseUnaryPreparePtr = std::add_pointer<std::unique_ptr<uint8_t[]>(ElementWiseUnary op, const ITensorInfo *, const ITensorInfo *)>::type; + using ElementwiseUnaryUkernelPtr = + std::add_pointer<void(const ITensor *, ITensor *, const Window &, ElementWiseUnary, const uint8_t *)>::type; + using ElementwiseUnaryPreparePtr = std::add_pointer<std::unique_ptr<uint8_t[]>( + ElementWiseUnary op, const ITensorInfo *, const ITensorInfo *)>::type; public: CpuElementwiseUnaryKernel() = default; @@ -65,7 +68,7 @@ public: static Status validate(ElementWiseUnary op, const ITensorInfo &src, const ITensorInfo &dst); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; struct ElementwiseUnaryKernel @@ -80,7 +83,7 @@ public: private: ElementWiseUnary _op{}; - ElementwiseUnaryUkernelPtr _run_method{ nullptr }; + ElementwiseUnaryUkernelPtr _run_method{nullptr}; std::string _name{}; std::unique_ptr<uint8_t[]> 
_lut{}; }; diff --git a/src/cpu/kernels/CpuFillKernel.cpp b/src/cpu/kernels/CpuFillKernel.cpp index f69de0082d..754da97ae1 100644 --- a/src/cpu/kernels/CpuFillKernel.cpp +++ b/src/cpu/kernels/CpuFillKernel.cpp @@ -28,6 +28,7 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -68,17 +69,18 @@ void CpuFillKernel::run_op(ITensorPack &tensors, const Window &window, const Thr collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); Iterator tensor_it(inout, collapsed); - execute_window_loop(collapsed, [&](const Coordinates &) - { - uint8_t *base_addr = start_valid_region + tensor_it.offset(); - // Set memory - for(int i = 0; i < window_width; ++i) + execute_window_loop( + collapsed, + [&](const Coordinates &) { - std::memcpy(base_addr + i * element_size, &_constant_value.value, element_size); - } - - }, - tensor_it); + uint8_t *base_addr = start_valid_region + tensor_it.offset(); + // Set memory + for (int i = 0; i < window_width; ++i) + { + std::memcpy(base_addr + i * element_size, &_constant_value.value, element_size); + } + }, + tensor_it); } const char *CpuFillKernel::name() const diff --git a/src/cpu/kernels/CpuFillKernel.h b/src/cpu/kernels/CpuFillKernel.h index ce41afc462..7c200c9b59 100644 --- a/src/cpu/kernels/CpuFillKernel.h +++ b/src/cpu/kernels/CpuFillKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_FILL_KERNEL_H #include "arm_compute/core/PixelValue.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" @@ -48,7 +49,7 @@ public: void configure(const ITensorInfo *tensor, const PixelValue &constant_value); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; private: diff --git a/src/cpu/kernels/CpuFloorKernel.cpp b/src/cpu/kernels/CpuFloorKernel.cpp index 65e390a81a..df7e6aad46 100644 --- a/src/cpu/kernels/CpuFloorKernel.cpp +++ b/src/cpu/kernels/CpuFloorKernel.cpp @@ -27,11 +27,11 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Validate.h" + +#include "src/core/common/Registrars.h" #include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - -#include "src/core/common/Registrars.h" #include "src/cpu/kernels/floor/list.h" namespace arm_compute @@ -42,29 +42,22 @@ namespace kernels { namespace { -static const std::vector<CpuFloorKernel::FloorKernel> available_kernels = -{ - { - "neon_fp16_floor", - [](const DataTypeISASelectorData & data) { return data.dt == DataType::F16 && data.isa.fp16; }, - REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_floor) - }, - { - "neon_fp32_floor", - [](const DataTypeISASelectorData & data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_floor) - } -}; +static const std::vector<CpuFloorKernel::FloorKernel> available_kernels = { + {"neon_fp16_floor", [](const DataTypeISASelectorData &data) { return data.dt == DataType::F16 && data.isa.fp16; }, + REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_floor)}, + {"neon_fp32_floor", [](const DataTypeISASelectorData &data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_floor)}}; Status validate_arguments(const ITensorInfo *src, const 
ITensorInfo *dst) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - const auto *uk = CpuFloorKernel::get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() }); + const auto *uk = + CpuFloorKernel::get_implementation(DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()}); ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); // Validate in case of configured output - if(dst->total_size() > 0) + if (dst->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); @@ -81,7 +74,8 @@ void CpuFloorKernel::configure(const ITensorInfo *src, ITensorInfo *dst) auto_init_if_empty(*dst, src->tensor_shape(), 1, src->data_type()); - const auto *uk = CpuFloorKernel::get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() }); + const auto *uk = + CpuFloorKernel::get_implementation(DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()}); ARM_COMPUTE_ERROR_ON_NULLPTR(uk); _run_method = uk->ukernel; @@ -122,17 +116,14 @@ void CpuFloorKernel::run_op(ITensorPack &tensors, const Window &window, const Th ITensor *dst = tensors.get_tensor(TensorType::ACL_DST); const auto len = static_cast<int>(window.x().end()) - static_cast<int>(window.x().start()); - Window win{ window }; + Window win{window}; win.set(Window::DimX, Window::Dimension(0, 1, 1)); Iterator src_it(src, win); Iterator dst_it(dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - _run_method(src_it.ptr(), dst_it.ptr(), len); - }, - src_it, dst_it); + execute_window_loop( + win, [&](const Coordinates &) { _run_method(src_it.ptr(), dst_it.ptr(), len); }, src_it, dst_it); } const char *CpuFloorKernel::name() const diff --git a/src/cpu/kernels/CpuFloorKernel.h b/src/cpu/kernels/CpuFloorKernel.h index 35ab534ca8..57107d0532 100644 --- a/src/cpu/kernels/CpuFloorKernel.h +++ b/src/cpu/kernels/CpuFloorKernel.h @@ -65,7 +65,7 @@ public: Window infer_window(const ITensorInfo *src, const ITensorInfo *dst); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; struct FloorKernel @@ -78,7 +78,7 @@ public: static const std::vector<FloorKernel> &get_available_kernels(); private: - FloorKernelPtr _run_method{ nullptr }; + FloorKernelPtr _run_method{nullptr}; std::string _name{}; }; } // namespace kernels diff --git a/src/cpu/kernels/CpuGemmInterleave4x4Kernel.cpp b/src/cpu/kernels/CpuGemmInterleave4x4Kernel.cpp index 9fbf2d54c6..db433c99a8 100644 --- a/src/cpu/kernels/CpuGemmInterleave4x4Kernel.cpp +++ b/src/cpu/kernels/CpuGemmInterleave4x4Kernel.cpp @@ -24,9 +24,10 @@ #include "src/cpu/kernels/CpuGemmInterleave4x4Kernel.h" #include "arm_compute/core/ITensor.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -60,7 +61,7 @@ Status CpuGemmInterleave4x4Kernel::validate(const ITensorInfo *src, const ITenso //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions. 
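
Reading note on the interleave kernel whose run_op follows: it rearranges each band of 4 input rows so that, for every column x, the 4 values of that column land contiguously at offset x * 4, with rows past the input height zero-padded, which lets the downstream matrix multiply stream one contiguous 4-element block per accumulation. A minimal scalar sketch of that layout transform, assuming a plain row-major float matrix; interleave_4x4 and its parameters are illustrative only.

#include <cstddef>
#include <vector>

// Illustrative scalar version of the 4x4 interleave: element (x, y) of a 4-row band
// is written to position x * 4 + y within the band, missing rows are padded with zeros.
std::vector<float> interleave_4x4(const std::vector<float> &in, size_t width, size_t height)
{
    const size_t bands = (height + 3) / 4; // number of 4-row bands, the last one possibly partial
    std::vector<float> out(bands * 4 * width, 0.0f);
    for (size_t band = 0; band < bands; ++band)
    {
        for (size_t x = 0; x < width; ++x)
        {
            for (size_t y = 0; y < 4; ++y)
            {
                const size_t row = band * 4 + y;
                if (row < height)
                {
                    out[(band * width + x) * 4 + y] = in[row * width + x];
                }
            }
        }
    }
    return out;
}
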
ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - if(dst->total_size() != 0) + if (dst->total_size() != 0) { const TensorShape dst_shape = compute_interleaved_shape(*src); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape); @@ -111,35 +112,42 @@ void CpuGemmInterleave4x4Kernel::run_op(ITensorPack &tensors, const Window &wind Iterator in(src, win); Iterator out(dst, win_out); - execute_window_loop(win, [&](const Coordinates & id) - { - if(id.y() + 4 <= static_cast<int>(in_height)) + execute_window_loop( + win, + [&](const Coordinates &id) { - for(size_t x = window_start_x; x < window_end_x; ++x) + if (id.y() + 4 <= static_cast<int>(in_height)) { - std::memcpy(out.ptr() + (x * 4 + 0) * element_size, (in.ptr() + 0 * in_stride) + x * element_size, element_size); - std::memcpy(out.ptr() + (x * 4 + 1) * element_size, (in.ptr() + 1 * in_stride) + x * element_size, element_size); - std::memcpy(out.ptr() + (x * 4 + 2) * element_size, (in.ptr() + 2 * in_stride) + x * element_size, element_size); - std::memcpy(out.ptr() + (x * 4 + 3) * element_size, (in.ptr() + 3 * in_stride) + x * element_size, element_size); - } - } - else - { - for(size_t x = window_start_x; x < window_end_x; ++x) - { - size_t y = 0; - for(; y < partial_y; ++y) + for (size_t x = window_start_x; x < window_end_x; ++x) { - std::memcpy(out.ptr() + (x * 4 + y) * element_size, (in.ptr() + y * in_stride) + x * element_size, element_size); + std::memcpy(out.ptr() + (x * 4 + 0) * element_size, (in.ptr() + 0 * in_stride) + x * element_size, + element_size); + std::memcpy(out.ptr() + (x * 4 + 1) * element_size, (in.ptr() + 1 * in_stride) + x * element_size, + element_size); + std::memcpy(out.ptr() + (x * 4 + 2) * element_size, (in.ptr() + 2 * in_stride) + x * element_size, + element_size); + std::memcpy(out.ptr() + (x * 4 + 3) * element_size, (in.ptr() + 3 * in_stride) + x * element_size, + element_size); } - for(; y < 4; ++y) + } + else + { + for (size_t x = window_start_x; x < window_end_x; ++x) { - std::memset(out.ptr() + (x * 4 + y) * element_size, 0, element_size); + size_t y = 0; + for (; y < partial_y; ++y) + { + std::memcpy(out.ptr() + (x * 4 + y) * element_size, + (in.ptr() + y * in_stride) + x * element_size, element_size); + } + for (; y < 4; ++y) + { + std::memset(out.ptr() + (x * 4 + y) * element_size, 0, element_size); + } } } - } - }, - in, out); + }, + in, out); } const char *CpuGemmInterleave4x4Kernel::name() const diff --git a/src/cpu/kernels/CpuGemmInterleave4x4Kernel.h b/src/cpu/kernels/CpuGemmInterleave4x4Kernel.h index 4fb6a52a8b..2ce34bc4bc 100644 --- a/src/cpu/kernels/CpuGemmInterleave4x4Kernel.h +++ b/src/cpu/kernels/CpuGemmInterleave4x4Kernel.h @@ -71,7 +71,7 @@ public: static Status validate(const ITensorInfo *src, const ITensorInfo *dst); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; }; } // namespace kernels diff --git a/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.cpp b/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.cpp index f8bef64066..a3ed2cd171 100644 --- a/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.cpp +++ b/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.cpp @@ -31,6 +31,7 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" + #include "src/core/helpers/AutoConfiguration.h" #include 
"src/core/helpers/WindowHelpers.h" @@ -44,646 +45,494 @@ namespace kernels { namespace { -void inline vector_matrix_multiply_u8(Iterator &ina, Iterator &inb, Iterator &out, int width_a, int width_b, int width_out, size_t stride_b, const Window &window) +void inline vector_matrix_multiply_u8(Iterator &ina, + Iterator &inb, + Iterator &out, + int width_a, + int width_b, + int width_out, + size_t stride_b, + const Window &window) { - execute_window_loop(window, [&](const Coordinates & id) - { - if(id.x() > width_b) - { - return; - } - - // Note: Since the input are all positives, we can use uint32_t - // Accumulators for the block 0 - uint32x4x4_t c0 = + execute_window_loop( + window, + [&](const Coordinates &id) { + if (id.x() > width_b) { - vdupq_n_u32(0), - vdupq_n_u32(0), - vdupq_n_u32(0), - vdupq_n_u32(0) + return; } - }; - - auto vec_a = reinterpret_cast<const uint8_t *>(ina.ptr()); - auto matrix_b = reinterpret_cast<const uint8_t *>(inb.ptr()); - auto vec_a_end_addr = vec_a + width_a; - - // This for loop performs 8 accumulations - for(; vec_a <= (vec_a_end_addr - 8);) - { - const uint8x8_t a00_u8 = vld1_u8(vec_a); - const uint8x16_t b00_u8 = vld1q_u8(matrix_b + 0 * stride_b); - const uint8x16_t b10_u8 = vld1q_u8(matrix_b + 1 * stride_b); - const uint8x16_t b20_u8 = vld1q_u8(matrix_b + 2 * stride_b); - const uint8x16_t b30_u8 = vld1q_u8(matrix_b + 3 * stride_b); - const uint8x16_t b40_u8 = vld1q_u8(matrix_b + 4 * stride_b); - const uint8x16_t b50_u8 = vld1q_u8(matrix_b + 5 * stride_b); - const uint8x16_t b60_u8 = vld1q_u8(matrix_b + 6 * stride_b); - const uint8x16_t b70_u8 = vld1q_u8(matrix_b + 7 * stride_b); - - // Convert a00_u8 to uint16_t and get the lower part - const uint16x4x2_t a00_u16 = - { - { - vget_low_u16(vmovl_u8(a00_u8)), - vget_high_u16(vmovl_u8(a00_u8)) - } - }; - - const uint16x4x4_t b00_u16 = - { - { - vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))), - vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))), - vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))), - vget_high_u16(vmovl_u8(vget_high_u8(b00_u8))) - } - }; - - const uint16x4x4_t b10_u16 = - { - { - vget_low_u16(vmovl_u8(vget_low_u8(b10_u8))), - vget_high_u16(vmovl_u8(vget_low_u8(b10_u8))), - vget_low_u16(vmovl_u8(vget_high_u8(b10_u8))), - vget_high_u16(vmovl_u8(vget_high_u8(b10_u8))) - } - }; - - const uint16x4x4_t b20_u16 = - { - { - vget_low_u16(vmovl_u8(vget_low_u8(b20_u8))), - vget_high_u16(vmovl_u8(vget_low_u8(b20_u8))), - vget_low_u16(vmovl_u8(vget_high_u8(b20_u8))), - vget_high_u16(vmovl_u8(vget_high_u8(b20_u8))) - } - }; - const uint16x4x4_t b30_u16 = - { - { - vget_low_u16(vmovl_u8(vget_low_u8(b30_u8))), - vget_high_u16(vmovl_u8(vget_low_u8(b30_u8))), - vget_low_u16(vmovl_u8(vget_high_u8(b30_u8))), - vget_high_u16(vmovl_u8(vget_high_u8(b30_u8))) - } - }; + // Note: Since the input are all positives, we can use uint32_t + // Accumulators for the block 0 + uint32x4x4_t c0 = {{vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0)}}; + + auto vec_a = reinterpret_cast<const uint8_t *>(ina.ptr()); + auto matrix_b = reinterpret_cast<const uint8_t *>(inb.ptr()); + auto vec_a_end_addr = vec_a + width_a; + + // This for loop performs 8 accumulations + for (; vec_a <= (vec_a_end_addr - 8);) + { + const uint8x8_t a00_u8 = vld1_u8(vec_a); + const uint8x16_t b00_u8 = vld1q_u8(matrix_b + 0 * stride_b); + const uint8x16_t b10_u8 = vld1q_u8(matrix_b + 1 * stride_b); + const uint8x16_t b20_u8 = vld1q_u8(matrix_b + 2 * stride_b); + const uint8x16_t b30_u8 = vld1q_u8(matrix_b + 3 * stride_b); + const uint8x16_t b40_u8 = 
vld1q_u8(matrix_b + 4 * stride_b); + const uint8x16_t b50_u8 = vld1q_u8(matrix_b + 5 * stride_b); + const uint8x16_t b60_u8 = vld1q_u8(matrix_b + 6 * stride_b); + const uint8x16_t b70_u8 = vld1q_u8(matrix_b + 7 * stride_b); + + // Convert a00_u8 to uint16_t and get the lower part + const uint16x4x2_t a00_u16 = {{vget_low_u16(vmovl_u8(a00_u8)), vget_high_u16(vmovl_u8(a00_u8))}}; + + const uint16x4x4_t b00_u16 = { + {vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b00_u8)))}}; + + const uint16x4x4_t b10_u16 = { + {vget_low_u16(vmovl_u8(vget_low_u8(b10_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b10_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b10_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b10_u8)))}}; + + const uint16x4x4_t b20_u16 = { + {vget_low_u16(vmovl_u8(vget_low_u8(b20_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b20_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b20_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b20_u8)))}}; + + const uint16x4x4_t b30_u16 = { + {vget_low_u16(vmovl_u8(vget_low_u8(b30_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b30_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b30_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b30_u8)))}}; + + const uint16x4x4_t b40_u16 = { + {vget_low_u16(vmovl_u8(vget_low_u8(b40_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b40_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b40_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b40_u8)))}}; + + const uint16x4x4_t b50_u16 = { + {vget_low_u16(vmovl_u8(vget_low_u8(b50_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b50_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b50_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b50_u8)))}}; + + const uint16x4x4_t b60_u16 = { + {vget_low_u16(vmovl_u8(vget_low_u8(b60_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b60_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b60_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b60_u8)))}}; + + const uint16x4x4_t b70_u16 = { + {vget_low_u16(vmovl_u8(vget_low_u8(b70_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b70_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b70_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b70_u8)))}}; + + // Accumulate 0: + c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16.val[0], 0); + c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16.val[0], 0); + c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16.val[0], 0); + c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16.val[0], 0); + + // Accumulate 1: + c0.val[0] = vmlal_lane_u16(c0.val[0], b10_u16.val[0], a00_u16.val[0], 1); + c0.val[1] = vmlal_lane_u16(c0.val[1], b10_u16.val[1], a00_u16.val[0], 1); + c0.val[2] = vmlal_lane_u16(c0.val[2], b10_u16.val[2], a00_u16.val[0], 1); + c0.val[3] = vmlal_lane_u16(c0.val[3], b10_u16.val[3], a00_u16.val[0], 1); + + // Accumulate 2: + c0.val[0] = vmlal_lane_u16(c0.val[0], b20_u16.val[0], a00_u16.val[0], 2); + c0.val[1] = vmlal_lane_u16(c0.val[1], b20_u16.val[1], a00_u16.val[0], 2); + c0.val[2] = vmlal_lane_u16(c0.val[2], b20_u16.val[2], a00_u16.val[0], 2); + c0.val[3] = vmlal_lane_u16(c0.val[3], b20_u16.val[3], a00_u16.val[0], 2); + + // Accumulate 3: + c0.val[0] = vmlal_lane_u16(c0.val[0], b30_u16.val[0], a00_u16.val[0], 3); + c0.val[1] = vmlal_lane_u16(c0.val[1], b30_u16.val[1], a00_u16.val[0], 3); + c0.val[2] = vmlal_lane_u16(c0.val[2], b30_u16.val[2], a00_u16.val[0], 3); + c0.val[3] = vmlal_lane_u16(c0.val[3], b30_u16.val[3], a00_u16.val[0], 3); + + // Accumulate 4: + c0.val[0] = vmlal_lane_u16(c0.val[0], 
b40_u16.val[0], a00_u16.val[1], 0); + c0.val[1] = vmlal_lane_u16(c0.val[1], b40_u16.val[1], a00_u16.val[1], 0); + c0.val[2] = vmlal_lane_u16(c0.val[2], b40_u16.val[2], a00_u16.val[1], 0); + c0.val[3] = vmlal_lane_u16(c0.val[3], b40_u16.val[3], a00_u16.val[1], 0); + + // Accumulate 5: + c0.val[0] = vmlal_lane_u16(c0.val[0], b50_u16.val[0], a00_u16.val[1], 1); + c0.val[1] = vmlal_lane_u16(c0.val[1], b50_u16.val[1], a00_u16.val[1], 1); + c0.val[2] = vmlal_lane_u16(c0.val[2], b50_u16.val[2], a00_u16.val[1], 1); + c0.val[3] = vmlal_lane_u16(c0.val[3], b50_u16.val[3], a00_u16.val[1], 1); + + // Accumulate 6: + c0.val[0] = vmlal_lane_u16(c0.val[0], b60_u16.val[0], a00_u16.val[1], 2); + c0.val[1] = vmlal_lane_u16(c0.val[1], b60_u16.val[1], a00_u16.val[1], 2); + c0.val[2] = vmlal_lane_u16(c0.val[2], b60_u16.val[2], a00_u16.val[1], 2); + c0.val[3] = vmlal_lane_u16(c0.val[3], b60_u16.val[3], a00_u16.val[1], 2); + + // Accumulate 7: + c0.val[0] = vmlal_lane_u16(c0.val[0], b70_u16.val[0], a00_u16.val[1], 3); + c0.val[1] = vmlal_lane_u16(c0.val[1], b70_u16.val[1], a00_u16.val[1], 3); + c0.val[2] = vmlal_lane_u16(c0.val[2], b70_u16.val[2], a00_u16.val[1], 3); + c0.val[3] = vmlal_lane_u16(c0.val[3], b70_u16.val[3], a00_u16.val[1], 3); + + vec_a += 8; + matrix_b += 8 * stride_b; + } - const uint16x4x4_t b40_u16 = + // This for loop performs the left-over accumulations + for (; vec_a < vec_a_end_addr;) { - { - vget_low_u16(vmovl_u8(vget_low_u8(b40_u8))), - vget_high_u16(vmovl_u8(vget_low_u8(b40_u8))), - vget_low_u16(vmovl_u8(vget_high_u8(b40_u8))), - vget_high_u16(vmovl_u8(vget_high_u8(b40_u8))) - } - }; + const uint8x8_t a00_u8 = vld1_dup_u8(vec_a); + const uint8x16_t b00_u8 = vld1q_u8(matrix_b); - const uint16x4x4_t b50_u16 = - { - { - vget_low_u16(vmovl_u8(vget_low_u8(b50_u8))), - vget_high_u16(vmovl_u8(vget_low_u8(b50_u8))), - vget_low_u16(vmovl_u8(vget_high_u8(b50_u8))), - vget_high_u16(vmovl_u8(vget_high_u8(b50_u8))) - } - }; + const uint16x4x4_t b00_u16 = { + {vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b00_u8)))}}; - const uint16x4x4_t b60_u16 = - { - { - vget_low_u16(vmovl_u8(vget_low_u8(b60_u8))), - vget_high_u16(vmovl_u8(vget_low_u8(b60_u8))), - vget_low_u16(vmovl_u8(vget_high_u8(b60_u8))), - vget_high_u16(vmovl_u8(vget_high_u8(b60_u8))) - } - }; + // Convert a00_u8 to uint16_t and get the lower part + const uint16x4_t a00_u16 = vget_low_u16(vmovl_u8(a00_u8)); - const uint16x4x4_t b70_u16 = - { - { - vget_low_u16(vmovl_u8(vget_low_u8(b70_u8))), - vget_high_u16(vmovl_u8(vget_low_u8(b70_u8))), - vget_low_u16(vmovl_u8(vget_high_u8(b70_u8))), - vget_high_u16(vmovl_u8(vget_high_u8(b70_u8))) - } - }; - - // Accumulate 0: - c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16.val[0], 0); - c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16.val[0], 0); - c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16.val[0], 0); - c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16.val[0], 0); - - // Accumulate 1: - c0.val[0] = vmlal_lane_u16(c0.val[0], b10_u16.val[0], a00_u16.val[0], 1); - c0.val[1] = vmlal_lane_u16(c0.val[1], b10_u16.val[1], a00_u16.val[0], 1); - c0.val[2] = vmlal_lane_u16(c0.val[2], b10_u16.val[2], a00_u16.val[0], 1); - c0.val[3] = vmlal_lane_u16(c0.val[3], b10_u16.val[3], a00_u16.val[0], 1); - - // Accumulate 2: - c0.val[0] = vmlal_lane_u16(c0.val[0], b20_u16.val[0], a00_u16.val[0], 2); - c0.val[1] = vmlal_lane_u16(c0.val[1], 
b20_u16.val[1], a00_u16.val[0], 2); - c0.val[2] = vmlal_lane_u16(c0.val[2], b20_u16.val[2], a00_u16.val[0], 2); - c0.val[3] = vmlal_lane_u16(c0.val[3], b20_u16.val[3], a00_u16.val[0], 2); - - // Accumulate 3: - c0.val[0] = vmlal_lane_u16(c0.val[0], b30_u16.val[0], a00_u16.val[0], 3); - c0.val[1] = vmlal_lane_u16(c0.val[1], b30_u16.val[1], a00_u16.val[0], 3); - c0.val[2] = vmlal_lane_u16(c0.val[2], b30_u16.val[2], a00_u16.val[0], 3); - c0.val[3] = vmlal_lane_u16(c0.val[3], b30_u16.val[3], a00_u16.val[0], 3); - - // Accumulate 4: - c0.val[0] = vmlal_lane_u16(c0.val[0], b40_u16.val[0], a00_u16.val[1], 0); - c0.val[1] = vmlal_lane_u16(c0.val[1], b40_u16.val[1], a00_u16.val[1], 0); - c0.val[2] = vmlal_lane_u16(c0.val[2], b40_u16.val[2], a00_u16.val[1], 0); - c0.val[3] = vmlal_lane_u16(c0.val[3], b40_u16.val[3], a00_u16.val[1], 0); - - // Accumulate 5: - c0.val[0] = vmlal_lane_u16(c0.val[0], b50_u16.val[0], a00_u16.val[1], 1); - c0.val[1] = vmlal_lane_u16(c0.val[1], b50_u16.val[1], a00_u16.val[1], 1); - c0.val[2] = vmlal_lane_u16(c0.val[2], b50_u16.val[2], a00_u16.val[1], 1); - c0.val[3] = vmlal_lane_u16(c0.val[3], b50_u16.val[3], a00_u16.val[1], 1); - - // Accumulate 6: - c0.val[0] = vmlal_lane_u16(c0.val[0], b60_u16.val[0], a00_u16.val[1], 2); - c0.val[1] = vmlal_lane_u16(c0.val[1], b60_u16.val[1], a00_u16.val[1], 2); - c0.val[2] = vmlal_lane_u16(c0.val[2], b60_u16.val[2], a00_u16.val[1], 2); - c0.val[3] = vmlal_lane_u16(c0.val[3], b60_u16.val[3], a00_u16.val[1], 2); - - // Accumulate 7: - c0.val[0] = vmlal_lane_u16(c0.val[0], b70_u16.val[0], a00_u16.val[1], 3); - c0.val[1] = vmlal_lane_u16(c0.val[1], b70_u16.val[1], a00_u16.val[1], 3); - c0.val[2] = vmlal_lane_u16(c0.val[2], b70_u16.val[2], a00_u16.val[1], 3); - c0.val[3] = vmlal_lane_u16(c0.val[3], b70_u16.val[3], a00_u16.val[1], 3); - - vec_a += 8; - matrix_b += 8 * stride_b; - } + // Accumulate 0: + c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16, 0); + c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16, 0); + c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16, 0); + c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16, 0); - // This for loop performs the left-over accumulations - for(; vec_a < vec_a_end_addr;) - { - const uint8x8_t a00_u8 = vld1_dup_u8(vec_a); - const uint8x16_t b00_u8 = vld1q_u8(matrix_b); + vec_a += 1; + matrix_b += stride_b; + } - const uint16x4x4_t b00_u16 = + auto vec_out = reinterpret_cast<int32_t *>(out.ptr()); + if (id.x() < (width_out - 16)) { - { - vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))), - vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))), - vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))), - vget_high_u16(vmovl_u8(vget_high_u8(b00_u8))) - } - }; - - // Convert a00_u8 to uint16_t and get the lower part - const uint16x4_t a00_u16 = vget_low_u16(vmovl_u8(a00_u8)); - - // Accumulate 0: - c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16, 0); - c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16, 0); - c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16, 0); - c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16, 0); - - vec_a += 1; - matrix_b += stride_b; - } - - auto vec_out = reinterpret_cast<int32_t *>(out.ptr()); - if(id.x() < (width_out - 16)) - { - vst1q_s32(vec_out + 0, vreinterpretq_s32_u32(c0.val[0])); - vst1q_s32(vec_out + 4, vreinterpretq_s32_u32(c0.val[1])); - vst1q_s32(vec_out + 8, vreinterpretq_s32_u32(c0.val[2])); - vst1q_s32(vec_out + 12, vreinterpretq_s32_u32(c0.val[3])); - } - else - { - auto left_over = width_out - 
id.x(); - for(auto k = 0; k < 4 && left_over; ++k) + vst1q_s32(vec_out + 0, vreinterpretq_s32_u32(c0.val[0])); + vst1q_s32(vec_out + 4, vreinterpretq_s32_u32(c0.val[1])); + vst1q_s32(vec_out + 8, vreinterpretq_s32_u32(c0.val[2])); + vst1q_s32(vec_out + 12, vreinterpretq_s32_u32(c0.val[3])); + } + else { - for(auto j = 0; j < 4 && left_over; ++j, --left_over) + auto left_over = width_out - id.x(); + for (auto k = 0; k < 4 && left_over; ++k) { - *(vec_out + k * 4 + j) = c0.val[k][j]; + for (auto j = 0; j < 4 && left_over; ++j, --left_over) + { + *(vec_out + k * 4 + j) = c0.val[k][j]; + } } } - } - }, - ina, inb, out); + }, + ina, inb, out); } -void inline vector_matrix_multiply_s8(Iterator &ina, Iterator &inb, Iterator &out, int width_a, int width_b, int width_out, size_t stride_b, const Window &window) +void inline vector_matrix_multiply_s8(Iterator &ina, + Iterator &inb, + Iterator &out, + int width_a, + int width_b, + int width_out, + size_t stride_b, + const Window &window) { - execute_window_loop(window, [&](const Coordinates & id) - { - if(id.x() > width_b) - { - return; - } - - // Accumulators for the block 0 - int32x4x4_t c0 = + execute_window_loop( + window, + [&](const Coordinates &id) { + if (id.x() > width_b) { - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0) + return; } - }; - - auto vec_a = reinterpret_cast<const int8_t *>(ina.ptr()); - auto matrix_b = reinterpret_cast<const int8_t *>(inb.ptr()); - auto vec_a_end_addr = vec_a + width_a; - - // This for loop performs 8 accumulations - for(; vec_a <= (vec_a_end_addr - 8);) - { - const int8x8_t a00_s8 = vld1_s8(vec_a); - const int8x16_t b00_s8 = vld1q_s8(matrix_b + 0 * stride_b); - const int8x16_t b10_s8 = vld1q_s8(matrix_b + 1 * stride_b); - const int8x16_t b20_s8 = vld1q_s8(matrix_b + 2 * stride_b); - const int8x16_t b30_s8 = vld1q_s8(matrix_b + 3 * stride_b); - const int8x16_t b40_s8 = vld1q_s8(matrix_b + 4 * stride_b); - const int8x16_t b50_s8 = vld1q_s8(matrix_b + 5 * stride_b); - const int8x16_t b60_s8 = vld1q_s8(matrix_b + 6 * stride_b); - const int8x16_t b70_s8 = vld1q_s8(matrix_b + 7 * stride_b); - - // Convert a00_s8 to int16_t and get the lower part - const int16x4x2_t a00_s16 = - { - { - vget_low_s16(vmovl_s8(a00_s8)), - vget_high_s16(vmovl_s8(a00_s8)) - } - }; - - const int16x4x4_t b00_s16 = - { - { - vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))), - vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))), - vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))), - vget_high_s16(vmovl_s8(vget_high_s8(b00_s8))) - } - }; - - const int16x4x4_t b10_s16 = - { - { - vget_low_s16(vmovl_s8(vget_low_s8(b10_s8))), - vget_high_s16(vmovl_s8(vget_low_s8(b10_s8))), - vget_low_s16(vmovl_s8(vget_high_s8(b10_s8))), - vget_high_s16(vmovl_s8(vget_high_s8(b10_s8))) - } - }; - const int16x4x4_t b20_s16 = - { - { - vget_low_s16(vmovl_s8(vget_low_s8(b20_s8))), - vget_high_s16(vmovl_s8(vget_low_s8(b20_s8))), - vget_low_s16(vmovl_s8(vget_high_s8(b20_s8))), - vget_high_s16(vmovl_s8(vget_high_s8(b20_s8))) - } - }; + // Accumulators for the block 0 + int32x4x4_t c0 = {{vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0)}}; + + auto vec_a = reinterpret_cast<const int8_t *>(ina.ptr()); + auto matrix_b = reinterpret_cast<const int8_t *>(inb.ptr()); + auto vec_a_end_addr = vec_a + width_a; + + // This for loop performs 8 accumulations + for (; vec_a <= (vec_a_end_addr - 8);) + { + const int8x8_t a00_s8 = vld1_s8(vec_a); + const int8x16_t b00_s8 = vld1q_s8(matrix_b + 0 * stride_b); + const int8x16_t b10_s8 = vld1q_s8(matrix_b + 1 * 
stride_b); + const int8x16_t b20_s8 = vld1q_s8(matrix_b + 2 * stride_b); + const int8x16_t b30_s8 = vld1q_s8(matrix_b + 3 * stride_b); + const int8x16_t b40_s8 = vld1q_s8(matrix_b + 4 * stride_b); + const int8x16_t b50_s8 = vld1q_s8(matrix_b + 5 * stride_b); + const int8x16_t b60_s8 = vld1q_s8(matrix_b + 6 * stride_b); + const int8x16_t b70_s8 = vld1q_s8(matrix_b + 7 * stride_b); + + // Convert a00_s8 to int16_t and get the lower part + const int16x4x2_t a00_s16 = {{vget_low_s16(vmovl_s8(a00_s8)), vget_high_s16(vmovl_s8(a00_s8))}}; + + const int16x4x4_t b00_s16 = { + {vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b00_s8)))}}; + + const int16x4x4_t b10_s16 = { + {vget_low_s16(vmovl_s8(vget_low_s8(b10_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b10_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b10_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b10_s8)))}}; + + const int16x4x4_t b20_s16 = { + {vget_low_s16(vmovl_s8(vget_low_s8(b20_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b20_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b20_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b20_s8)))}}; + + const int16x4x4_t b30_s16 = { + {vget_low_s16(vmovl_s8(vget_low_s8(b30_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b30_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b30_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b30_s8)))}}; + + const int16x4x4_t b40_s16 = { + {vget_low_s16(vmovl_s8(vget_low_s8(b40_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b40_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b40_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b40_s8)))}}; + + const int16x4x4_t b50_s16 = { + {vget_low_s16(vmovl_s8(vget_low_s8(b50_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b50_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b50_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b50_s8)))}}; + + const int16x4x4_t b60_s16 = { + {vget_low_s16(vmovl_s8(vget_low_s8(b60_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b60_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b60_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b60_s8)))}}; + + const int16x4x4_t b70_s16 = { + {vget_low_s16(vmovl_s8(vget_low_s8(b70_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b70_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b70_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b70_s8)))}}; + + // Accumulate 0: + c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16.val[0], 0); + c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16.val[0], 0); + c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16.val[0], 0); + c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16.val[0], 0); + + // Accumulate 1: + c0.val[0] = vmlal_lane_s16(c0.val[0], b10_s16.val[0], a00_s16.val[0], 1); + c0.val[1] = vmlal_lane_s16(c0.val[1], b10_s16.val[1], a00_s16.val[0], 1); + c0.val[2] = vmlal_lane_s16(c0.val[2], b10_s16.val[2], a00_s16.val[0], 1); + c0.val[3] = vmlal_lane_s16(c0.val[3], b10_s16.val[3], a00_s16.val[0], 1); + + // Accumulate 2: + c0.val[0] = vmlal_lane_s16(c0.val[0], b20_s16.val[0], a00_s16.val[0], 2); + c0.val[1] = vmlal_lane_s16(c0.val[1], b20_s16.val[1], a00_s16.val[0], 2); + c0.val[2] = vmlal_lane_s16(c0.val[2], b20_s16.val[2], a00_s16.val[0], 2); + c0.val[3] = vmlal_lane_s16(c0.val[3], b20_s16.val[3], a00_s16.val[0], 2); + + // Accumulate 3: + c0.val[0] = vmlal_lane_s16(c0.val[0], b30_s16.val[0], a00_s16.val[0], 3); + c0.val[1] = vmlal_lane_s16(c0.val[1], b30_s16.val[1], a00_s16.val[0], 3); + c0.val[2] = vmlal_lane_s16(c0.val[2], b30_s16.val[2], 
a00_s16.val[0], 3); + c0.val[3] = vmlal_lane_s16(c0.val[3], b30_s16.val[3], a00_s16.val[0], 3); + + // Accumulate 4: + c0.val[0] = vmlal_lane_s16(c0.val[0], b40_s16.val[0], a00_s16.val[1], 0); + c0.val[1] = vmlal_lane_s16(c0.val[1], b40_s16.val[1], a00_s16.val[1], 0); + c0.val[2] = vmlal_lane_s16(c0.val[2], b40_s16.val[2], a00_s16.val[1], 0); + c0.val[3] = vmlal_lane_s16(c0.val[3], b40_s16.val[3], a00_s16.val[1], 0); + + // Accumulate 5: + c0.val[0] = vmlal_lane_s16(c0.val[0], b50_s16.val[0], a00_s16.val[1], 1); + c0.val[1] = vmlal_lane_s16(c0.val[1], b50_s16.val[1], a00_s16.val[1], 1); + c0.val[2] = vmlal_lane_s16(c0.val[2], b50_s16.val[2], a00_s16.val[1], 1); + c0.val[3] = vmlal_lane_s16(c0.val[3], b50_s16.val[3], a00_s16.val[1], 1); + + // Accumulate 6: + c0.val[0] = vmlal_lane_s16(c0.val[0], b60_s16.val[0], a00_s16.val[1], 2); + c0.val[1] = vmlal_lane_s16(c0.val[1], b60_s16.val[1], a00_s16.val[1], 2); + c0.val[2] = vmlal_lane_s16(c0.val[2], b60_s16.val[2], a00_s16.val[1], 2); + c0.val[3] = vmlal_lane_s16(c0.val[3], b60_s16.val[3], a00_s16.val[1], 2); + + // Accumulate 7: + c0.val[0] = vmlal_lane_s16(c0.val[0], b70_s16.val[0], a00_s16.val[1], 3); + c0.val[1] = vmlal_lane_s16(c0.val[1], b70_s16.val[1], a00_s16.val[1], 3); + c0.val[2] = vmlal_lane_s16(c0.val[2], b70_s16.val[2], a00_s16.val[1], 3); + c0.val[3] = vmlal_lane_s16(c0.val[3], b70_s16.val[3], a00_s16.val[1], 3); + + vec_a += 8; + matrix_b += 8 * stride_b; + } - const int16x4x4_t b30_s16 = + // This for loop performs the left-over accumulations + for (; vec_a < vec_a_end_addr;) { - { - vget_low_s16(vmovl_s8(vget_low_s8(b30_s8))), - vget_high_s16(vmovl_s8(vget_low_s8(b30_s8))), - vget_low_s16(vmovl_s8(vget_high_s8(b30_s8))), - vget_high_s16(vmovl_s8(vget_high_s8(b30_s8))) - } - }; + const int8x8_t a00_s8 = vld1_dup_s8(vec_a); + const int8x16_t b00_s8 = vld1q_s8(matrix_b); - const int16x4x4_t b40_s16 = - { - { - vget_low_s16(vmovl_s8(vget_low_s8(b40_s8))), - vget_high_s16(vmovl_s8(vget_low_s8(b40_s8))), - vget_low_s16(vmovl_s8(vget_high_s8(b40_s8))), - vget_high_s16(vmovl_s8(vget_high_s8(b40_s8))) - } - }; + const int16x4x4_t b00_s16 = { + {vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b00_s8)))}}; - const int16x4x4_t b50_s16 = - { - { - vget_low_s16(vmovl_s8(vget_low_s8(b50_s8))), - vget_high_s16(vmovl_s8(vget_low_s8(b50_s8))), - vget_low_s16(vmovl_s8(vget_high_s8(b50_s8))), - vget_high_s16(vmovl_s8(vget_high_s8(b50_s8))) - } - }; + // Convert a00_s8 to uint16_t and get the lower part + const int16x4_t a00_s16 = vget_low_s16(vmovl_s8(a00_s8)); - const int16x4x4_t b60_s16 = - { - { - vget_low_s16(vmovl_s8(vget_low_s8(b60_s8))), - vget_high_s16(vmovl_s8(vget_low_s8(b60_s8))), - vget_low_s16(vmovl_s8(vget_high_s8(b60_s8))), - vget_high_s16(vmovl_s8(vget_high_s8(b60_s8))) - } - }; - - const int16x4x4_t b70_s16 = - { - { - vget_low_s16(vmovl_s8(vget_low_s8(b70_s8))), - vget_high_s16(vmovl_s8(vget_low_s8(b70_s8))), - vget_low_s16(vmovl_s8(vget_high_s8(b70_s8))), - vget_high_s16(vmovl_s8(vget_high_s8(b70_s8))) - } - }; - - // Accumulate 0: - c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16.val[0], 0); - c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16.val[0], 0); - c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16.val[0], 0); - c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16.val[0], 0); - - // Accumulate 1: - c0.val[0] = vmlal_lane_s16(c0.val[0], 
b10_s16.val[0], a00_s16.val[0], 1); - c0.val[1] = vmlal_lane_s16(c0.val[1], b10_s16.val[1], a00_s16.val[0], 1); - c0.val[2] = vmlal_lane_s16(c0.val[2], b10_s16.val[2], a00_s16.val[0], 1); - c0.val[3] = vmlal_lane_s16(c0.val[3], b10_s16.val[3], a00_s16.val[0], 1); - - // Accumulate 2: - c0.val[0] = vmlal_lane_s16(c0.val[0], b20_s16.val[0], a00_s16.val[0], 2); - c0.val[1] = vmlal_lane_s16(c0.val[1], b20_s16.val[1], a00_s16.val[0], 2); - c0.val[2] = vmlal_lane_s16(c0.val[2], b20_s16.val[2], a00_s16.val[0], 2); - c0.val[3] = vmlal_lane_s16(c0.val[3], b20_s16.val[3], a00_s16.val[0], 2); - - // Accumulate 3: - c0.val[0] = vmlal_lane_s16(c0.val[0], b30_s16.val[0], a00_s16.val[0], 3); - c0.val[1] = vmlal_lane_s16(c0.val[1], b30_s16.val[1], a00_s16.val[0], 3); - c0.val[2] = vmlal_lane_s16(c0.val[2], b30_s16.val[2], a00_s16.val[0], 3); - c0.val[3] = vmlal_lane_s16(c0.val[3], b30_s16.val[3], a00_s16.val[0], 3); - - // Accumulate 4: - c0.val[0] = vmlal_lane_s16(c0.val[0], b40_s16.val[0], a00_s16.val[1], 0); - c0.val[1] = vmlal_lane_s16(c0.val[1], b40_s16.val[1], a00_s16.val[1], 0); - c0.val[2] = vmlal_lane_s16(c0.val[2], b40_s16.val[2], a00_s16.val[1], 0); - c0.val[3] = vmlal_lane_s16(c0.val[3], b40_s16.val[3], a00_s16.val[1], 0); - - // Accumulate 5: - c0.val[0] = vmlal_lane_s16(c0.val[0], b50_s16.val[0], a00_s16.val[1], 1); - c0.val[1] = vmlal_lane_s16(c0.val[1], b50_s16.val[1], a00_s16.val[1], 1); - c0.val[2] = vmlal_lane_s16(c0.val[2], b50_s16.val[2], a00_s16.val[1], 1); - c0.val[3] = vmlal_lane_s16(c0.val[3], b50_s16.val[3], a00_s16.val[1], 1); - - // Accumulate 6: - c0.val[0] = vmlal_lane_s16(c0.val[0], b60_s16.val[0], a00_s16.val[1], 2); - c0.val[1] = vmlal_lane_s16(c0.val[1], b60_s16.val[1], a00_s16.val[1], 2); - c0.val[2] = vmlal_lane_s16(c0.val[2], b60_s16.val[2], a00_s16.val[1], 2); - c0.val[3] = vmlal_lane_s16(c0.val[3], b60_s16.val[3], a00_s16.val[1], 2); - - // Accumulate 7: - c0.val[0] = vmlal_lane_s16(c0.val[0], b70_s16.val[0], a00_s16.val[1], 3); - c0.val[1] = vmlal_lane_s16(c0.val[1], b70_s16.val[1], a00_s16.val[1], 3); - c0.val[2] = vmlal_lane_s16(c0.val[2], b70_s16.val[2], a00_s16.val[1], 3); - c0.val[3] = vmlal_lane_s16(c0.val[3], b70_s16.val[3], a00_s16.val[1], 3); - - vec_a += 8; - matrix_b += 8 * stride_b; - } + // Accumulate 0: + c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16, 0); + c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16, 0); + c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16, 0); + c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16, 0); - // This for loop performs the left-over accumulations - for(; vec_a < vec_a_end_addr;) - { - const int8x8_t a00_s8 = vld1_dup_s8(vec_a); - const int8x16_t b00_s8 = vld1q_s8(matrix_b); + vec_a += 1; + matrix_b += stride_b; + } - const int16x4x4_t b00_s16 = + auto vec_out = reinterpret_cast<int32_t *>(out.ptr()); + if (id.x() < (width_out - 16)) { - { - vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))), - vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))), - vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))), - vget_high_s16(vmovl_s8(vget_high_s8(b00_s8))) - } - }; - - // Convert a00_s8 to uint16_t and get the lower part - const int16x4_t a00_s16 = vget_low_s16(vmovl_s8(a00_s8)); - - // Accumulate 0: - c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16, 0); - c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16, 0); - c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16, 0); - c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16, 0); - - vec_a += 1; - matrix_b += 
stride_b; - } - - auto vec_out = reinterpret_cast<int32_t *>(out.ptr()); - if(id.x() < (width_out - 16)) - { - vst1q_s32(vec_out + 0, c0.val[0]); - vst1q_s32(vec_out + 4, c0.val[1]); - vst1q_s32(vec_out + 8, c0.val[2]); - vst1q_s32(vec_out + 12, c0.val[3]); - } - else - { - auto left_over = width_out - id.x(); - for(auto k = 0; k < 4 && left_over; ++k) + vst1q_s32(vec_out + 0, c0.val[0]); + vst1q_s32(vec_out + 4, c0.val[1]); + vst1q_s32(vec_out + 8, c0.val[2]); + vst1q_s32(vec_out + 12, c0.val[3]); + } + else { - for(auto j = 0; j < 4 && left_over; ++j, --left_over) + auto left_over = width_out - id.x(); + for (auto k = 0; k < 4 && left_over; ++k) { - *(vec_out + k * 4 + j) = c0.val[k][j]; + for (auto j = 0; j < 4 && left_over; ++j, --left_over) + { + *(vec_out + k * 4 + j) = c0.val[k][j]; + } } } - } - }, - ina, inb, out); + }, + ina, inb, out); } -void inline matrix_multiply_u8(Iterator &ina, Iterator &inb, Iterator &out, int width_b, const TensorInfo &out_info, const Window &window) +void inline matrix_multiply_u8( + Iterator &ina, Iterator &inb, Iterator &out, int width_b, const TensorInfo &out_info, const Window &window) { const auto width_out = static_cast<int>(out_info.dimension(0)); const auto height_out = static_cast<int>(out_info.dimension(1)); const size_t out_stride = out_info.strides_in_bytes()[1] / out_info.element_size(); - execute_window_loop(window, [&](const Coordinates & id) - { - const uint8_t *mtx_a0 = ina.ptr(); - const uint8_t *mtx_b0 = inb.ptr(); - - // Note: Since the input are all positives, we can use uint32_t - // Accumulators for the block 0 - uint32x4x4_t c0 = + execute_window_loop( + window, + [&](const Coordinates &id) { - { - vdupq_n_u32(0), - vdupq_n_u32(0), - vdupq_n_u32(0), - vdupq_n_u32(0) + const uint8_t *mtx_a0 = ina.ptr(); + const uint8_t *mtx_b0 = inb.ptr(); + + // Note: Since the input are all positives, we can use uint32_t + // Accumulators for the block 0 + uint32x4x4_t c0 = {{vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0)}}; + + // Accumulators for the block 1 + uint32x4x4_t c1 = {{vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0)}}; + + // Accumulators for the block 2 + uint32x4x4_t c2 = {{vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0)}}; + + // Accumulators for the block 3 + uint32x4x4_t c3 = {{vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0)}}; + + for (int k = 0; k < width_b; k += 16, mtx_a0 += 4, mtx_b0 += 16) + { + const uint8x8_t a00_u8 = vld1_u8(mtx_a0); + const uint8x16_t b00_u8 = vld1q_u8(mtx_b0); + + // Convert a00_u8 to uint16_t and get the lower part + const uint16x4_t a00_u16 = vget_low_u16(vmovl_u8(a00_u8)); + + // Convert b00_s8 to uint16_t + const uint16x4x4_t b00_u16 = { + {vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b00_u8)))}}; + + // 4x4 block 0 + c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16, 0); + c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16, 0); + c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16, 0); + c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16, 0); + + // 4x4 block 1 + c1.val[0] = vmlal_lane_u16(c1.val[0], b00_u16.val[0], a00_u16, 1); + c1.val[1] = vmlal_lane_u16(c1.val[1], b00_u16.val[1], a00_u16, 1); + c1.val[2] = vmlal_lane_u16(c1.val[2], b00_u16.val[2], a00_u16, 1); + c1.val[3] = vmlal_lane_u16(c1.val[3], b00_u16.val[3], a00_u16, 1); + + // 4x4 block 2 + c2.val[0] = 
vmlal_lane_u16(c2.val[0], b00_u16.val[0], a00_u16, 2); + c2.val[1] = vmlal_lane_u16(c2.val[1], b00_u16.val[1], a00_u16, 2); + c2.val[2] = vmlal_lane_u16(c2.val[2], b00_u16.val[2], a00_u16, 2); + c2.val[3] = vmlal_lane_u16(c2.val[3], b00_u16.val[3], a00_u16, 2); + + // 4x4 block 3 + c3.val[0] = vmlal_lane_u16(c3.val[0], b00_u16.val[0], a00_u16, 3); + c3.val[1] = vmlal_lane_u16(c3.val[1], b00_u16.val[1], a00_u16, 3); + c3.val[2] = vmlal_lane_u16(c3.val[2], b00_u16.val[2], a00_u16, 3); + c3.val[3] = vmlal_lane_u16(c3.val[3], b00_u16.val[3], a00_u16, 3); } - }; - // Accumulators for the block 1 - uint32x4x4_t c1 = - { - { - vdupq_n_u32(0), - vdupq_n_u32(0), - vdupq_n_u32(0), - vdupq_n_u32(0) - } - }; + auto mtx_out = reinterpret_cast<int32_t *>(out.ptr()); - // Accumulators for the block 2 - uint32x4x4_t c2 = - { - { - vdupq_n_u32(0), - vdupq_n_u32(0), - vdupq_n_u32(0), - vdupq_n_u32(0) - } - }; - - // Accumulators for the block 3 - uint32x4x4_t c3 = - { - { - vdupq_n_u32(0), - vdupq_n_u32(0), - vdupq_n_u32(0), - vdupq_n_u32(0) - } - }; - - for(int k = 0; k < width_b; k += 16, mtx_a0 += 4, mtx_b0 += 16) - { - const uint8x8_t a00_u8 = vld1_u8(mtx_a0); - const uint8x16_t b00_u8 = vld1q_u8(mtx_b0); - - // Convert a00_u8 to uint16_t and get the lower part - const uint16x4_t a00_u16 = vget_low_u16(vmovl_u8(a00_u8)); - - // Convert b00_s8 to uint16_t - const uint16x4x4_t b00_u16 = + if (id.y() < height_out && id.x() < (width_out - 16)) { + vst1q_s32(mtx_out + 0 * out_stride + 0, vreinterpretq_s32_u32(c0.val[0])); + vst1q_s32(mtx_out + 0 * out_stride + 4, vreinterpretq_s32_u32(c0.val[1])); + vst1q_s32(mtx_out + 0 * out_stride + 8, vreinterpretq_s32_u32(c0.val[2])); + vst1q_s32(mtx_out + 0 * out_stride + 12, vreinterpretq_s32_u32(c0.val[3])); + if (id.y() + 1 < height_out) { - vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))), - vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))), - vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))), - vget_high_u16(vmovl_u8(vget_high_u8(b00_u8))) - } - }; - - // 4x4 block 0 - c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16, 0); - c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16, 0); - c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16, 0); - c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16, 0); - - // 4x4 block 1 - c1.val[0] = vmlal_lane_u16(c1.val[0], b00_u16.val[0], a00_u16, 1); - c1.val[1] = vmlal_lane_u16(c1.val[1], b00_u16.val[1], a00_u16, 1); - c1.val[2] = vmlal_lane_u16(c1.val[2], b00_u16.val[2], a00_u16, 1); - c1.val[3] = vmlal_lane_u16(c1.val[3], b00_u16.val[3], a00_u16, 1); - - // 4x4 block 2 - c2.val[0] = vmlal_lane_u16(c2.val[0], b00_u16.val[0], a00_u16, 2); - c2.val[1] = vmlal_lane_u16(c2.val[1], b00_u16.val[1], a00_u16, 2); - c2.val[2] = vmlal_lane_u16(c2.val[2], b00_u16.val[2], a00_u16, 2); - c2.val[3] = vmlal_lane_u16(c2.val[3], b00_u16.val[3], a00_u16, 2); - - // 4x4 block 3 - c3.val[0] = vmlal_lane_u16(c3.val[0], b00_u16.val[0], a00_u16, 3); - c3.val[1] = vmlal_lane_u16(c3.val[1], b00_u16.val[1], a00_u16, 3); - c3.val[2] = vmlal_lane_u16(c3.val[2], b00_u16.val[2], a00_u16, 3); - c3.val[3] = vmlal_lane_u16(c3.val[3], b00_u16.val[3], a00_u16, 3); - } - - auto mtx_out = reinterpret_cast<int32_t *>(out.ptr()); - - if(id.y() < height_out && id.x() < (width_out - 16)) - { - vst1q_s32(mtx_out + 0 * out_stride + 0, vreinterpretq_s32_u32(c0.val[0])); - vst1q_s32(mtx_out + 0 * out_stride + 4, vreinterpretq_s32_u32(c0.val[1])); - vst1q_s32(mtx_out + 0 * out_stride + 8, vreinterpretq_s32_u32(c0.val[2])); - vst1q_s32(mtx_out + 0 * 
out_stride + 12, vreinterpretq_s32_u32(c0.val[3])); - if(id.y() + 1 < height_out) - { - vst1q_s32(mtx_out + 1 * out_stride + 0, vreinterpretq_s32_u32(c1.val[0])); - vst1q_s32(mtx_out + 1 * out_stride + 4, vreinterpretq_s32_u32(c1.val[1])); - vst1q_s32(mtx_out + 1 * out_stride + 8, vreinterpretq_s32_u32(c1.val[2])); - vst1q_s32(mtx_out + 1 * out_stride + 12, vreinterpretq_s32_u32(c1.val[3])); - if(id.y() + 2 < height_out) - { - vst1q_s32(mtx_out + 2 * out_stride + 0, vreinterpretq_s32_u32(c2.val[0])); - vst1q_s32(mtx_out + 2 * out_stride + 4, vreinterpretq_s32_u32(c2.val[1])); - vst1q_s32(mtx_out + 2 * out_stride + 8, vreinterpretq_s32_u32(c2.val[2])); - vst1q_s32(mtx_out + 2 * out_stride + 12, vreinterpretq_s32_u32(c2.val[3])); - if(id.y() + 3 < height_out) + vst1q_s32(mtx_out + 1 * out_stride + 0, vreinterpretq_s32_u32(c1.val[0])); + vst1q_s32(mtx_out + 1 * out_stride + 4, vreinterpretq_s32_u32(c1.val[1])); + vst1q_s32(mtx_out + 1 * out_stride + 8, vreinterpretq_s32_u32(c1.val[2])); + vst1q_s32(mtx_out + 1 * out_stride + 12, vreinterpretq_s32_u32(c1.val[3])); + if (id.y() + 2 < height_out) { - vst1q_s32(mtx_out + 3 * out_stride + 0, vreinterpretq_s32_u32(c3.val[0])); - vst1q_s32(mtx_out + 3 * out_stride + 4, vreinterpretq_s32_u32(c3.val[1])); - vst1q_s32(mtx_out + 3 * out_stride + 8, vreinterpretq_s32_u32(c3.val[2])); - vst1q_s32(mtx_out + 3 * out_stride + 12, vreinterpretq_s32_u32(c3.val[3])); + vst1q_s32(mtx_out + 2 * out_stride + 0, vreinterpretq_s32_u32(c2.val[0])); + vst1q_s32(mtx_out + 2 * out_stride + 4, vreinterpretq_s32_u32(c2.val[1])); + vst1q_s32(mtx_out + 2 * out_stride + 8, vreinterpretq_s32_u32(c2.val[2])); + vst1q_s32(mtx_out + 2 * out_stride + 12, vreinterpretq_s32_u32(c2.val[3])); + if (id.y() + 3 < height_out) + { + vst1q_s32(mtx_out + 3 * out_stride + 0, vreinterpretq_s32_u32(c3.val[0])); + vst1q_s32(mtx_out + 3 * out_stride + 4, vreinterpretq_s32_u32(c3.val[1])); + vst1q_s32(mtx_out + 3 * out_stride + 8, vreinterpretq_s32_u32(c3.val[2])); + vst1q_s32(mtx_out + 3 * out_stride + 12, vreinterpretq_s32_u32(c3.val[3])); + } } } } - } - else - { - const auto left_over_value = width_out - id.x(); - auto left_over = left_over_value; - for(auto k = 0; k < 4 && left_over; ++k) + else { - for(auto j = 0; j < 4 && left_over; ++j, --left_over) + const auto left_over_value = width_out - id.x(); + auto left_over = left_over_value; + for (auto k = 0; k < 4 && left_over; ++k) { - *(mtx_out + k * 4 + j) = c0.val[k][j]; - } - } - if(id.y() + 1 < height_out) - { - left_over = left_over_value; - for(auto k = 0; k < 4 && left_over; ++k) - { - for(auto j = 0; j < 4 && left_over; ++j, --left_over) + for (auto j = 0; j < 4 && left_over; ++j, --left_over) { - *(mtx_out + out_stride + k * 4 + j) = c1.val[k][j]; + *(mtx_out + k * 4 + j) = c0.val[k][j]; } } - if(id.y() + 2 < height_out) + if (id.y() + 1 < height_out) { left_over = left_over_value; - for(auto k = 0; k < 4 && left_over; ++k) + for (auto k = 0; k < 4 && left_over; ++k) { - for(auto j = 0; j < 4 && left_over; ++j, --left_over) + for (auto j = 0; j < 4 && left_over; ++j, --left_over) { - *(mtx_out + out_stride * 2 + k * 4 + j) = c2.val[k][j]; + *(mtx_out + out_stride + k * 4 + j) = c1.val[k][j]; } } - if(id.y() + 3 < height_out) + if (id.y() + 2 < height_out) { left_over = left_over_value; - for(auto k = 0; k < 4 && left_over; ++k) + for (auto k = 0; k < 4 && left_over; ++k) + { + for (auto j = 0; j < 4 && left_over; ++j, --left_over) + { + *(mtx_out + out_stride * 2 + k * 4 + j) = c2.val[k][j]; + } + } + if (id.y() + 3 < height_out) { 
- for(auto j = 0; j < 4 && left_over; ++j, --left_over) + left_over = left_over_value; + for (auto k = 0; k < 4 && left_over; ++k) { - *(mtx_out + out_stride * 3 + k * 4 + j) = c3.val[k][j]; + for (auto j = 0; j < 4 && left_over; ++j, --left_over) + { + *(mtx_out + out_stride * 3 + k * 4 + j) = c3.val[k][j]; + } } } } } } - } - }, - ina, inb, out); + }, + ina, inb, out); } -void inline matrix_multiply_s8(Iterator &ina, Iterator &inb, Iterator &out, int width_b, const TensorInfo &out_info, const Window &window) +void inline matrix_multiply_s8( + Iterator &ina, Iterator &inb, Iterator &out, int width_b, const TensorInfo &out_info, const Window &window) { const auto width_out = static_cast<int>(out_info.dimension(0)); const auto height_out = static_cast<int>(out_info.dimension(1)); @@ -691,182 +540,148 @@ void inline matrix_multiply_s8(Iterator &ina, Iterator &inb, Iterator &out, int // The implementation assumes that the matrix A and Matrix B have been reshaped respectively with CpuGemmInterleave4x4 and CpuGemmTranspose1xW // The reshaping of the matrices helps to have a cache friendly implementation and helps to avoid the data re-arrangements needed for computing 16x4 elements per iteration // All the values needed for computing a single 4x4 block will be read from consecutive memory positions - execute_window_loop(window, [&](const Coordinates & id) - { - auto *mtx_a0 = reinterpret_cast<const int8_t *>(ina.ptr()); - auto *mtx_b0 = reinterpret_cast<const int8_t *>(inb.ptr()); - - // Note: Since the input are all positives, we can use uint32_t - // Accumulators for the block 0 - int32x4x4_t c0 = + execute_window_loop( + window, + [&](const Coordinates &id) { - { - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0) + auto *mtx_a0 = reinterpret_cast<const int8_t *>(ina.ptr()); + auto *mtx_b0 = reinterpret_cast<const int8_t *>(inb.ptr()); + + // Note: Since the input are all positives, we can use uint32_t + // Accumulators for the block 0 + int32x4x4_t c0 = {{vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0)}}; + + // Accumulators for the block 1 + int32x4x4_t c1 = {{vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0)}}; + + // Accumulators for the block 2 + int32x4x4_t c2 = {{vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0)}}; + + // Accumulators for the block 3 + int32x4x4_t c3 = {{vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0)}}; + + for (int k = 0; k < width_b; k += 16, mtx_a0 += 4, mtx_b0 += 16) + { + const int8x8_t a00_s8 = vld1_s8(mtx_a0); + const int8x16_t b00_s8 = vld1q_s8(mtx_b0); + + // Convert a00_s8 to uint16_t and get the lower part + const int16x4_t a00_s16 = vget_low_s16(vmovl_s8(a00_s8)); + + // Convert b00_s8 to int16_t + const int16x4x4_t b00_s16 = { + {vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b00_s8)))}}; + + // 4x4 block 0 + c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16, 0); + c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16, 0); + c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16, 0); + c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16, 0); + + // 4x4 block 1 + c1.val[0] = vmlal_lane_s16(c1.val[0], b00_s16.val[0], a00_s16, 1); + c1.val[1] = vmlal_lane_s16(c1.val[1], b00_s16.val[1], a00_s16, 1); + c1.val[2] = vmlal_lane_s16(c1.val[2], b00_s16.val[2], a00_s16, 1); + c1.val[3] = vmlal_lane_s16(c1.val[3], b00_s16.val[3], 
a00_s16, 1); + + // 4x4 block 2 + c2.val[0] = vmlal_lane_s16(c2.val[0], b00_s16.val[0], a00_s16, 2); + c2.val[1] = vmlal_lane_s16(c2.val[1], b00_s16.val[1], a00_s16, 2); + c2.val[2] = vmlal_lane_s16(c2.val[2], b00_s16.val[2], a00_s16, 2); + c2.val[3] = vmlal_lane_s16(c2.val[3], b00_s16.val[3], a00_s16, 2); + + // 4x4 block 3 + c3.val[0] = vmlal_lane_s16(c3.val[0], b00_s16.val[0], a00_s16, 3); + c3.val[1] = vmlal_lane_s16(c3.val[1], b00_s16.val[1], a00_s16, 3); + c3.val[2] = vmlal_lane_s16(c3.val[2], b00_s16.val[2], a00_s16, 3); + c3.val[3] = vmlal_lane_s16(c3.val[3], b00_s16.val[3], a00_s16, 3); } - }; - - // Accumulators for the block 1 - int32x4x4_t c1 = - { - { - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0) - } - }; - - // Accumulators for the block 2 - int32x4x4_t c2 = - { - { - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0) - } - }; - - // Accumulators for the block 3 - int32x4x4_t c3 = - { - { - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0), - vdupq_n_s32(0) - } - }; - - for(int k = 0; k < width_b; k += 16, mtx_a0 += 4, mtx_b0 += 16) - { - const int8x8_t a00_s8 = vld1_s8(mtx_a0); - const int8x16_t b00_s8 = vld1q_s8(mtx_b0); - - // Convert a00_s8 to uint16_t and get the lower part - const int16x4_t a00_s16 = vget_low_s16(vmovl_s8(a00_s8)); - - // Convert b00_s8 to int16_t - const int16x4x4_t b00_s16 = - { - { - vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))), - vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))), - vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))), - vget_high_s16(vmovl_s8(vget_high_s8(b00_s8))) - } - }; - - // 4x4 block 0 - c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16, 0); - c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16, 0); - c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16, 0); - c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16, 0); - - // 4x4 block 1 - c1.val[0] = vmlal_lane_s16(c1.val[0], b00_s16.val[0], a00_s16, 1); - c1.val[1] = vmlal_lane_s16(c1.val[1], b00_s16.val[1], a00_s16, 1); - c1.val[2] = vmlal_lane_s16(c1.val[2], b00_s16.val[2], a00_s16, 1); - c1.val[3] = vmlal_lane_s16(c1.val[3], b00_s16.val[3], a00_s16, 1); - - // 4x4 block 2 - c2.val[0] = vmlal_lane_s16(c2.val[0], b00_s16.val[0], a00_s16, 2); - c2.val[1] = vmlal_lane_s16(c2.val[1], b00_s16.val[1], a00_s16, 2); - c2.val[2] = vmlal_lane_s16(c2.val[2], b00_s16.val[2], a00_s16, 2); - c2.val[3] = vmlal_lane_s16(c2.val[3], b00_s16.val[3], a00_s16, 2); - - // 4x4 block 3 - c3.val[0] = vmlal_lane_s16(c3.val[0], b00_s16.val[0], a00_s16, 3); - c3.val[1] = vmlal_lane_s16(c3.val[1], b00_s16.val[1], a00_s16, 3); - c3.val[2] = vmlal_lane_s16(c3.val[2], b00_s16.val[2], a00_s16, 3); - c3.val[3] = vmlal_lane_s16(c3.val[3], b00_s16.val[3], a00_s16, 3); - } - auto mtx_out = reinterpret_cast<int32_t *>(out.ptr()); - if(id.y() < height_out && id.x() < (width_out - 16)) - { - vst1q_s32(mtx_out + 0 * out_stride + 0, c0.val[0]); - vst1q_s32(mtx_out + 0 * out_stride + 4, c0.val[1]); - vst1q_s32(mtx_out + 0 * out_stride + 8, c0.val[2]); - vst1q_s32(mtx_out + 0 * out_stride + 12, c0.val[3]); - if(id.y() + 1 < height_out) - { - vst1q_s32(mtx_out + 1 * out_stride + 0, c1.val[0]); - vst1q_s32(mtx_out + 1 * out_stride + 4, c1.val[1]); - vst1q_s32(mtx_out + 1 * out_stride + 8, c1.val[2]); - vst1q_s32(mtx_out + 1 * out_stride + 12, c1.val[3]); - if(id.y() + 2 < height_out) - { - vst1q_s32(mtx_out + 2 * out_stride + 0, c2.val[0]); - vst1q_s32(mtx_out + 2 * out_stride + 4, c2.val[1]); - vst1q_s32(mtx_out + 2 * out_stride + 8, c2.val[2]); - 
vst1q_s32(mtx_out + 2 * out_stride + 12, c2.val[3]); - if(id.y() + 3 < height_out) + auto mtx_out = reinterpret_cast<int32_t *>(out.ptr()); + if (id.y() < height_out && id.x() < (width_out - 16)) + { + vst1q_s32(mtx_out + 0 * out_stride + 0, c0.val[0]); + vst1q_s32(mtx_out + 0 * out_stride + 4, c0.val[1]); + vst1q_s32(mtx_out + 0 * out_stride + 8, c0.val[2]); + vst1q_s32(mtx_out + 0 * out_stride + 12, c0.val[3]); + if (id.y() + 1 < height_out) + { + vst1q_s32(mtx_out + 1 * out_stride + 0, c1.val[0]); + vst1q_s32(mtx_out + 1 * out_stride + 4, c1.val[1]); + vst1q_s32(mtx_out + 1 * out_stride + 8, c1.val[2]); + vst1q_s32(mtx_out + 1 * out_stride + 12, c1.val[3]); + if (id.y() + 2 < height_out) { - vst1q_s32(mtx_out + 3 * out_stride + 0, c3.val[0]); - vst1q_s32(mtx_out + 3 * out_stride + 4, c3.val[1]); - vst1q_s32(mtx_out + 3 * out_stride + 8, c3.val[2]); - vst1q_s32(mtx_out + 3 * out_stride + 12, c3.val[3]); + vst1q_s32(mtx_out + 2 * out_stride + 0, c2.val[0]); + vst1q_s32(mtx_out + 2 * out_stride + 4, c2.val[1]); + vst1q_s32(mtx_out + 2 * out_stride + 8, c2.val[2]); + vst1q_s32(mtx_out + 2 * out_stride + 12, c2.val[3]); + if (id.y() + 3 < height_out) + { + vst1q_s32(mtx_out + 3 * out_stride + 0, c3.val[0]); + vst1q_s32(mtx_out + 3 * out_stride + 4, c3.val[1]); + vst1q_s32(mtx_out + 3 * out_stride + 8, c3.val[2]); + vst1q_s32(mtx_out + 3 * out_stride + 12, c3.val[3]); + } } } } - } - else if(id.y() < height_out) - { - const auto left_over_value = width_out - id.x(); - auto left_over = left_over_value; - for(auto k = 0; k < 4 && left_over; ++k) - { - for(auto j = 0; j < 4 && left_over; ++j, --left_over) - { - *(mtx_out + k * 4 + j) = c0.val[k][j]; - } - } - if(id.y() + 1 < height_out) + else if (id.y() < height_out) { - left_over = left_over_value; - for(auto k = 0; k < 4 && left_over; ++k) + const auto left_over_value = width_out - id.x(); + auto left_over = left_over_value; + for (auto k = 0; k < 4 && left_over; ++k) { - for(auto j = 0; j < 4 && left_over; ++j, --left_over) + for (auto j = 0; j < 4 && left_over; ++j, --left_over) { - *(mtx_out + out_stride + k * 4 + j) = c1.val[k][j]; + *(mtx_out + k * 4 + j) = c0.val[k][j]; } } - if(id.y() + 2 < height_out) + if (id.y() + 1 < height_out) { left_over = left_over_value; - for(auto k = 0; k < 4 && left_over; ++k) + for (auto k = 0; k < 4 && left_over; ++k) { - for(auto j = 0; j < 4 && left_over; ++j, --left_over) + for (auto j = 0; j < 4 && left_over; ++j, --left_over) { - *(mtx_out + out_stride * 2 + k * 4 + j) = c2.val[k][j]; + *(mtx_out + out_stride + k * 4 + j) = c1.val[k][j]; } } - if(id.y() + 3 < height_out) + if (id.y() + 2 < height_out) { left_over = left_over_value; - for(auto k = 0; k < 4 && left_over; ++k) + for (auto k = 0; k < 4 && left_over; ++k) { - for(auto j = 0; j < 4 && left_over; ++j, --left_over) + for (auto j = 0; j < 4 && left_over; ++j, --left_over) { - *(mtx_out + out_stride * 3 + k * 4 + j) = c3.val[k][j]; + *(mtx_out + out_stride * 2 + k * 4 + j) = c2.val[k][j]; + } + } + if (id.y() + 3 < height_out) + { + left_over = left_over_value; + for (auto k = 0; k < 4 && left_over; ++k) + { + for (auto j = 0; j < 4 && left_over; ++j, --left_over) + { + *(mtx_out + out_stride * 3 + k * 4 + j) = c3.val[k][j]; + } } } } } } - } - - }, - ina, inb, out); + }, + ina, inb, out); } Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S8, DataType::U8); - 
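For reference, the vmlal_lane_s16 chains in the kernels above implement a plain widening multiply-accumulate: every int8 product is formed exactly in 16-bit lanes and summed into 32-bit accumulators, so no intermediate saturation can occur. A minimal scalar sketch of the same arithmetic, assuming ordinary row-major storage rather than the interleaved/transposed layouts the kernel expects; function and parameter names are illustrative only:

#include <cstddef>
#include <cstdint>

// Scalar reference for the widening s8 multiply-accumulate: dst[n] = sum_k a[k] * b[k][n],
// with each product widened before accumulation into int32.
void vector_matrix_multiply_s8_ref(const int8_t *vec_a, const int8_t *matrix_b,
                                   int32_t *dst, int width_a, int width_out,
                                   std::size_t stride_b)
{
    for (int n = 0; n < width_out; ++n)
    {
        int32_t acc = 0;
        for (int k = 0; k < width_a; ++k)
        {
            acc += static_cast<int32_t>(vec_a[k]) * static_cast<int32_t>(matrix_b[k * stride_b + n]);
        }
        dst[n] = acc;
    }
}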
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::S8, DataType::U8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::S8, DataType::U8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::S8, + DataType::U8); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); TensorShape in0_shape = src0->tensor_shape(); @@ -874,9 +689,10 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons TensorShape out_shape = dst->tensor_shape(); // Check vector-by-matrix case - if(out_shape[1] == 1) + if (out_shape[1] == 1) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(in0_shape[0] != in1_shape[1], "The number of input0's columns must be equal to input1's rows"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(in0_shape[0] != in1_shape[1], + "The number of input0's columns must be equal to input1's rows"); } else { @@ -884,8 +700,11 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons in1_shape.collapse(2); out_shape.collapse(2); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(in0_shape[2] != out_shape[2], "Output tensor must have the same number of batches of input0 tensor"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(in1_shape[2] != 1 && in0_shape[2] != in1_shape[2], "Input1 tensor must have the same number of batches of input0 or the number of batches must be set to 1"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(in0_shape[2] != out_shape[2], + "Output tensor must have the same number of batches of input0 tensor"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + in1_shape[2] != 1 && in0_shape[2] != in1_shape[2], + "Input1 tensor must have the same number of batches of input0 or the number of batches must be set to 1"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(in1_shape[0] % 16, "Input1's width must be a multiple of 16"); } @@ -909,20 +728,22 @@ void CpuGemmLowpMatrixMultiplyKernel::configure(const ITensorInfo *src0, const I Window win; // Check if the output tensor is a vector. If so,the kernel runs the vector-matrix multiplication - if((dst->dimension(1) == 1)) + if ((dst->dimension(1) == 1)) { // Configure kernel window win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x)); } else { - win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + win = + calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); } ICpuKernel::configure(win); } -Status CpuGemmLowpMatrixMultiplyKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst) +Status +CpuGemmLowpMatrixMultiplyKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst)); return Status{}; @@ -939,12 +760,13 @@ void CpuGemmLowpMatrixMultiplyKernel::run_op(ITensorPack &tensors, const Window auto dst = tensors.get_tensor(TensorType::ACL_DST); // Check if the output tensor is a vector. 
If so,the kernel runs the vector-matrix multiplication path - if((dst->info()->dimension(1) == 1)) + if ((dst->info()->dimension(1) == 1)) { const auto width_matrix_a = static_cast<int>(src0->info()->dimension(0)); const auto width_matrix_b = static_cast<int>(src1->info()->dimension(0)); const auto width_out = static_cast<int>(dst->info()->dimension(0)); - const auto in_b_stride = static_cast<int>(src1->info()->strides_in_bytes()[1] / data_size_from_type(src1->info()->data_type())); + const auto in_b_stride = + static_cast<int>(src1->info()->strides_in_bytes()[1] / data_size_from_type(src1->info()->data_type())); // The implementation computes 16 elements per iteration const int window_start_x = 16 * info.thread_id; @@ -963,7 +785,7 @@ void CpuGemmLowpMatrixMultiplyKernel::run_op(ITensorPack &tensors, const Window Window win_b; // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 // This scenario can happen when the the matrix multiplication is used to perform a convolution operation - if(src1->info()->num_dimensions() >= 3) + if (src1->info()->num_dimensions() >= 3) { win_b = window; } @@ -974,18 +796,20 @@ void CpuGemmLowpMatrixMultiplyKernel::run_op(ITensorPack &tensors, const Window Iterator inb(src1, win_b); Iterator out(dst, win_out); - switch(src0->info()->data_type()) + switch (src0->info()->data_type()) { case DataType::S8: case DataType::QASYMM8_SIGNED: { - vector_matrix_multiply_s8(ina, inb, out, width_matrix_a, width_matrix_b, width_out, in_b_stride, window); + vector_matrix_multiply_s8(ina, inb, out, width_matrix_a, width_matrix_b, width_out, in_b_stride, + window); break; } case DataType::U8: case DataType::QASYMM8: { - vector_matrix_multiply_u8(ina, inb, out, width_matrix_a, width_matrix_b, width_out, in_b_stride, window); + vector_matrix_multiply_u8(ina, inb, out, width_matrix_a, width_matrix_b, width_out, in_b_stride, + window); break; } default: @@ -1009,7 +833,7 @@ void CpuGemmLowpMatrixMultiplyKernel::run_op(ITensorPack &tensors, const Window Window win_b; // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 // This scenario can happen when the the matrix multiplication is used to perform a convolution operation - if(_slide_matrix_b) + if (_slide_matrix_b) { win_b = window; } @@ -1021,7 +845,7 @@ void CpuGemmLowpMatrixMultiplyKernel::run_op(ITensorPack &tensors, const Window Iterator inb(src1, win_b); Iterator out(dst, window); - switch(src0->info()->data_type()) + switch (src0->info()->data_type()) { case DataType::S8: case DataType::QASYMM8_SIGNED: @@ -1050,4 +874,4 @@ const char *CpuGemmLowpMatrixMultiplyKernel::name() const } } // namespace kernels } // namespace cpu -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h b/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h index 2cc789d6d9..439ada1b47 100644 --- a/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h +++ b/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h @@ -68,11 +68,11 @@ public: static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; private: - bool _slide_matrix_b{ true }; + bool _slide_matrix_b{true}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.cpp b/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.cpp index 534076b97c..9bd1eae663 100644 --- a/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.cpp +++ b/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.cpp @@ -26,9 +26,10 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/TensorInfo.h" -#include "src/core/NEON/wrapper/wrapper.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute { @@ -38,37 +39,49 @@ namespace kernels { namespace { -Status validate_arguments_matrix_a_reduction(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info) +Status validate_arguments_matrix_a_reduction(const ITensorInfo *src, + const ITensorInfo *dst, + const GEMMLowpReductionKernelInfo &info) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_ERROR_ON_MSG(info.is_reshaped == true, "Not supported"); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); - if(dst->total_size() > 0) + if (dst->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->dimension(0) != src->dimension(1), "Output vector must have length equal to the number of rows of the input matrix"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + dst->dimension(0) != src->dimension(1), + "Output vector must have length equal to the number of rows of the input matrix"); } return Status{}; } -Status validate_arguments_matrix_b_reduction(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info) +Status validate_arguments_matrix_b_reduction(const ITensorInfo *src, + const ITensorInfo *dst, + const GEMMLowpReductionKernelInfo &info) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_ERROR_ON_MSG(info.is_reshaped == true, "Not supported"); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); - if(dst->total_size() > 0) + if (dst->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 
1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->dimension(0) != src->dimension(0), "Output vector must have length equal to the number of columns of the input matrix"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + dst->dimension(0) != src->dimension(0), + "Output vector must have length equal to the number of columns of the input matrix"); } return Status{}; } } // namespace -void CpuGemmLowpMatrixAReductionKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info) +void CpuGemmLowpMatrixAReductionKernel::configure(const ITensorInfo *src, + ITensorInfo *dst, + const GEMMLowpReductionKernelInfo &info) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); @@ -77,7 +90,7 @@ void CpuGemmLowpMatrixAReductionKernel::configure(const ITensorInfo *src, ITenso _scalar = info.scalar; _mul_by_scalar = info.mul_by_scalar; - switch(src->data_type()) + switch (src->data_type()) { case DataType::QASYMM8: _func = &CpuGemmLowpMatrixAReductionKernel::run_internal<uint8_t>; @@ -98,14 +111,18 @@ void CpuGemmLowpMatrixAReductionKernel::configure(const ITensorInfo *src, ITenso ICpuKernel::configure(win); } -Status CpuGemmLowpMatrixAReductionKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info) +Status CpuGemmLowpMatrixAReductionKernel::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const GEMMLowpReductionKernelInfo &info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_a_reduction(src, dst, info)); return Status{}; } template <typename T> -void CpuGemmLowpMatrixAReductionKernel::run_internal(const ITensor *src, ITensor *dst, const arm_compute::Window &window) +void CpuGemmLowpMatrixAReductionKernel::run_internal(const ITensor *src, + ITensor *dst, + const arm_compute::Window &window) { // Intermediate and final accumulator types using TIAcc = wrapper::traits::promote_t<T>; @@ -121,55 +138,58 @@ void CpuGemmLowpMatrixAReductionKernel::run_internal(const ITensor *src, ITensor Iterator in(src, win_input); Iterator out(dst, collapsed_window); - execute_window_loop(collapsed_window, [&](const Coordinates & id) - { - auto vsum_row = wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}); - TAcc sum_row = 0; + execute_window_loop( + collapsed_window, + [&](const Coordinates &id) + { + auto vsum_row = wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}); + TAcc sum_row = 0; - const T *matrix_a = reinterpret_cast<const T *>((in.ptr() + id.x() * src->info()->strides_in_bytes()[1] + id.y() * src->info()->strides_in_bytes()[2])); + const T *matrix_a = reinterpret_cast<const T *>( + (in.ptr() + id.x() * src->info()->strides_in_bytes()[1] + id.y() * src->info()->strides_in_bytes()[2])); #if __arm__ - asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a)); + asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a)); #endif /* __arm__ */ - int i = 0; - // This for loop performs 16 accumulations - for(; i <= (_k - 16); i += 16) - { - const auto a0_d8 = wrapper::vloadq(matrix_a + i); + int i = 0; + // This for loop performs 16 accumulations + for (; i <= (_k - 16); i += 16) + { + const auto a0_d8 = wrapper::vloadq(matrix_a + i); - // Partial accumulations in U16 - const auto tmp_sum0 = wrapper::vaddl(wrapper::vgetlow(a0_d8), wrapper::vgethigh(a0_d8)); + // Partial accumulations in U16 + const auto tmp_sum0 = wrapper::vaddl(wrapper::vgetlow(a0_d8), wrapper::vgethigh(a0_d8)); - // Accumulate to U32 - vsum_row = wrapper::vadd(vsum_row, wrapper::vpaddl(tmp_sum0)); - } + // 
Accumulate to U32 + vsum_row = wrapper::vadd(vsum_row, wrapper::vpaddl(tmp_sum0)); + } - // This for loop performs the leftover accumulations - for(; i < _k; ++i) - { - sum_row += static_cast<TAcc>(matrix_a[i]); - } + // This for loop performs the leftover accumulations + for (; i < _k; ++i) + { + sum_row += static_cast<TAcc>(matrix_a[i]); + } #if defined(__aarch64__) - // Reduction operation available on 64 bit architectures only - sum_row += wrapper::vaddv(vsum_row); + // Reduction operation available on 64 bit architectures only + sum_row += wrapper::vaddv(vsum_row); #else // __aarch64__ - auto tmp = wrapper::vpadd(wrapper::vgethigh(vsum_row), wrapper::vgetlow(vsum_row)); - tmp = wrapper::vpadd(tmp, tmp); + auto tmp = wrapper::vpadd(wrapper::vgethigh(vsum_row), wrapper::vgetlow(vsum_row)); + tmp = wrapper::vpadd(tmp, tmp); - sum_row += wrapper::vgetlane(tmp, 0); + sum_row += wrapper::vgetlane(tmp, 0); #endif // __aarch64__ - // Multiply by scalar if necessary - if(_mul_by_scalar) - { - sum_row *= _scalar; - } + // Multiply by scalar if necessary + if (_mul_by_scalar) + { + sum_row *= _scalar; + } - *(reinterpret_cast<int *>(out.ptr())) = static_cast<int32_t>(sum_row); - }, - in, out); + *(reinterpret_cast<int *>(out.ptr())) = static_cast<int32_t>(sum_row); + }, + in, out); } void CpuGemmLowpMatrixAReductionKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) @@ -189,7 +209,9 @@ const char *CpuGemmLowpMatrixAReductionKernel::name() const return "CpuGemmLowpMatrixAReductionKernel"; } -void CpuGemmLowpMatrixBReductionKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info) +void CpuGemmLowpMatrixBReductionKernel::configure(const ITensorInfo *src, + ITensorInfo *dst, + const GEMMLowpReductionKernelInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_b_reduction(src, dst, info)); @@ -201,7 +223,7 @@ void CpuGemmLowpMatrixBReductionKernel::configure(const ITensorInfo *src, ITenso // Configure kernel window constexpr unsigned int num_elems_processed_per_iteration = 16; - switch(src->data_type()) + switch (src->data_type()) { case DataType::QASYMM8: _func = &CpuGemmLowpMatrixBReductionKernel::run_internal<uint8_t>; @@ -223,14 +245,19 @@ void CpuGemmLowpMatrixBReductionKernel::configure(const ITensorInfo *src, ITenso ICpuKernel::configure(win); } -Status CpuGemmLowpMatrixBReductionKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info) +Status CpuGemmLowpMatrixBReductionKernel::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const GEMMLowpReductionKernelInfo &info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_b_reduction(src, dst, info)); return Status{}; } template <typename T> -void CpuGemmLowpMatrixBReductionKernel::run_internal(const ITensor *src, ITensor *dst, const Window &window, const ThreadInfo &info) +void CpuGemmLowpMatrixBReductionKernel::run_internal(const ITensor *src, + ITensor *dst, + const Window &window, + const ThreadInfo &info) { // Intermediate and final accumulator types using TIAcc = wrapper::traits::promote_t<T>; @@ -258,121 +285,116 @@ void CpuGemmLowpMatrixBReductionKernel::run_internal(const ITensor *src, ITensor Iterator inb(src, win_in); Iterator out(dst, win_out); - execute_window_loop(win_out, [&](const Coordinates & id) - { - if(id.x() > width_matrix_b) + execute_window_loop( + win_out, + [&](const Coordinates &id) { - return; - } + if (id.x() > width_matrix_b) + { 
+ return; + } - // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation - typename wrapper::traits::neon_bitvector<TAcc, wrapper::traits::BitWidth::W128>::type sum_col[4] = - { - wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}), - wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}), - wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}), - wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}) - }; + // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation + typename wrapper::traits::neon_bitvector<TAcc, wrapper::traits::BitWidth::W128>::type sum_col[4] = { + wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}), + wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}), + wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}), + wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{})}; - const auto *matrix_b = reinterpret_cast<const T *>(inb.ptr() + id.y() * src->info()->strides_in_bytes()[2]); + const auto *matrix_b = reinterpret_cast<const T *>(inb.ptr() + id.y() * src->info()->strides_in_bytes()[2]); #if __arm__ - asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b)); - asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b + in_b_stride)); + asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b)); + asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b + in_b_stride)); #endif /* __arm__ */ - int i = 0; - // This for loop performs 4 accumulations - for(; i <= (_k - 4); i += 4) - { - const auto b0_u8 = wrapper::vloadq(matrix_b + 0 * in_b_stride); - const auto b1_u8 = wrapper::vloadq(matrix_b + 1 * in_b_stride); - const auto b2_u8 = wrapper::vloadq(matrix_b + 2 * in_b_stride); - const auto b3_u8 = wrapper::vloadq(matrix_b + 3 * in_b_stride); + int i = 0; + // This for loop performs 4 accumulations + for (; i <= (_k - 4); i += 4) + { + const auto b0_u8 = wrapper::vloadq(matrix_b + 0 * in_b_stride); + const auto b1_u8 = wrapper::vloadq(matrix_b + 1 * in_b_stride); + const auto b2_u8 = wrapper::vloadq(matrix_b + 2 * in_b_stride); + const auto b3_u8 = wrapper::vloadq(matrix_b + 3 * in_b_stride); #if __arm__ - asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 1 * in_b_stride)); - asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 2 * in_b_stride)); - asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 3 * in_b_stride)); - asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 4 * in_b_stride)); + asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 1 * in_b_stride)); + asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 2 * in_b_stride)); + asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 3 * in_b_stride)); + asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 4 * in_b_stride)); #endif /* __arm__ */ - // Partial accumulation in 16bit - typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W128>::type tmp_sum[2] = - { - wrapper::vdup_n(static_cast<TIAcc>(0), wrapper::traits::vector_128_tag{}), - wrapper::vdup_n(static_cast<TIAcc>(0), wrapper::traits::vector_128_tag{}) - }; - - tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b1_u8)); - tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b0_u8)); - tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b2_u8)); - tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b3_u8)); - tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b0_u8)); - tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b1_u8)); - tmp_sum[1] = 
wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b2_u8)); - tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b3_u8)); - - // Accumulate to 32bit - sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(tmp_sum[0])); - sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(tmp_sum[0])); - sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(tmp_sum[1])); - sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(tmp_sum[1])); - - matrix_b += 4 * in_b_stride; - } - - // This for loop perfoms the leftover accumulations - for(; i < _k; ++i) - { - const auto b0_b8 = wrapper::vloadq(matrix_b + 0 * in_b_stride); + // Partial accumulation in 16bit + typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W128>::type tmp_sum[2] = { + wrapper::vdup_n(static_cast<TIAcc>(0), wrapper::traits::vector_128_tag{}), + wrapper::vdup_n(static_cast<TIAcc>(0), wrapper::traits::vector_128_tag{})}; + + tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b1_u8)); + tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b0_u8)); + tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b2_u8)); + tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b3_u8)); + tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b0_u8)); + tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b1_u8)); + tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b2_u8)); + tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b3_u8)); + + // Accumulate to 32bit + sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(tmp_sum[0])); + sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(tmp_sum[0])); + sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(tmp_sum[1])); + sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(tmp_sum[1])); + + matrix_b += 4 * in_b_stride; + } - // Convert S8 to S16 - const typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W128>::type b0_b16[2] + // This for loop perfoms the leftover accumulations + for (; i < _k; ++i) { - wrapper::vmovl(wrapper::vgetlow(b0_b8)), - wrapper::vmovl(wrapper::vgethigh(b0_b8)) - }; + const auto b0_b8 = wrapper::vloadq(matrix_b + 0 * in_b_stride); - // Accumulate to 32bit - sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(b0_b16[0])); - sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(b0_b16[0])); - sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(b0_b16[1])); - sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(b0_b16[1])); + // Convert S8 to S16 + const typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W128>::type b0_b16[2]{ + wrapper::vmovl(wrapper::vgetlow(b0_b8)), wrapper::vmovl(wrapper::vgethigh(b0_b8))}; - matrix_b += in_b_stride; - } + // Accumulate to 32bit + sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(b0_b16[0])); + sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(b0_b16[0])); + sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(b0_b16[1])); + sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(b0_b16[1])); - // Multiply by scalar if necessary - if(_mul_by_scalar) - { - sum_col[0] = wrapper::vmul(sum_col[0], vec_scalar); - sum_col[1] = wrapper::vmul(sum_col[1], vec_scalar); - sum_col[2] = wrapper::vmul(sum_col[2], vec_scalar); - sum_col[3] = wrapper::vmul(sum_col[3], vec_scalar); - } - - auto vector_sum_col = reinterpret_cast<int32_t *>(out.ptr()); - if(id.x() + 16 < width_matrix_b) - { - wrapper::vstore(vector_sum_col + 0, wrapper::vreinterpret(sum_col[0])); - 
wrapper::vstore(vector_sum_col + 4, wrapper::vreinterpret(sum_col[1])); - wrapper::vstore(vector_sum_col + 8, wrapper::vreinterpret(sum_col[2])); - wrapper::vstore(vector_sum_col + 12, wrapper::vreinterpret(sum_col[3])); - } - else - { - auto left_over = width_matrix_b - id.x(); - for(auto k = 0; k < 4 && left_over; ++k) + matrix_b += in_b_stride; + } + + // Multiply by scalar if necessary + if (_mul_by_scalar) + { + sum_col[0] = wrapper::vmul(sum_col[0], vec_scalar); + sum_col[1] = wrapper::vmul(sum_col[1], vec_scalar); + sum_col[2] = wrapper::vmul(sum_col[2], vec_scalar); + sum_col[3] = wrapper::vmul(sum_col[3], vec_scalar); + } + + auto vector_sum_col = reinterpret_cast<int32_t *>(out.ptr()); + if (id.x() + 16 < width_matrix_b) + { + wrapper::vstore(vector_sum_col + 0, wrapper::vreinterpret(sum_col[0])); + wrapper::vstore(vector_sum_col + 4, wrapper::vreinterpret(sum_col[1])); + wrapper::vstore(vector_sum_col + 8, wrapper::vreinterpret(sum_col[2])); + wrapper::vstore(vector_sum_col + 12, wrapper::vreinterpret(sum_col[3])); + } + else { - for(auto j = 0; j < 4 && left_over; ++j, --left_over) + auto left_over = width_matrix_b - id.x(); + for (auto k = 0; k < 4 && left_over; ++k) { - *(vector_sum_col + k * 4 + j) = sum_col[k][j]; + for (auto j = 0; j < 4 && left_over; ++j, --left_over) + { + *(vector_sum_col + k * 4 + j) = sum_col[k][j]; + } } } - } - }, - inb, out); + }, + inb, out); } void CpuGemmLowpMatrixBReductionKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) @@ -393,4 +415,4 @@ const char *CpuGemmLowpMatrixBReductionKernel::name() const } } // namespace kernels } // namespace cpu -} // namespace arm_compute
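As a scalar reference for the matrix-B reduction above: the kernel produces the row vector of column sums of B, optionally scaled, which the GEMMLowp pipeline later consumes for the a_offset correction. A minimal sketch, assuming dense row-major storage and illustrative names:

#include <cstddef>
#include <cstdint>
#include <vector>

// Scalar reference: sum every column of B, then optionally multiply each sum by a scalar,
// mirroring the vectorised accumulation and the _mul_by_scalar path above.
std::vector<int32_t> matrix_b_reduction_ref(const uint8_t *matrix_b, int rows, int cols,
                                            std::size_t stride, bool mul_by_scalar, int32_t scalar)
{
    std::vector<int32_t> sum_col(static_cast<std::size_t>(cols), 0);
    for (int r = 0; r < rows; ++r)
    {
        for (int c = 0; c < cols; ++c)
        {
            sum_col[c] += static_cast<int32_t>(matrix_b[r * stride + c]);
        }
    }
    if (mul_by_scalar)
    {
        for (auto &v : sum_col)
        {
            v *= scalar;
        }
    }
    return sum_col;
}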
\ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h b/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h index e469629cdb..20ef17e96d 100644 --- a/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h +++ b/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h @@ -66,7 +66,7 @@ public: static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; private: @@ -85,12 +85,14 @@ private: * @param[out] dst Output tensor * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()). */ - using CpuGemmLowpMatrixAReductionKernelPtr = void (CpuGemmLowpMatrixAReductionKernel::*)(const ITensor *src, ITensor *dst, const Window &window); + using CpuGemmLowpMatrixAReductionKernelPtr = void (CpuGemmLowpMatrixAReductionKernel::*)(const ITensor *src, + ITensor *dst, + const Window &window); - CpuGemmLowpMatrixAReductionKernelPtr _func{ nullptr }; - int32_t _k{ 0 }; - int32_t _scalar{ 0 }; - bool _mul_by_scalar{ false }; + CpuGemmLowpMatrixAReductionKernelPtr _func{nullptr}; + int32_t _k{0}; + int32_t _scalar{0}; + bool _mul_by_scalar{false}; }; /** Kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B. @@ -124,7 +126,7 @@ public: static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; private: @@ -144,12 +146,15 @@ private: * @param[out] dst Output tensor * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()). 
*/ - using CpuGemmLowpMatrixBReductionKernelPtr = void (CpuGemmLowpMatrixBReductionKernel::*)(const ITensor *src, ITensor *dst, const Window &window, const ThreadInfo &info); + using CpuGemmLowpMatrixBReductionKernelPtr = void (CpuGemmLowpMatrixBReductionKernel::*)(const ITensor *src, + ITensor *dst, + const Window &window, + const ThreadInfo &info); - CpuGemmLowpMatrixBReductionKernelPtr _func{ nullptr }; - int32_t _k{ 0 }; - int32_t _scalar{ 0 }; - bool _mul_by_scalar{ false }; + CpuGemmLowpMatrixBReductionKernelPtr _func{nullptr}; + int32_t _k{0}; + int32_t _scalar{0}; + bool _mul_by_scalar{false}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.cpp b/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.cpp index a65f1a33de..e290783021 100644 --- a/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.cpp +++ b/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.cpp @@ -31,6 +31,7 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -44,32 +45,37 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, - int32_t a_offset, int32_t b_offset) +Status validate_arguments(const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + int32_t a_offset, + int32_t b_offset) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32); // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0) + if (a_offset != 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0)); } // If b_offset == 0, vector_sum_row can be a nullptr - if(b_offset != 0) + if (b_offset != 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); // Check if input is a 3D reinterpretation - const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); + const bool reinterpret_as_3d = + mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); // Validate input - ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2))); + ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != + (mm_result->dimension(1) * mm_result->dimension(2))); ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1)); TensorShape output_shape = mm_result->tensor_shape(); - if(output_shape.num_dimensions() > 1) + if (output_shape.num_dimensions() > 1) { const unsigned int output_batch_idx = reinterpret_as_3d ? 
3 : 2; @@ -80,13 +86,15 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx], "mm_result tensor must have the same number of batches of output tensor"); - if(a_offset != 0) + if (a_offset != 0) { TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape(); vector_sum_col_shape.collapse_from(1); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1], - "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && + vector_sum_col_shape[1] != vector_sum_row_shape[1], + "vector_sum_col tensor must have the same number of batches of " + "vector_sum_row_shape or the number of batches must be set to 1"); } } } @@ -94,9 +102,15 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto return Status{}; } -void run_offset_contribution(const Window &window, - ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, - int32_t a_offset, int32_t b_offset, int32_t k_offset, bool slide_vector_sum_col, bool is_gemm3d) +void run_offset_contribution(const Window &window, + ITensor *mm_result, + const ITensor *vector_sum_col, + const ITensor *vector_sum_row, + int32_t a_offset, + int32_t b_offset, + int32_t k_offset, + bool slide_vector_sum_col, + bool is_gemm3d) { Window collapsed_window = window.collapse_if_possible(window, Window::DimZ); collapsed_window.set(Window::DimX, Window::Dimension(0, 1, 1)); @@ -112,7 +126,7 @@ void run_offset_contribution(const Window &window, const size_t sum_col_stride_y = (vector_sum_col != nullptr) ? (vector_sum_col->info()->strides_in_bytes().y()) : 0; Iterator mm_result_it(mm_result, collapsed_window); - if((a_offset != 0) && (b_offset != 0) && (vector_sum_col != nullptr) && (vector_sum_row != nullptr)) // true, true + if ((a_offset != 0) && (b_offset != 0) && (vector_sum_col != nullptr) && (vector_sum_row != nullptr)) // true, true { // Set window for vector_sum_col Window win_vector_sum_col(collapsed_window); @@ -131,95 +145,85 @@ void run_offset_contribution(const Window &window, const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y(); // Offset in case vector_sum_col is batched - const int vector_sum_col_batch_offset = slide_vector_sum_col ? vector_sum_col->info()->strides_in_bytes().z() : 0; - - execute_window_loop(collapsed_window, [&](const Coordinates & id) - { - const int batch_id = id.z() / depth_input; - const size_t batch_offset_col = batch_id * (sum_col_stride_y ); - auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_offset_col + batch_id * vector_sum_col_batch_offset); - auto mm_result_ptr = reinterpret_cast<int32_t *>(mm_result_it.ptr()); - - // Compute the leftover term due to b_offset. - int32_t b_offset_term_s32 = *(reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + id.y() + (id.z() % depth_input) * height_input); - b_offset_term_s32 *= b_offset; + const int vector_sum_col_batch_offset = + slide_vector_sum_col ? 
vector_sum_col->info()->strides_in_bytes().z() : 0; - const int32x4_t b_offset_term_s32_vec = vdupq_n_s32(b_offset_term_s32); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + collapsed_window, + [&](const Coordinates &id) { - // Compute the leftover term due to a_offset. - int32x4x4_t a_offset_term_s32 = - { - { - vld1q_s32(vector_sum_col_ptr + x + 0), - vld1q_s32(vector_sum_col_ptr + x + 4), - vld1q_s32(vector_sum_col_ptr + x + 8), - vld1q_s32(vector_sum_col_ptr + x + 12) - } - }; - - a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset); - a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset); - a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset); - a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset); - - // Add a_offset_term_s32 and b_offset_term_s32 - int32x4x4_t offset_term_s32 = + const int batch_id = id.z() / depth_input; + const size_t batch_offset_col = batch_id * (sum_col_stride_y); + auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_offset_col + + batch_id * vector_sum_col_batch_offset); + auto mm_result_ptr = reinterpret_cast<int32_t *>(mm_result_it.ptr()); + + // Compute the leftover term due to b_offset. + int32_t b_offset_term_s32 = + *(reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + + id.y() + (id.z() % depth_input) * height_input); + b_offset_term_s32 *= b_offset; + + const int32x4_t b_offset_term_s32_vec = vdupq_n_s32(b_offset_term_s32); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vdupq_n_s32(k_offset), - vdupq_n_s32(k_offset), - vdupq_n_s32(k_offset), - vdupq_n_s32(k_offset) - } - }; - - offset_term_s32.val[0] = vaddq_s32(offset_term_s32.val[0], vaddq_s32(a_offset_term_s32.val[0], b_offset_term_s32_vec)); - offset_term_s32.val[1] = vaddq_s32(offset_term_s32.val[1], vaddq_s32(a_offset_term_s32.val[1], b_offset_term_s32_vec)); - offset_term_s32.val[2] = vaddq_s32(offset_term_s32.val[2], vaddq_s32(a_offset_term_s32.val[2], b_offset_term_s32_vec)); - offset_term_s32.val[3] = vaddq_s32(offset_term_s32.val[3], vaddq_s32(a_offset_term_s32.val[3], b_offset_term_s32_vec)); - - int32x4x4_t in_s32 = + // Compute the leftover term due to a_offset. 
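For readers following the reflowed loops: the structure used throughout this kernel is a NEON main loop consuming 16 int32 values per iteration (four int32x4_t loads at offsets 0, 4, 8, 12) followed by a scalar left-overs loop. A minimal, self-contained sketch of that pattern, with hypothetical names and only the a_offset term, might look like:

#include <arm_neon.h>
#include <cstdint>

// Sketch only (illustrative names): the main-loop / left-overs pattern used by this
// kernel, shown for one row of int32 accumulators and a per-column offset term.
static void add_a_offset_row_sketch(int32_t *acc, const int32_t *col_sums, int32_t a_offset, int width)
{
    int x = 0;
    // Main loop: 16 values per iteration, processed as four int32x4_t lanes.
    for (; x <= width - 16; x += 16)
    {
        for (int i = 0; i < 16; i += 4)
        {
            const int32x4_t term = vmulq_n_s32(vld1q_s32(col_sums + x + i), a_offset); // a_offset * col_sum
            vst1q_s32(acc + x + i, vaddq_s32(vld1q_s32(acc + x + i), term));            // acc += term
        }
    }
    // Left-overs loop: scalar tail for widths that are not a multiple of 16.
    for (; x < width; ++x)
    {
        acc[x] += col_sums[x] * a_offset;
    }
}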
+ int32x4x4_t a_offset_term_s32 = { + {vld1q_s32(vector_sum_col_ptr + x + 0), vld1q_s32(vector_sum_col_ptr + x + 4), + vld1q_s32(vector_sum_col_ptr + x + 8), vld1q_s32(vector_sum_col_ptr + x + 12)}}; + + a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset); + a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset); + a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset); + a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset); + + // Add a_offset_term_s32 and b_offset_term_s32 + int32x4x4_t offset_term_s32 = { + {vdupq_n_s32(k_offset), vdupq_n_s32(k_offset), vdupq_n_s32(k_offset), vdupq_n_s32(k_offset)}}; + + offset_term_s32.val[0] = + vaddq_s32(offset_term_s32.val[0], vaddq_s32(a_offset_term_s32.val[0], b_offset_term_s32_vec)); + offset_term_s32.val[1] = + vaddq_s32(offset_term_s32.val[1], vaddq_s32(a_offset_term_s32.val[1], b_offset_term_s32_vec)); + offset_term_s32.val[2] = + vaddq_s32(offset_term_s32.val[2], vaddq_s32(a_offset_term_s32.val[2], b_offset_term_s32_vec)); + offset_term_s32.val[3] = + vaddq_s32(offset_term_s32.val[3], vaddq_s32(a_offset_term_s32.val[3], b_offset_term_s32_vec)); + + int32x4x4_t in_s32 = {{vld1q_s32(mm_result_ptr + x + 0), vld1q_s32(mm_result_ptr + x + 4), + vld1q_s32(mm_result_ptr + x + 8), vld1q_s32(mm_result_ptr + x + 12)}}; + + // Add the offset terms to GEMM's result + in_s32.val[0] = vaddq_s32(in_s32.val[0], offset_term_s32.val[0]); + in_s32.val[1] = vaddq_s32(in_s32.val[1], offset_term_s32.val[1]); + in_s32.val[2] = vaddq_s32(in_s32.val[2], offset_term_s32.val[2]); + in_s32.val[3] = vaddq_s32(in_s32.val[3], offset_term_s32.val[3]); + + // Store the result with the offset contribution + vst1q_s32(mm_result_ptr + x + 0, in_s32.val[0]); + vst1q_s32(mm_result_ptr + x + 4, in_s32.val[1]); + vst1q_s32(mm_result_ptr + x + 8, in_s32.val[2]); + vst1q_s32(mm_result_ptr + x + 12, in_s32.val[3]); + } + + // Left-overs loop + for (; x < window_end_x; ++x) { - { - vld1q_s32(mm_result_ptr + x + 0), - vld1q_s32(mm_result_ptr + x + 4), - vld1q_s32(mm_result_ptr + x + 8), - vld1q_s32(mm_result_ptr + x + 12) - } - }; - - // Add the offset terms to GEMM's result - in_s32.val[0] = vaddq_s32(in_s32.val[0], offset_term_s32.val[0]); - in_s32.val[1] = vaddq_s32(in_s32.val[1], offset_term_s32.val[1]); - in_s32.val[2] = vaddq_s32(in_s32.val[2], offset_term_s32.val[2]); - in_s32.val[3] = vaddq_s32(in_s32.val[3], offset_term_s32.val[3]); - - // Store the result with the offset contribution - vst1q_s32(mm_result_ptr + x + 0, in_s32.val[0]); - vst1q_s32(mm_result_ptr + x + 4, in_s32.val[1]); - vst1q_s32(mm_result_ptr + x + 8, in_s32.val[2]); - vst1q_s32(mm_result_ptr + x + 12, in_s32.val[3]); - } + // Compute the leftover term due to a_offset. + int32_t a_offset_term_s32 = *(vector_sum_col_ptr + x); - // Left-overs loop - for(; x < window_end_x; ++x) - { - // Compute the leftover term due to a_offset. 
- int32_t a_offset_term_s32 = *(vector_sum_col_ptr + x); - - a_offset_term_s32 *= a_offset; + a_offset_term_s32 *= a_offset; - // Add the offset terms to GEMM's result - // Store the result with the offset contribution - mm_result_ptr[x] += k_offset + a_offset_term_s32 + b_offset_term_s32; - } - }, - vector_sum_col_it, vector_sum_row_it, mm_result_it); + // Add the offset terms to GEMM's result + // Store the result with the offset contribution + mm_result_ptr[x] += k_offset + a_offset_term_s32 + b_offset_term_s32; + } + }, + vector_sum_col_it, vector_sum_row_it, mm_result_it); } - else if((a_offset == 0) && (b_offset != 0) && (vector_sum_row != nullptr)) // false, true + else if ((a_offset == 0) && (b_offset != 0) && (vector_sum_row != nullptr)) // false, true { ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_row); @@ -233,54 +237,51 @@ void run_offset_contribution(const Window &window, const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y(); - execute_window_loop(collapsed_window, [&](const Coordinates & id) - { - const int batch_id = id.z() / depth_input; - auto mm_result_ptr = reinterpret_cast<int32_t *>(mm_result_it.ptr()); + execute_window_loop( + collapsed_window, + [&](const Coordinates &id) + { + const int batch_id = id.z() / depth_input; + auto mm_result_ptr = reinterpret_cast<int32_t *>(mm_result_it.ptr()); - // Compute the leftover term due to b_offset. - int32_t b_offset_term_s32 = *(reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + id.y() + (id.z() % depth_input) * height_input); - b_offset_term_s32 *= b_offset; + // Compute the leftover term due to b_offset. + int32_t b_offset_term_s32 = + *(reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + + id.y() + (id.z() % depth_input) * height_input); + b_offset_term_s32 *= b_offset; - const int32x4_t b_offset_term_s32_vec = vdupq_n_s32(b_offset_term_s32); + const int32x4_t b_offset_term_s32_vec = vdupq_n_s32(b_offset_term_s32); - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - int32x4x4_t in_s32 = + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_s32(mm_result_ptr + x + 0), - vld1q_s32(mm_result_ptr + x + 4), - vld1q_s32(mm_result_ptr + x + 8), - vld1q_s32(mm_result_ptr + x + 12) - } - }; - - // Add the offset terms to GEMM's result - in_s32.val[0] = vaddq_s32(in_s32.val[0], b_offset_term_s32_vec); - in_s32.val[1] = vaddq_s32(in_s32.val[1], b_offset_term_s32_vec); - in_s32.val[2] = vaddq_s32(in_s32.val[2], b_offset_term_s32_vec); - in_s32.val[3] = vaddq_s32(in_s32.val[3], b_offset_term_s32_vec); - - // Store the result with the offset contribution - vst1q_s32(mm_result_ptr + x + 0, in_s32.val[0]); - vst1q_s32(mm_result_ptr + x + 4, in_s32.val[1]); - vst1q_s32(mm_result_ptr + x + 8, in_s32.val[2]); - vst1q_s32(mm_result_ptr + x + 12, in_s32.val[3]); - } - - // Left-overs loop - for(; x < window_end_x; ++x) - { - // Add the offset terms to GEMM's result - // Store the result with the offset contribution - mm_result_ptr[x] += b_offset_term_s32; - } - }, - vector_sum_row_it, mm_result_it); + int32x4x4_t in_s32 = {{vld1q_s32(mm_result_ptr + x + 0), vld1q_s32(mm_result_ptr + x + 4), + vld1q_s32(mm_result_ptr + x + 8), vld1q_s32(mm_result_ptr + x + 12)}}; + + // Add the offset terms to GEMM's result + in_s32.val[0] = vaddq_s32(in_s32.val[0], b_offset_term_s32_vec); + in_s32.val[1] = vaddq_s32(in_s32.val[1], b_offset_term_s32_vec); + 
in_s32.val[2] = vaddq_s32(in_s32.val[2], b_offset_term_s32_vec); + in_s32.val[3] = vaddq_s32(in_s32.val[3], b_offset_term_s32_vec); + + // Store the result with the offset contribution + vst1q_s32(mm_result_ptr + x + 0, in_s32.val[0]); + vst1q_s32(mm_result_ptr + x + 4, in_s32.val[1]); + vst1q_s32(mm_result_ptr + x + 8, in_s32.val[2]); + vst1q_s32(mm_result_ptr + x + 12, in_s32.val[3]); + } + + // Left-overs loop + for (; x < window_end_x; ++x) + { + // Add the offset terms to GEMM's result + // Store the result with the offset contribution + mm_result_ptr[x] += b_offset_term_s32; + } + }, + vector_sum_row_it, mm_result_it); } - else if((a_offset != 0) && (b_offset == 0) && (vector_sum_col != nullptr)) // true, false + else if ((a_offset != 0) && (b_offset == 0) && (vector_sum_col != nullptr)) // true, false { // Set window for vector_sum_col Window win_vector_sum_col(collapsed_window); @@ -290,69 +291,62 @@ void run_offset_contribution(const Window &window, Iterator vector_sum_col_it(vector_sum_col, win_vector_sum_col); // Offset in case vector_sum_col is batched - const int vector_sum_col_batch_offset = slide_vector_sum_col ? vector_sum_col->info()->strides_in_bytes().z() : 0; - - execute_window_loop(collapsed_window, [&](const Coordinates & id) - { - const int batch_id = id.z() / depth_input; - const size_t batch_offset_col = batch_id * (sum_col_stride_y ); // Value to offset vector_sum_col_ptr to allow for iteration of y values in tensor - auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_offset_col + batch_id * vector_sum_col_batch_offset); - auto mm_result_ptr = reinterpret_cast<int32_t *>(mm_result_it.ptr()); + const int vector_sum_col_batch_offset = + slide_vector_sum_col ? vector_sum_col->info()->strides_in_bytes().z() : 0; - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + collapsed_window, + [&](const Coordinates &id) { - // Compute the leftover term due to a_offset. - int32x4x4_t a_offset_term_s32 = + const int batch_id = id.z() / depth_input; + const size_t batch_offset_col = + batch_id * + (sum_col_stride_y); // Value to offset vector_sum_col_ptr to allow for iteration of y values in tensor + auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_offset_col + + batch_id * vector_sum_col_batch_offset); + auto mm_result_ptr = reinterpret_cast<int32_t *>(mm_result_it.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_s32(vector_sum_col_ptr + x + 0), - vld1q_s32(vector_sum_col_ptr + x + 4), - vld1q_s32(vector_sum_col_ptr + x + 8), - vld1q_s32(vector_sum_col_ptr + x + 12) - } - }; - - a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset); - a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset); - a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset); - a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset); - - int32x4x4_t in_s32 = + // Compute the leftover term due to a_offset. 
+ int32x4x4_t a_offset_term_s32 = { + {vld1q_s32(vector_sum_col_ptr + x + 0), vld1q_s32(vector_sum_col_ptr + x + 4), + vld1q_s32(vector_sum_col_ptr + x + 8), vld1q_s32(vector_sum_col_ptr + x + 12)}}; + + a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset); + a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset); + a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset); + a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset); + + int32x4x4_t in_s32 = {{vld1q_s32(mm_result_ptr + x + 0), vld1q_s32(mm_result_ptr + x + 4), + vld1q_s32(mm_result_ptr + x + 8), vld1q_s32(mm_result_ptr + x + 12)}}; + + // Add the offset terms to GEMM's result + in_s32.val[0] = vaddq_s32(in_s32.val[0], a_offset_term_s32.val[0]); + in_s32.val[1] = vaddq_s32(in_s32.val[1], a_offset_term_s32.val[1]); + in_s32.val[2] = vaddq_s32(in_s32.val[2], a_offset_term_s32.val[2]); + in_s32.val[3] = vaddq_s32(in_s32.val[3], a_offset_term_s32.val[3]); + + // Store the result with the offset contribution + vst1q_s32(mm_result_ptr + x + 0, in_s32.val[0]); + vst1q_s32(mm_result_ptr + x + 4, in_s32.val[1]); + vst1q_s32(mm_result_ptr + x + 8, in_s32.val[2]); + vst1q_s32(mm_result_ptr + x + 12, in_s32.val[3]); + } + + // Left-overs loop + for (; x < window_end_x; ++x) { - { - vld1q_s32(mm_result_ptr + x + 0), - vld1q_s32(mm_result_ptr + x + 4), - vld1q_s32(mm_result_ptr + x + 8), - vld1q_s32(mm_result_ptr + x + 12) - } - }; - - // Add the offset terms to GEMM's result - in_s32.val[0] = vaddq_s32(in_s32.val[0], a_offset_term_s32.val[0]); - in_s32.val[1] = vaddq_s32(in_s32.val[1], a_offset_term_s32.val[1]); - in_s32.val[2] = vaddq_s32(in_s32.val[2], a_offset_term_s32.val[2]); - in_s32.val[3] = vaddq_s32(in_s32.val[3], a_offset_term_s32.val[3]); - - // Store the result with the offset contribution - vst1q_s32(mm_result_ptr + x + 0, in_s32.val[0]); - vst1q_s32(mm_result_ptr + x + 4, in_s32.val[1]); - vst1q_s32(mm_result_ptr + x + 8, in_s32.val[2]); - vst1q_s32(mm_result_ptr + x + 12, in_s32.val[3]); - } - - // Left-overs loop - for(; x < window_end_x; ++x) - { - // Compute the leftover term due to a_offset. - const int32_t a_offset_term_s32 = *(vector_sum_col_ptr + x); - - // Add the offset terms to GEMM's result - // Store the result with the offset contribution - mm_result_ptr[x] += a_offset_term_s32 * a_offset; - } - }, - vector_sum_col_it, mm_result_it); + // Compute the leftover term due to a_offset. 
+ const int32_t a_offset_term_s32 = *(vector_sum_col_ptr + x); + + // Add the offset terms to GEMM's result + // Store the result with the offset contribution + mm_result_ptr[x] += a_offset_term_s32 * a_offset; + } + }, + vector_sum_col_it, mm_result_it); } else // false, false { @@ -362,7 +356,12 @@ void run_offset_contribution(const Window &window, } } // namespace -void CpuGemmLowpOffsetContributionKernel::configure(ITensorInfo *mm_result, ITensorInfo *vector_sum_col, ITensorInfo *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset) +void CpuGemmLowpOffsetContributionKernel::configure(ITensorInfo *mm_result, + ITensorInfo *vector_sum_col, + ITensorInfo *vector_sum_row, + int32_t k, + int32_t a_offset, + int32_t b_offset) { // Perform validate step ARM_COMPUTE_UNUSED(vector_sum_row); @@ -374,7 +373,7 @@ void CpuGemmLowpOffsetContributionKernel::configure(ITensorInfo *mm_result, ITen _k_offset = a_offset * b_offset * k; // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0) + if (a_offset != 0) { // Check if vector_sum_col_shape should be slidden or not // Don't slide vector_sum_col_shape along the y dimension if vector_sum_col_shape has just 1 dimension and vector_sum_row_shape more than 1 @@ -387,8 +386,11 @@ void CpuGemmLowpOffsetContributionKernel::configure(ITensorInfo *mm_result, ITen ICpuKernel::configure(win); } -Status CpuGemmLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, - int32_t a_offset, int32_t b_offset) +Status CpuGemmLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + int32_t a_offset, + int32_t b_offset) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, a_offset, b_offset)); return Status{}; @@ -405,11 +407,11 @@ void CpuGemmLowpOffsetContributionKernel::run_op(ITensorPack &tensors, const Win auto mm_result = tensors.get_tensor(TensorType::ACL_DST); // Check if input is a 3D reinterpretation - const bool reinterpret_as_3d = vector_sum_row != nullptr - && mm_result->info()->num_dimensions() > 1 - && mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x(); + const bool reinterpret_as_3d = vector_sum_row != nullptr && mm_result->info()->num_dimensions() > 1 && + mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x(); - run_offset_contribution(window, mm_result, vector_sum_col, vector_sum_row, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col, reinterpret_as_3d); + run_offset_contribution(window, mm_result, vector_sum_col, vector_sum_row, _a_offset, _b_offset, _k_offset, + _slide_vector_sum_col, reinterpret_as_3d); } const char *CpuGemmLowpOffsetContributionKernel::name() const @@ -418,4 +420,4 @@ const char *CpuGemmLowpOffsetContributionKernel::name() const } } // namespace kernels } // namespace cpu -} // namespace arm_compute
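Reading the scalar left-over paths together with configure(), which sets _k_offset = a_offset * b_offset * k, the per-element update this kernel applies is the usual GEMMLowp offset correction: each int32 accumulator receives a_offset times the corresponding column sum, b_offset times the corresponding row sum, and the constant k * a_offset * b_offset. A minimal scalar sketch, with illustrative names and the batching / 3D reinterpretation omitted, is:

#include <cstdint>

// Reference-only sketch: add the a_offset, b_offset and k_offset terms to each
// int32 accumulator of a rows x cols GEMM result.
static void offset_contribution_sketch(int32_t       *mm_result,
                                       const int32_t *vector_sum_col,
                                       const int32_t *vector_sum_row,
                                       int rows, int cols,
                                       int32_t a_offset, int32_t b_offset, int32_t k)
{
    const int32_t k_offset = a_offset * b_offset * k; // zero whenever either offset is zero
    for (int y = 0; y < rows; ++y)
    {
        for (int x = 0; x < cols; ++x)
        {
            int32_t term = k_offset;
            if (a_offset != 0)
            {
                term += a_offset * vector_sum_col[x]; // column sums
            }
            if (b_offset != 0)
            {
                term += b_offset * vector_sum_row[y]; // row sums
            }
            mm_result[y * cols + x] += term;
        }
    }
}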
\ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h b/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h index 3514ca811d..08b2d47529 100644 --- a/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h +++ b/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h @@ -63,24 +63,33 @@ public: * @param[in] a_offset Offset to be added to each element of the matrix A. * @param[in] b_offset Offset to be added to each element of the matrix B. */ - void configure(ITensorInfo *mm_result, ITensorInfo *vector_sum_col, ITensorInfo *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset); + void configure(ITensorInfo *mm_result, + ITensorInfo *vector_sum_col, + ITensorInfo *vector_sum_row, + int32_t k, + int32_t a_offset, + int32_t b_offset); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuGemmLowpOffsetContributionKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, int32_t a_offset, int32_t b_offset); + static Status validate(const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + int32_t a_offset, + int32_t b_offset); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; private: - int32_t _a_offset{ 0 }; - int32_t _b_offset{ 0 }; - int32_t _k_offset{ 0 }; - bool _slide_vector_sum_col{ true }; + int32_t _a_offset{0}; + int32_t _b_offset{0}; + int32_t _k_offset{0}; + bool _slide_vector_sum_col{true}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.cpp b/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.cpp index 190487eced..d008842398 100644 --- a/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.cpp +++ b/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.cpp @@ -31,10 +31,11 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "src/core/NEON/NEAsymm.h" -#include "src/core/NEON/wrapper/wrapper.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/wrapper/wrapper.h" #include <arm_neon.h> @@ -48,80 +49,38 @@ namespace { inline int32x4x4_t load_results_input(const Iterator &mm_result_it, int32_t x) { - return - { - { - vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x + 0), - vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x + 4), - vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x + 8), - vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x + 12) - } - }; + return {{vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x + 0), + vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x + 4), + vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x + 8), + vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x + 12)}}; } inline int32x4x4_t load(const int32_t *ptr, int32_t x) { - return - { - { - vld1q_s32(ptr + x + 0), - vld1q_s32(ptr + x + 4), - vld1q_s32(ptr + x + 8), - vld1q_s32(ptr + x + 12) - } - }; + return 
{{vld1q_s32(ptr + x + 0), vld1q_s32(ptr + x + 4), vld1q_s32(ptr + x + 8), vld1q_s32(ptr + x + 12)}}; } inline int32x4x4_t add_s32(int32x4x4_t a, int32x4_t b) { - return - { - { - vaddq_s32(a.val[0], b), - vaddq_s32(a.val[1], b), - vaddq_s32(a.val[2], b), - vaddq_s32(a.val[3], b) - } - }; + return {{vaddq_s32(a.val[0], b), vaddq_s32(a.val[1], b), vaddq_s32(a.val[2], b), vaddq_s32(a.val[3], b)}}; } inline int32x4x4_t add_s32(int32x4x4_t a, int32x4x4_t b) { - return - { - { - vaddq_s32(a.val[0], b.val[0]), - vaddq_s32(a.val[1], b.val[1]), - vaddq_s32(a.val[2], b.val[2]), - vaddq_s32(a.val[3], b.val[3]) - } - }; + return {{vaddq_s32(a.val[0], b.val[0]), vaddq_s32(a.val[1], b.val[1]), vaddq_s32(a.val[2], b.val[2]), + vaddq_s32(a.val[3], b.val[3])}}; } inline int32x4x4_t mul_s32(int32x4x4_t &a, int32_t mul_scalar) { - return - { - { - vmulq_n_s32(a.val[0], mul_scalar), - vmulq_n_s32(a.val[1], mul_scalar), - vmulq_n_s32(a.val[2], mul_scalar), - vmulq_n_s32(a.val[3], mul_scalar) - } - }; + return {{vmulq_n_s32(a.val[0], mul_scalar), vmulq_n_s32(a.val[1], mul_scalar), vmulq_n_s32(a.val[2], mul_scalar), + vmulq_n_s32(a.val[3], mul_scalar)}}; } inline int32x4x4_t mul_s32(int32x4x4_t &a, const int32_t *multilpier) { - return - { - { - vmulq_s32(a.val[0], vld1q_s32(multilpier)), - vmulq_s32(a.val[1], vld1q_s32(multilpier + 4)), - vmulq_s32(a.val[2], vld1q_s32(multilpier + 8)), - vmulq_s32(a.val[3], vld1q_s32(multilpier + 12)) - } - }; + return {{vmulq_s32(a.val[0], vld1q_s32(multilpier)), vmulq_s32(a.val[1], vld1q_s32(multilpier + 4)), + vmulq_s32(a.val[2], vld1q_s32(multilpier + 8)), vmulq_s32(a.val[3], vld1q_s32(multilpier + 12))}}; } inline int32x4x4_t get_a_offset(const int32_t *vector_sum_col_ptr, int32_t a_offset, int32_t x) @@ -144,18 +103,11 @@ inline int32x4_t get_b_offset(const int32_t *vector_sum_row_ptr, int32_t b_offse inline int32x4x4_t get_k_offset(int32_t k_offset) { - return - { - { - vdupq_n_s32(k_offset), - vdupq_n_s32(k_offset), - vdupq_n_s32(k_offset), - vdupq_n_s32(k_offset) - } - }; + return {{vdupq_n_s32(k_offset), vdupq_n_s32(k_offset), vdupq_n_s32(k_offset), vdupq_n_s32(k_offset)}}; } -inline uint8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4_t result_shift_s32, uint8x16_t min_u8, uint8x16_t max_u8, bool is_bounded_relu) +inline uint8x16_t finalize_quantization_floating_point( + int32x4x4_t &in_s32, int32x4_t result_shift_s32, uint8x16_t min_u8, uint8x16_t max_u8, bool is_bounded_relu) { const static int32x4_t zero_s32 = vdupq_n_s32(0); @@ -172,18 +124,13 @@ inline uint8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int3 in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32); // Convert S32 to S16 - const int16x8x2_t in_s16 = - { - { - vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), - vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3])) - } - }; + const int16x8x2_t in_s16 = {{vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), + vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))}}; // Convert S16 to U8 uint8x16_t out_u8 = vcombine_u8(vqmovun_s16(in_s16.val[0]), vqmovun_s16(in_s16.val[1])); - if(is_bounded_relu) + if (is_bounded_relu) { out_u8 = vmaxq_u8(out_u8, min_u8); out_u8 = vminq_u8(out_u8, max_u8); @@ -192,7 +139,8 @@ inline uint8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int3 return out_u8; } -inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4_t result_shift_s32, int8x16_t min_s8, int8x16_t max_s8, bool 
is_bounded_relu) +inline int8x16_t finalize_quantization_floating_point( + int32x4x4_t &in_s32, int32x4_t result_shift_s32, int8x16_t min_s8, int8x16_t max_s8, bool is_bounded_relu) { const static int32x4_t zero_s32 = vdupq_n_s32(0); @@ -209,18 +157,13 @@ inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32 in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32); // Convert S32 to S16 - const int16x8x2_t in_s16 = - { - { - vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), - vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3])) - } - }; + const int16x8x2_t in_s16 = {{vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), + vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))}}; // Convert S16 to S8 int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1])); - if(is_bounded_relu) + if (is_bounded_relu) { out_s8 = vmaxq_s8(out_s8, min_s8); out_s8 = vminq_s8(out_s8, max_s8); @@ -229,7 +172,8 @@ inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32 return out_s8; } -inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4x4_t result_shift_s32, int8x16_t min_s8, int8x16_t max_s8, bool is_bounded_relu) +inline int8x16_t finalize_quantization_floating_point( + int32x4x4_t &in_s32, int32x4x4_t result_shift_s32, int8x16_t min_s8, int8x16_t max_s8, bool is_bounded_relu) { const static int32x4_t zero_s32 = vdupq_n_s32(0); @@ -246,18 +190,13 @@ inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32 in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32); // Convert S32 to S16 - const int16x8x2_t in_s16 = - { - { - vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), - vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3])) - } - }; + const int16x8x2_t in_s16 = {{vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), + vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))}}; // Convert S16 to S8 int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1])); - if(is_bounded_relu) + if (is_bounded_relu) { out_s8 = vmaxq_s8(out_s8, min_s8); out_s8 = vminq_s8(out_s8, max_s8); @@ -305,81 +244,103 @@ inline Iterator get_bias_it(const Window &window, const ITensor *bias) } template <typename VT> -inline void run_offset_contribution_output_stage_window(const int32_t *vector_sum_col_ptr, const int32_t *vector_sum_row_ptr, const int32_t *bias_ptr, Iterator mm_result_it, Iterator out_it, - const int32x4_t result_offset_s32, const int32x4_t result_shift_s32, - typename VT::vtype min_vec, typename VT::vtype max_vec, - int32_t a_offset, int32_t b_offset, int32_t k_offset, - int32_t multiplier, int32_t shift, int32_t offset, int32_t min_bound, int32_t max_bound, - int window_step_x, int window_start_x, int window_end_x, bool has_a_offset, bool has_b_offset, bool has_bias, bool is_bounded_relu, bool is_fixed_point) +inline void run_offset_contribution_output_stage_window(const int32_t *vector_sum_col_ptr, + const int32_t *vector_sum_row_ptr, + const int32_t *bias_ptr, + Iterator mm_result_it, + Iterator out_it, + const int32x4_t result_offset_s32, + const int32x4_t result_shift_s32, + typename VT::vtype min_vec, + typename VT::vtype max_vec, + int32_t a_offset, + int32_t b_offset, + int32_t k_offset, + int32_t multiplier, + int32_t shift, + int32_t offset, + int32_t min_bound, + int32_t max_bound, + int window_step_x, + int window_start_x, + int window_end_x, + bool 
has_a_offset, + bool has_b_offset, + bool has_bias, + bool is_bounded_relu, + bool is_fixed_point) { - int32x4x4_t offset_term_s32 = { 0, 0, 0, 0 }; - if(!is_fixed_point) + int32x4x4_t offset_term_s32 = {0, 0, 0, 0}; + if (!is_fixed_point) { // Combine quantization offset with other offsets. offset_term_s32 = add_s32(offset_term_s32, result_offset_s32); } - if(has_a_offset && has_b_offset) + if (has_a_offset && has_b_offset) { offset_term_s32 = add_s32(offset_term_s32, get_k_offset(k_offset)); } - if(has_b_offset) + if (has_b_offset) { offset_term_s32 = add_s32(offset_term_s32, get_b_offset(vector_sum_row_ptr, b_offset)); } int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + for (; x <= (window_end_x - window_step_x); x += window_step_x) { int32x4x4_t in_s32 = load_results_input(mm_result_it, x); - if(has_a_offset) + if (has_a_offset) { in_s32 = add_s32(in_s32, get_a_offset(vector_sum_col_ptr, a_offset, x)); } - if(has_bias) + if (has_bias) { in_s32 = add_s32(in_s32, load(bias_ptr, x)); } - if(!is_fixed_point || has_b_offset) + if (!is_fixed_point || has_b_offset) { in_s32 = add_s32(in_s32, offset_term_s32); } - if(!is_fixed_point) + if (!is_fixed_point) { in_s32 = mul_s32(in_s32, multiplier); } - if(is_fixed_point) + if (is_fixed_point) { - wrapper::vstore(reinterpret_cast<typename VT::stype *>(out_it.ptr() + x), - finalize_quantization(in_s32, multiplier, shift, result_offset_s32, min_vec, max_vec, is_bounded_relu)); + wrapper::vstore( + reinterpret_cast<typename VT::stype *>(out_it.ptr() + x), + finalize_quantization(in_s32, multiplier, shift, result_offset_s32, min_vec, max_vec, is_bounded_relu)); } else { - wrapper::vstore(reinterpret_cast<typename VT::stype *>(out_it.ptr() + x), - finalize_quantization_floating_point(in_s32, result_shift_s32, min_vec, max_vec, is_bounded_relu)); + wrapper::vstore( + reinterpret_cast<typename VT::stype *>(out_it.ptr() + x), + finalize_quantization_floating_point(in_s32, result_shift_s32, min_vec, max_vec, is_bounded_relu)); } } // Compute left-over elements - for(; x < window_end_x; ++x) + for (; x < window_end_x; ++x) { - int32_t in_value = *(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x) + wrapper::vgetlane(offset_term_s32.val[0], 0); + int32_t in_value = + *(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x) + wrapper::vgetlane(offset_term_s32.val[0], 0); - if(has_a_offset) + if (has_a_offset) { in_value += (*(vector_sum_col_ptr + x) * a_offset); } - if(has_bias) + if (has_bias) { in_value += *(bias_ptr + x); } - if(is_fixed_point) + if (is_fixed_point) { // Finalize and store the result - *reinterpret_cast<typename VT::stype *>(out_it.ptr() + x) = finalize_quantization(in_value, multiplier, shift, offset, - static_cast<typename VT::stype>(min_bound), - static_cast<typename VT::stype>(max_bound), is_bounded_relu); + *reinterpret_cast<typename VT::stype *>(out_it.ptr() + x) = + finalize_quantization(in_value, multiplier, shift, offset, static_cast<typename VT::stype>(min_bound), + static_cast<typename VT::stype>(max_bound), is_bounded_relu); } else { @@ -387,75 +348,100 @@ inline void run_offset_contribution_output_stage_window(const int32_t *vector_su in_value = (in_value * multiplier) >> shift; // Bound and store the result - if(is_bounded_relu) + if (is_bounded_relu) { - in_value = static_cast<typename VT::stype>(std::max<int32_t>(min_bound, std::min<int32_t>(max_bound, in_value))); + in_value = static_cast<typename VT::stype>( + std::max<int32_t>(min_bound, std::min<int32_t>(max_bound, 
in_value))); } - *reinterpret_cast<typename VT::stype *>(out_it.ptr() + x) = static_cast<typename VT::stype>(std::max<int32_t>(static_cast<int32_t>(std::numeric_limits<typename VT::stype>::lowest()), - std::min<int32_t>(static_cast<int32_t>(std::numeric_limits<typename VT::stype>::max()), in_value))); + *reinterpret_cast<typename VT::stype *>(out_it.ptr() + x) = + static_cast<typename VT::stype>(std::max<int32_t>( + static_cast<int32_t>(std::numeric_limits<typename VT::stype>::lowest()), + std::min<int32_t>(static_cast<int32_t>(std::numeric_limits<typename VT::stype>::max()), in_value))); } } } -inline void run_offset_contribution_output_stage_window_symm(const int32_t *vector_sum_col_ptr, const int32_t *bias_ptr, Iterator mm_result_it, Iterator out_it, - const int32_t *result_multipliers, const int32_t *result_shifts, - const int32x4_t result_offset, int8x16_t min_s8, int8x16_t max_s8, - int32_t a_offset, int32_t offset, int32_t min_bound, int32_t max_bound, - int window_step_x, int window_start_x, int window_end_x, bool has_a_offset, bool has_bias, bool is_bounded_relu, bool is_fixed_point) +inline void run_offset_contribution_output_stage_window_symm(const int32_t *vector_sum_col_ptr, + const int32_t *bias_ptr, + Iterator mm_result_it, + Iterator out_it, + const int32_t *result_multipliers, + const int32_t *result_shifts, + const int32x4_t result_offset, + int8x16_t min_s8, + int8x16_t max_s8, + int32_t a_offset, + int32_t offset, + int32_t min_bound, + int32_t max_bound, + int window_step_x, + int window_start_x, + int window_end_x, + bool has_a_offset, + bool has_bias, + bool is_bounded_relu, + bool is_fixed_point) { - int32x4x4_t offset_term_s32 = { 0, 0, 0, 0 }; - if(!is_fixed_point) + int32x4x4_t offset_term_s32 = {0, 0, 0, 0}; + if (!is_fixed_point) { // Combine quantization offset with other offsets. 
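The scalar left-over path above also makes the non-fixed-point (QUANTIZE_DOWN) output stage easy to follow. Mirroring it for the QASYMM8 case, with illustrative names and the offset terms already folded into the accumulator, the per-value computation is roughly:

#include <algorithm>
#include <cstdint>

// Sketch only: scale-and-shift requantization of one int32 accumulator to uint8,
// with an optional bounded ReLU, as in the QUANTIZE_DOWN left-overs path.
static uint8_t quantize_down_sketch(int32_t acc_plus_offsets, int32_t multiplier, int32_t shift,
                                    int32_t min_bound, int32_t max_bound, bool is_bounded_relu)
{
    int32_t v = (acc_plus_offsets * multiplier) >> shift;
    if (is_bounded_relu)
    {
        v = std::max(min_bound, std::min(max_bound, v));
    }
    // Saturate to the output type's range (uint8 here).
    return static_cast<uint8_t>(std::max<int32_t>(0, std::min<int32_t>(255, v)));
}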
offset_term_s32 = add_s32(offset_term_s32, result_offset); } int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + for (; x <= (window_end_x - window_step_x); x += window_step_x) { int32x4x4_t in_s32 = load_results_input(mm_result_it, x); - if(has_a_offset) + if (has_a_offset) { in_s32 = add_s32(in_s32, get_a_offset(vector_sum_col_ptr, a_offset, x)); } - if(has_bias) + if (has_bias) { in_s32 = add_s32(in_s32, load(bias_ptr, x)); } - if(!is_fixed_point) + if (!is_fixed_point) { in_s32 = add_s32(in_s32, offset_term_s32); in_s32 = mul_s32(in_s32, result_multipliers + x); } - if(is_fixed_point) + if (is_fixed_point) { - vst1q_s8(reinterpret_cast<int8_t *>(out_it.ptr() + x), finalize_quantization_symm(in_s32, load(result_multipliers, x), load(result_shifts, x), result_offset, min_s8, max_s8, is_bounded_relu)); + vst1q_s8(reinterpret_cast<int8_t *>(out_it.ptr() + x), + finalize_quantization_symm(in_s32, load(result_multipliers, x), load(result_shifts, x), + result_offset, min_s8, max_s8, is_bounded_relu)); } else { - vst1q_s8(reinterpret_cast<int8_t *>(out_it.ptr() + x), finalize_quantization_floating_point(in_s32, load(result_shifts, x), min_s8, max_s8, is_bounded_relu)); + vst1q_s8( + reinterpret_cast<int8_t *>(out_it.ptr() + x), + finalize_quantization_floating_point(in_s32, load(result_shifts, x), min_s8, max_s8, is_bounded_relu)); } } // Compute left-over elements - for(; x < window_end_x; ++x) + for (; x < window_end_x; ++x) { - int32_t in_value = *(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x) + wrapper::vgetlane(offset_term_s32.val[0], 0); + int32_t in_value = + *(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x) + wrapper::vgetlane(offset_term_s32.val[0], 0); - if(has_a_offset) + if (has_a_offset) { in_value += (*(vector_sum_col_ptr + x) * a_offset); } - if(has_bias) + if (has_bias) { in_value += *(bias_ptr + x); } - if(is_fixed_point) + if (is_fixed_point) { // Finalize and store the result - *(out_it.ptr() + x) = finalize_quantization(in_value, result_multipliers[x], result_shifts[x], offset, static_cast<int8_t>(min_bound), static_cast<int8_t>(max_bound), is_bounded_relu); + *(out_it.ptr() + x) = + finalize_quantization(in_value, result_multipliers[x], result_shifts[x], offset, + static_cast<int8_t>(min_bound), static_cast<int8_t>(max_bound), is_bounded_relu); } else { @@ -463,7 +449,7 @@ inline void run_offset_contribution_output_stage_window_symm(const int32_t *vect in_value = (in_value * result_multipliers[x]) >> (-result_shifts[x]); // Bound and store the result - if(is_bounded_relu) + if (is_bounded_relu) { in_value = static_cast<int8_t>(std::max<int32_t>(min_bound, std::min<int32_t>(max_bound, in_value))); } @@ -473,10 +459,20 @@ inline void run_offset_contribution_output_stage_window_symm(const int32_t *vect } template <typename T> -void run_offset_contribution_output_stage(const Window &window, - const ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, const ITensor *bias, ITensor *output, - int32_t a_offset, int32_t b_offset, int32_t k_offset, bool is_vector_sum_col_batched, - GEMMLowpOutputStageInfo output_stage, bool is_gemm3d, bool is_bounded_relu, bool is_fixed_point) +void run_offset_contribution_output_stage(const Window &window, + const ITensor *mm_result, + const ITensor *vector_sum_col, + const ITensor *vector_sum_row, + const ITensor *bias, + ITensor *output, + int32_t a_offset, + int32_t b_offset, + int32_t k_offset, + bool is_vector_sum_col_batched, + GEMMLowpOutputStageInfo 
output_stage, + bool is_gemm3d, + bool is_bounded_relu, + bool is_fixed_point) { // Semantics of XYZW Explained for each tensor // @@ -516,7 +512,7 @@ void run_offset_contribution_output_stage(const Window &window, Iterator mm_result_it(mm_result, win); Iterator out_it(output, win); - if((a_offset != 0) && (b_offset != 0)) + if ((a_offset != 0) && (b_offset != 0)) { ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_col); ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_row); @@ -527,45 +523,52 @@ void run_offset_contribution_output_stage(const Window &window, const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y(); // Offset in case vector_sum_col is batched in y dimension - const int vector_sum_col_stride_batch = is_vector_sum_col_batched ? vector_sum_col->info()->strides_in_bytes().y() : 0; + const int vector_sum_col_stride_batch = + is_vector_sum_col_batched ? vector_sum_col->info()->strides_in_bytes().y() : 0; - if(bias != nullptr) + if (bias != nullptr) { Iterator bias_it = get_bias_it(collapsed_window, bias); - execute_window_loop(collapsed_window, [&](const Coordinates & id) - { - const int batch_id = id.z() / depth_input; - const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch); - const auto vector_sum_row_ptr = reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) - + id.y() + (id.z() % depth_input) * height_input; - run_offset_contribution_output_stage_window<Typer>(vector_sum_col_ptr, vector_sum_row_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), - mm_result_it, - out_it, - result_offset_s32, result_shift_s32, - min_vec, max_vec, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x, true, true, true, is_bounded_relu, is_fixed_point); - }, - vector_sum_col_it, vector_sum_row_it, bias_it, mm_result_it, out_it); + execute_window_loop( + collapsed_window, + [&](const Coordinates &id) + { + const int batch_id = id.z() / depth_input; + const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>( + vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch); + const auto vector_sum_row_ptr = + reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + + id.y() + (id.z() % depth_input) * height_input; + run_offset_contribution_output_stage_window<Typer>( + vector_sum_col_ptr, vector_sum_row_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), + mm_result_it, out_it, result_offset_s32, result_shift_s32, min_vec, max_vec, a_offset, b_offset, + k_offset, multiplier, shift, offset, min_bound, max_bound, window_step_x, window_start_x, + window_end_x, true, true, true, is_bounded_relu, is_fixed_point); + }, + vector_sum_col_it, vector_sum_row_it, bias_it, mm_result_it, out_it); } else { - execute_window_loop(collapsed_window, [&](const Coordinates & id) - { - const int batch_id = id.z() / depth_input; - const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch); - const auto vector_sum_row_ptr = reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) - + id.y() + (id.z() % depth_input) * height_input; - run_offset_contribution_output_stage_window<Typer>(vector_sum_col_ptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it, - result_offset_s32, result_shift_s32, - min_vec, max_vec, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, 
max_bound, - window_step_x, window_start_x, window_end_x, true, true, false, is_bounded_relu, is_fixed_point); - }, - vector_sum_col_it, vector_sum_row_it, mm_result_it, out_it); + execute_window_loop( + collapsed_window, + [&](const Coordinates &id) + { + const int batch_id = id.z() / depth_input; + const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>( + vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch); + const auto vector_sum_row_ptr = + reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + + id.y() + (id.z() % depth_input) * height_input; + run_offset_contribution_output_stage_window<Typer>( + vector_sum_col_ptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it, result_offset_s32, + result_shift_s32, min_vec, max_vec, a_offset, b_offset, k_offset, multiplier, shift, offset, + min_bound, max_bound, window_step_x, window_start_x, window_end_x, true, true, false, + is_bounded_relu, is_fixed_point); + }, + vector_sum_col_it, vector_sum_row_it, mm_result_it, out_it); } } - else if((a_offset == 0) && (b_offset != 0)) + else if ((a_offset == 0) && (b_offset != 0)) { ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_row); @@ -573,114 +576,139 @@ void run_offset_contribution_output_stage(const Window &window, const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y(); - if(bias != nullptr) + if (bias != nullptr) { Iterator bias_it = get_bias_it(collapsed_window, bias); - execute_window_loop(collapsed_window, [&](const Coordinates & id) - { - const int batch_id = id.z() / depth_input; - const auto vector_sum_row_ptr = reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) - + id.y() + (id.z() % depth_input) * height_input; - run_offset_contribution_output_stage_window<Typer>(nullptr, vector_sum_row_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, - out_it, - result_offset_s32, result_shift_s32, - min_vec, max_vec, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x, false, true, true, is_bounded_relu, is_fixed_point); - }, - vector_sum_row_it, bias_it, mm_result_it, out_it); + execute_window_loop( + collapsed_window, + [&](const Coordinates &id) + { + const int batch_id = id.z() / depth_input; + const auto vector_sum_row_ptr = + reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + + id.y() + (id.z() % depth_input) * height_input; + run_offset_contribution_output_stage_window<Typer>( + nullptr, vector_sum_row_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, + out_it, result_offset_s32, result_shift_s32, min_vec, max_vec, a_offset, b_offset, k_offset, + multiplier, shift, offset, min_bound, max_bound, window_step_x, window_start_x, window_end_x, + false, true, true, is_bounded_relu, is_fixed_point); + }, + vector_sum_row_it, bias_it, mm_result_it, out_it); } else { - execute_window_loop(collapsed_window, [&](const Coordinates & id) - { - const int batch_id = id.z() / depth_input; - const auto vector_sum_row_ptr = reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) - + id.y() + (id.z() % depth_input) * height_input; - run_offset_contribution_output_stage_window<Typer>(nullptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it, - result_offset_s32, result_shift_s32, - min_vec, max_vec, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, 
window_end_x, false, true, false, is_bounded_relu, is_fixed_point); - }, - vector_sum_row_it, mm_result_it, out_it); + execute_window_loop( + collapsed_window, + [&](const Coordinates &id) + { + const int batch_id = id.z() / depth_input; + const auto vector_sum_row_ptr = + reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + + id.y() + (id.z() % depth_input) * height_input; + run_offset_contribution_output_stage_window<Typer>( + nullptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it, result_offset_s32, result_shift_s32, + min_vec, max_vec, a_offset, b_offset, k_offset, multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x, false, true, false, is_bounded_relu, + is_fixed_point); + }, + vector_sum_row_it, mm_result_it, out_it); } } - else if((a_offset != 0) && (b_offset == 0)) + else if ((a_offset != 0) && (b_offset == 0)) { ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_col); Iterator vector_sum_col_it = get_vector_sum_col_it(collapsed_window, vector_sum_col); // Offset in case vector_sum_col is batched in y dimension - const int vector_sum_col_stride_batch = is_vector_sum_col_batched ? vector_sum_col->info()->strides_in_bytes().y() : 0; + const int vector_sum_col_stride_batch = + is_vector_sum_col_batched ? vector_sum_col->info()->strides_in_bytes().y() : 0; - if(bias != nullptr) + if (bias != nullptr) { Iterator bias_it = get_bias_it(collapsed_window, bias); - execute_window_loop(collapsed_window, [&](const Coordinates & id) - { - const int batch_id = id.z() / depth_input; - const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch); - run_offset_contribution_output_stage_window<Typer>(vector_sum_col_ptr, nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, - out_it, - result_offset_s32, result_shift_s32, - min_vec, max_vec, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x, true, false, true, is_bounded_relu, is_fixed_point); - }, - vector_sum_col_it, bias_it, mm_result_it, out_it); + execute_window_loop( + collapsed_window, + [&](const Coordinates &id) + { + const int batch_id = id.z() / depth_input; + const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>( + vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch); + run_offset_contribution_output_stage_window<Typer>( + vector_sum_col_ptr, nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, + out_it, result_offset_s32, result_shift_s32, min_vec, max_vec, a_offset, b_offset, k_offset, + multiplier, shift, offset, min_bound, max_bound, window_step_x, window_start_x, window_end_x, + true, false, true, is_bounded_relu, is_fixed_point); + }, + vector_sum_col_it, bias_it, mm_result_it, out_it); } else { - execute_window_loop(collapsed_window, [&](const Coordinates & id) - { - const int batch_id = id.z() / depth_input; - const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch); - run_offset_contribution_output_stage_window<Typer>(vector_sum_col_ptr, nullptr, nullptr, mm_result_it, out_it, - result_offset_s32, result_shift_s32, - min_vec, max_vec, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x, true, false, false, is_bounded_relu, is_fixed_point); - }, - vector_sum_col_it, mm_result_it, out_it); + 
execute_window_loop( + collapsed_window, + [&](const Coordinates &id) + { + const int batch_id = id.z() / depth_input; + const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>( + vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch); + run_offset_contribution_output_stage_window<Typer>( + vector_sum_col_ptr, nullptr, nullptr, mm_result_it, out_it, result_offset_s32, result_shift_s32, + min_vec, max_vec, a_offset, b_offset, k_offset, multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x, true, false, false, is_bounded_relu, + is_fixed_point); + }, + vector_sum_col_it, mm_result_it, out_it); } } else { - if(bias != nullptr) + if (bias != nullptr) { Iterator bias_it = get_bias_it(collapsed_window, bias); - execute_window_loop(collapsed_window, [&](const Coordinates &) - { - run_offset_contribution_output_stage_window<Typer>(nullptr, nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it, - result_offset_s32, result_shift_s32, - min_vec, max_vec, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x, false, false, true, is_bounded_relu, is_fixed_point); - }, - bias_it, mm_result_it, out_it); + execute_window_loop( + collapsed_window, + [&](const Coordinates &) + { + run_offset_contribution_output_stage_window<Typer>( + nullptr, nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it, + result_offset_s32, result_shift_s32, min_vec, max_vec, a_offset, b_offset, k_offset, multiplier, + shift, offset, min_bound, max_bound, window_step_x, window_start_x, window_end_x, false, false, + true, is_bounded_relu, is_fixed_point); + }, + bias_it, mm_result_it, out_it); } else { - execute_window_loop(collapsed_window, [&](const Coordinates &) - { - run_offset_contribution_output_stage_window<Typer>(nullptr, nullptr, nullptr, mm_result_it, out_it, - result_offset_s32, result_shift_s32, - min_vec, max_vec, a_offset, b_offset, k_offset, - multiplier, shift, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x, false, false, false, is_bounded_relu, is_fixed_point); - }, - mm_result_it, out_it); + execute_window_loop( + collapsed_window, + [&](const Coordinates &) + { + run_offset_contribution_output_stage_window<Typer>( + nullptr, nullptr, nullptr, mm_result_it, out_it, result_offset_s32, result_shift_s32, min_vec, + max_vec, a_offset, b_offset, k_offset, multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x, false, false, false, is_bounded_relu, + is_fixed_point); + }, + mm_result_it, out_it); } return; } } -void run_offset_contribution_output_stage_symm(const Window &window, - const ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, const ITensor *bias, ITensor *output, - int32_t a_offset, int32_t b_offset, int32_t k_offset, bool is_vector_sum_col_batched, - GEMMLowpOutputStageInfo output_stage, bool is_gemm3d, bool is_bounded_relu, bool is_fixed_point) +void run_offset_contribution_output_stage_symm(const Window &window, + const ITensor *mm_result, + const ITensor *vector_sum_col, + const ITensor *vector_sum_row, + const ITensor *bias, + ITensor *output, + int32_t a_offset, + int32_t b_offset, + int32_t k_offset, + bool is_vector_sum_col_batched, + GEMMLowpOutputStageInfo output_stage, + bool is_gemm3d, + bool is_bounded_relu, + bool is_fixed_point) { ARM_COMPUTE_UNUSED(vector_sum_row, b_offset, k_offset); @@ -690,8 +718,8 @@ void 
run_offset_contribution_output_stage_symm(const Window &window, const int32_t min_bound = output_stage.gemmlowp_min_bound; const int32_t max_bound = output_stage.gemmlowp_max_bound; - const int32_t *result_multipliers = output_stage.gemmlowp_multipliers.data(); - const int32_t *result_shifts = output_stage.gemmlowp_shifts.data(); + const int32_t *result_multipliers = output_stage.gemmlowp_multipliers.data(); + const int32_t *result_shifts = output_stage.gemmlowp_shifts.data(); const int32x4_t result_offset_s32 = vdupq_n_s32(offset); const int8x16_t min_s8 = vdupq_n_s8(static_cast<int8_t>(min_bound)); const int8x16_t max_s8 = vdupq_n_s8(static_cast<int8_t>(max_bound)); @@ -708,88 +736,105 @@ void run_offset_contribution_output_stage_symm(const Window &window, Iterator mm_result_it(mm_result, win); Iterator out_it(output, win); - if(a_offset != 0) + if (a_offset != 0) { ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_col); Iterator vector_sum_col_it = get_vector_sum_col_it(collapsed_window, vector_sum_col); // Offset in case vector_sum_col is batched in y dimension - const int vector_sum_col_stride_batch = is_vector_sum_col_batched ? vector_sum_col->info()->strides_in_bytes().y() : 0; + const int vector_sum_col_stride_batch = + is_vector_sum_col_batched ? vector_sum_col->info()->strides_in_bytes().y() : 0; - if(bias != nullptr) + if (bias != nullptr) { Iterator bias_it = get_bias_it(collapsed_window, bias); - execute_window_loop(collapsed_window, [&](const Coordinates & id) - { - const int batch_id = id.z() / depth_input; - const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch); - run_offset_contribution_output_stage_window_symm(vector_sum_col_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it, - result_multipliers, result_shifts, - result_offset_s32, min_s8, max_s8, - a_offset, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x, true, true, is_bounded_relu, is_fixed_point); - }, - vector_sum_col_it, bias_it, mm_result_it, out_it); + execute_window_loop( + collapsed_window, + [&](const Coordinates &id) + { + const int batch_id = id.z() / depth_input; + const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>( + vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch); + run_offset_contribution_output_stage_window_symm( + vector_sum_col_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it, + result_multipliers, result_shifts, result_offset_s32, min_s8, max_s8, a_offset, offset, + min_bound, max_bound, window_step_x, window_start_x, window_end_x, true, true, is_bounded_relu, + is_fixed_point); + }, + vector_sum_col_it, bias_it, mm_result_it, out_it); } else { - execute_window_loop(collapsed_window, [&](const Coordinates & id) - { - const int batch_id = id.z() / depth_input; - const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch); - run_offset_contribution_output_stage_window_symm(vector_sum_col_ptr, nullptr, mm_result_it, out_it, - result_multipliers, result_shifts, - result_offset_s32, min_s8, max_s8, - a_offset, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x, true, false, is_bounded_relu, is_fixed_point); - }, - vector_sum_col_it, mm_result_it, out_it); + execute_window_loop( + collapsed_window, + [&](const Coordinates &id) + { + const int batch_id = id.z() / depth_input; + const auto vector_sum_col_ptr = reinterpret_cast<const 
int32_t *>( + vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch); + run_offset_contribution_output_stage_window_symm( + vector_sum_col_ptr, nullptr, mm_result_it, out_it, result_multipliers, result_shifts, + result_offset_s32, min_s8, max_s8, a_offset, offset, min_bound, max_bound, window_step_x, + window_start_x, window_end_x, true, false, is_bounded_relu, is_fixed_point); + }, + vector_sum_col_it, mm_result_it, out_it); } } else { - if(bias != nullptr) + if (bias != nullptr) { Iterator bias_it = get_bias_it(collapsed_window, bias); - execute_window_loop(collapsed_window, [&](const Coordinates &) - { - run_offset_contribution_output_stage_window_symm(nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it, - result_multipliers, result_shifts, - result_offset_s32, min_s8, max_s8, - a_offset, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x, false, true, is_bounded_relu, is_fixed_point); - }, - bias_it, mm_result_it, out_it); + execute_window_loop( + collapsed_window, + [&](const Coordinates &) + { + run_offset_contribution_output_stage_window_symm( + nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it, + result_multipliers, result_shifts, result_offset_s32, min_s8, max_s8, a_offset, offset, + min_bound, max_bound, window_step_x, window_start_x, window_end_x, false, true, is_bounded_relu, + is_fixed_point); + }, + bias_it, mm_result_it, out_it); } else { - execute_window_loop(collapsed_window, [&](const Coordinates &) - { - run_offset_contribution_output_stage_window_symm(nullptr, nullptr, mm_result_it, out_it, - result_multipliers, result_shifts, - result_offset_s32, min_s8, max_s8, - a_offset, offset, min_bound, max_bound, - window_step_x, window_start_x, window_end_x, false, false, is_bounded_relu, is_fixed_point); - }, - mm_result_it, out_it); + execute_window_loop( + collapsed_window, + [&](const Coordinates &) + { + run_offset_contribution_output_stage_window_symm( + nullptr, nullptr, mm_result_it, out_it, result_multipliers, result_shifts, result_offset_s32, + min_s8, max_s8, a_offset, offset, min_bound, max_bound, window_step_x, window_start_x, + window_end_x, false, false, is_bounded_relu, is_fixed_point); + }, + mm_result_it, out_it); } return; } } -Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *output, - int32_t a_offset, int32_t b_offset, GEMMLowpOutputStageInfo output_stage) +Status validate_arguments(const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + const ITensorInfo *output, + int32_t a_offset, + int32_t b_offset, + GEMMLowpOutputStageInfo output_stage) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32); - if(output->data_type() != DataType::QASYMM8) + if (output->data_type() != DataType::QASYMM8) { - ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) > 1 && output_stage.gemmlowp_multipliers.size() > 1 && b_offset != 0); + ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) > 1 && output_stage.gemmlowp_multipliers.size() > 1 && + b_offset != 0); } ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound); - ARM_COMPUTE_RETURN_ERROR_ON(output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN && output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); + 
ARM_COMPUTE_RETURN_ERROR_ON(output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN && + output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); @@ -797,7 +842,7 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto } // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0) + if (a_offset != 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0)); @@ -805,19 +850,21 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto } // If b_offset == 0, vector_sum_row can be a nullptr - if(b_offset != 0) + if (b_offset != 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); // Check if input is a 3D reinterpretation - const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); + const bool reinterpret_as_3d = + mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); // Validate input - ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2))); + ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != + (mm_result->dimension(1) * mm_result->dimension(2))); ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1)); TensorShape output_shape = output->tensor_shape(); - if(output_shape.num_dimensions() > 1) + if (output_shape.num_dimensions() > 1) { const unsigned int output_batch_idx = reinterpret_as_3d ? 
3 : 2; @@ -828,13 +875,15 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx], "mm_result tensor must have the same number of batches of output tensor"); - if(a_offset != 0) + if (a_offset != 0) { TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape(); vector_sum_col_shape.collapse_from(1); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1], - "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && + vector_sum_col_shape[1] != vector_sum_row_shape[1], + "vector_sum_col tensor must have the same number of batches of " + "vector_sum_row_shape or the number of batches must be set to 1"); } } @@ -842,7 +891,7 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_row->num_dimensions() > 3); } - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mm_result, output); @@ -852,15 +901,21 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto } } // namespace -void CpuGemmLowpOffsetContributionOutputStageKernel::configure(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, - const ITensorInfo *vector_sum_row, const ITensorInfo *bias, ITensorInfo *dst, - int32_t k, int32_t a_offset, int32_t b_offset, +void CpuGemmLowpOffsetContributionOutputStageKernel::configure(const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + ITensorInfo *dst, + int32_t k, + int32_t a_offset, + int32_t b_offset, GEMMLowpOutputStageInfo output_stage) { ARM_COMPUTE_UNUSED(vector_sum_row, bias); // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, dst, a_offset, b_offset, output_stage)); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, dst, a_offset, b_offset, output_stage)); _a_offset = a_offset; _b_offset = b_offset; @@ -868,7 +923,7 @@ void CpuGemmLowpOffsetContributionOutputStageKernel::configure(const ITensorInfo _output_stage = output_stage; // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0) + if (a_offset != 0) { // Check if vector_sum_col_shape should be slidden or not // Don't slide vector_sum_col_shape along the y dimension if vector_sum_col_shape has just 1 dimension and vector_sum_row_shape more than 1 @@ -888,16 +943,24 @@ void CpuGemmLowpOffsetContributionOutputStageKernel::configure(const ITensorInfo ICpuKernel::configure(win); } -Status CpuGemmLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, - const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *output, - int32_t a_offset, int32_t b_offset, GEMMLowpOutputStageInfo output_stage) +Status CpuGemmLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + const ITensorInfo *output, + 
int32_t a_offset, + int32_t b_offset, + GEMMLowpOutputStageInfo output_stage) { ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, output); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, output, a_offset, b_offset, output_stage)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, output, a_offset, b_offset, output_stage)); return Status{}; } -void CpuGemmLowpOffsetContributionOutputStageKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +void CpuGemmLowpOffsetContributionOutputStageKernel::run_op(ITensorPack &tensors, + const Window &window, + const ThreadInfo &info) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); @@ -912,14 +975,14 @@ void CpuGemmLowpOffsetContributionOutputStageKernel::run_op(ITensorPack &tensors PixelValue type_min{}; PixelValue type_max{}; std::tie(type_min, type_max) = get_min_max(dst->info()->data_type()); - int32_t type_min_int = type_min.get<int32_t>(); - int32_t type_max_int = type_max.get<int32_t>(); + int32_t type_min_int = type_min.get<int32_t>(); + int32_t type_max_int = type_max.get<int32_t>(); - const bool reinterpret_as_3d = vector_sum_row != nullptr - && mm_result->info()->num_dimensions() > 1 - && mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x(); + const bool reinterpret_as_3d = vector_sum_row != nullptr && mm_result->info()->num_dimensions() > 1 && + mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x(); - const bool is_bounded_relu = !(_output_stage.gemmlowp_min_bound <= type_min_int && _output_stage.gemmlowp_max_bound >= type_max_int); + const bool is_bounded_relu = + !(_output_stage.gemmlowp_min_bound <= type_min_int && _output_stage.gemmlowp_max_bound >= type_max_int); // Check if we need to perform fixed point requantization const bool is_fixed_point = _output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN; @@ -930,22 +993,25 @@ void CpuGemmLowpOffsetContributionOutputStageKernel::run_op(ITensorPack &tensors // Check if symmetric per-channel execution const bool is_symm = _output_stage.is_quantized_per_channel; - if(is_symm) + if (is_symm) { - run_offset_contribution_output_stage_symm(window, mm_result, vector_sum_col, vector_sum_row, bias, dst, _a_offset, _b_offset, _k_offset, _is_vector_sum_col_batched, _output_stage, - reinterpret_as_3d, is_bounded_relu, is_fixed_point); + run_offset_contribution_output_stage_symm(window, mm_result, vector_sum_col, vector_sum_row, bias, dst, + _a_offset, _b_offset, _k_offset, _is_vector_sum_col_batched, + _output_stage, reinterpret_as_3d, is_bounded_relu, is_fixed_point); } else { - if(is_signed) + if (is_signed) { - run_offset_contribution_output_stage<int8_t>(window, mm_result, vector_sum_col, vector_sum_row, bias, dst, _a_offset, _b_offset, _k_offset, _is_vector_sum_col_batched, _output_stage, - reinterpret_as_3d, is_bounded_relu, is_fixed_point); + run_offset_contribution_output_stage<int8_t>( + window, mm_result, vector_sum_col, vector_sum_row, bias, dst, _a_offset, _b_offset, _k_offset, + _is_vector_sum_col_batched, _output_stage, reinterpret_as_3d, is_bounded_relu, is_fixed_point); } else { - run_offset_contribution_output_stage<uint8_t>(window, mm_result, vector_sum_col, vector_sum_row, bias, dst, _a_offset, _b_offset, _k_offset, _is_vector_sum_col_batched, _output_stage, - reinterpret_as_3d, is_bounded_relu, is_fixed_point); + run_offset_contribution_output_stage<uint8_t>( + window, 
mm_result, vector_sum_col, vector_sum_row, bias, dst, _a_offset, _b_offset, _k_offset, + _is_vector_sum_col_batched, _output_stage, reinterpret_as_3d, is_bounded_relu, is_fixed_point); } } } diff --git a/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h b/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h index 3cb99faee8..af477d4756 100644 --- a/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h +++ b/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_GEMMLOWP_OFFSETCONTRIBUTION_OUTPUTSTAGE_KERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" @@ -85,7 +86,13 @@ public: * @param[in] b_offset Offset to be added to each element of the matrix B. * @param[in] output_stage GEMMLowp output stage info, providing the type of quantization and the necessary parameters. */ - void configure(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, ITensorInfo *dst, int32_t k, int32_t a_offset, + void configure(const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + ITensorInfo *dst, + int32_t k, + int32_t a_offset, int32_t b_offset, GEMMLowpOutputStageInfo output_stage); /** Static function to check if given info will lead to a valid configuration @@ -94,21 +101,26 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *dst, int32_t a_offset, + static Status validate(const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + const ITensorInfo *dst, + int32_t a_offset, int32_t b_offset, GEMMLowpOutputStageInfo output_stage); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; private: /** Function to use for the particular tensors passed to configure() */ - int32_t _a_offset{ 0 }; - int32_t _b_offset{ 0 }; - int32_t _k_offset{ 0 }; - bool _is_vector_sum_col_batched{ true }; - GEMMLowpOutputStageInfo _output_stage{ GEMMLowpOutputStageInfo() }; + int32_t _a_offset{0}; + int32_t _b_offset{0}; + int32_t _k_offset{0}; + bool _is_vector_sum_col_batched{true}; + GEMMLowpOutputStageInfo _output_stage{GEMMLowpOutputStageInfo()}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.cpp b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.cpp index 3023d93113..eefc294700 100644 --- a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.cpp +++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.cpp @@ -28,13 +28,14 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" + #include "src/core/AccessWindowStatic.h" -#include "src/core/NEON/wrapper/wrapper.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include 
"src/core/NEON/wrapper/wrapper.h" #include <arm_neon.h> @@ -46,26 +47,35 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage) +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo *output_stage) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON(output_stage->gemmlowp_max_bound > std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))); - ARM_COMPUTE_RETURN_ERROR_ON(output_stage->gemmlowp_min_bound < std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)) - || output_stage->gemmlowp_min_bound > output_stage->gemmlowp_max_bound); + ARM_COMPUTE_RETURN_ERROR_ON( + output_stage->gemmlowp_max_bound > + std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))); + ARM_COMPUTE_RETURN_ERROR_ON( + output_stage->gemmlowp_min_bound < + std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)) || + output_stage->gemmlowp_min_bound > output_stage->gemmlowp_max_bound); // Check biases if exist - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0)); } - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - if(dst->data_type() != output_stage->output_data_type && (output_stage->output_data_type == DataType::QASYMM8 || output_stage->output_data_type == DataType::QASYMM8_SIGNED)) + if (dst->data_type() != output_stage->output_data_type && + (output_stage->output_data_type == DataType::QASYMM8 || + output_stage->output_data_type == DataType::QASYMM8_SIGNED)) { ARM_COMPUTE_RETURN_ERROR_MSG("Mismatching data types"); } @@ -92,24 +102,26 @@ inline void scale_input(int32x4x4_t &in_s32, int32x4_t result_offset_s32, int32_ } template <typename T> -inline typename std::enable_if<std::is_same<T, uint8_t>::value, - typename wrapper::traits::neon_vector<T, 16>::type>::type - convert_to_8bit(const int16x8x2_t in_s16) +inline + typename std::enable_if<std::is_same<T, uint8_t>::value, typename wrapper::traits::neon_vector<T, 16>::type>::type + convert_to_8bit(const int16x8x2_t in_s16) { return wrapper::vcombine(wrapper::vqmovun(in_s16.val[0]), wrapper::vqmovun(in_s16.val[1])); } template <typename T> -inline typename std::enable_if<std::is_same<T, int8_t>::value, - typename wrapper::traits::neon_vector<T, 16>::type>::type - convert_to_8bit(const int16x8x2_t in_s16) +inline typename std::enable_if<std::is_same<T, int8_t>::value, typename wrapper::traits::neon_vector<T, 16>::type>::type +convert_to_8bit(const int16x8x2_t in_s16) { return wrapper::vcombine(wrapper::vqmovn(in_s16.val[0]), wrapper::vqmovn(in_s16.val[1])); } template <typename T> -inline typename wrapper::traits::neon_vector<T, 16>::type finalize_quantization(int32x4x4_t &in_s32, int32x4_t result_shift_s32, typename wrapper::traits::neon_vector<T, 16>::type min, - typename wrapper::traits::neon_vector<T, 16>::type max) +inline typename wrapper::traits::neon_vector<T, 16>::type +finalize_quantization(int32x4x4_t &in_s32, + int32x4_t result_shift_s32, + typename wrapper::traits::neon_vector<T, 
16>::type min, + typename wrapper::traits::neon_vector<T, 16>::type max) { // Shift final result (negative value shift right) in_s32.val[0] = vshlq_s32(in_s32.val[0], result_shift_s32); @@ -118,13 +130,8 @@ inline typename wrapper::traits::neon_vector<T, 16>::type finalize_quantization( in_s32.val[3] = vshlq_s32(in_s32.val[3], result_shift_s32); // Convert S32 to S16 - const int16x8x2_t in_s16 = - { - { - vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), - vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3])) - } - }; + const int16x8x2_t in_s16 = {{vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), + vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))}}; // Convert S16 to S8 or U8 typename wrapper::traits::neon_vector<T, 16>::type out = convert_to_8bit<T>(in_s16); @@ -137,7 +144,10 @@ inline typename wrapper::traits::neon_vector<T, 16>::type finalize_quantization( } // namespace template <typename T> -void CpuGemmLowpQuantizeDownInt32ScaleKernel::run_internal(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window) +void CpuGemmLowpQuantizeDownInt32ScaleKernel::run_internal(const ITensor *src, + const ITensor *bias, + ITensor *dst, + const Window &window) { using VectorType = typename wrapper::traits::neon_vector<T, 16>::type; @@ -159,107 +169,105 @@ void CpuGemmLowpQuantizeDownInt32ScaleKernel::run_internal(const ITensor *src, c Iterator in(src, win); Iterator out(dst, win); - if(bias != nullptr) + if (bias != nullptr) { Window win_biases; win_biases.set(Window::DimX, Window::Dimension(0, 1, 1)); win_biases.set(Window::DimY, Window::Dimension(0, 1, 1)); Iterator bias_i(bias, win_biases); - execute_window_loop(win, [&](const Coordinates &) - { - // Compute 16 elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - int32x4x4_t in_s32 = + // Compute 16 elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0), - vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4), - vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8), - vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12) - } - }; - - const int32x4x4_t bias_s32 = + int32x4x4_t in_s32 = {{vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0), + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4), + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8), + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)}}; + + const int32x4x4_t bias_s32 = { + {vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 0), + vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 4), + vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 8), + vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 12)}}; + + // Add the bias to GEMM's result + in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]); + in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]); + in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]); + in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]); + + // Add the offset terms to GEMM's result and multiply by result_mult_int + scale_input(in_s32, result_offset_s32, _output_stage->gemmlowp_multiplier); + + wrapper::vstore(reinterpret_cast<T *>(out.ptr() + x), + finalize_quantization<T>(in_s32, 
result_shift_s32, min, max)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) { - { - vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 0), - vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 4), - vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 8), - vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 12) - } - }; - - // Add the bias to GEMM's result - in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]); - in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]); - in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]); - in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]); - - // Add the offset terms to GEMM's result and multiply by result_mult_int - scale_input(in_s32, result_offset_s32, _output_stage->gemmlowp_multiplier); - - wrapper::vstore(reinterpret_cast<T *>(out.ptr() + x), finalize_quantization<T>(in_s32, result_shift_s32, min, max)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const int bias_value = *(reinterpret_cast<const int *>(bias_i.ptr()) + x); - int in_value = *(reinterpret_cast<const int *>(in.ptr()) + x); - - // Quantize - in_value = ((in_value + bias_value + _output_stage->gemmlowp_offset) * _output_stage->gemmlowp_multiplier) >> _output_stage->gemmlowp_shift; - - // Store the result - *(out.ptr() + x) = static_cast<T>(utility::clamp<int>(in_value, clamp_min, clamp_max)); - } - }, - in, bias_i, out); + const int bias_value = *(reinterpret_cast<const int *>(bias_i.ptr()) + x); + int in_value = *(reinterpret_cast<const int *>(in.ptr()) + x); + + // Quantize + in_value = ((in_value + bias_value + _output_stage->gemmlowp_offset) * + _output_stage->gemmlowp_multiplier) >> + _output_stage->gemmlowp_shift; + + // Store the result + *(out.ptr() + x) = static_cast<T>(utility::clamp<int>(in_value, clamp_min, clamp_max)); + } + }, + in, bias_i, out); } else { - execute_window_loop(win, [&](const Coordinates &) - { - // Compute 16 elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - int32x4x4_t in_s32 = + // Compute 16 elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0), - vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4), - vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8), - vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12) - } - }; - - // Add the offset terms to GEMM's result and multiply by result_mult_int - scale_input(in_s32, result_offset_s32, _output_stage->gemmlowp_multiplier); - - wrapper::vstore(reinterpret_cast<T *>(out.ptr() + x), finalize_quantization<T>(in_s32, result_shift_s32, min, max)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - int in_value = *(reinterpret_cast<const int *>(in.ptr()) + x); + int32x4x4_t in_s32 = {{vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0), + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4), + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8), + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)}}; + + // Add the offset terms to GEMM's result and multiply by result_mult_int + scale_input(in_s32, result_offset_s32, _output_stage->gemmlowp_multiplier); + + wrapper::vstore(reinterpret_cast<T *>(out.ptr() + x), + 
finalize_quantization<T>(in_s32, result_shift_s32, min, max)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + int in_value = *(reinterpret_cast<const int *>(in.ptr()) + x); - // Quantize - in_value = ((in_value + _output_stage->gemmlowp_offset) * _output_stage->gemmlowp_multiplier) >> _output_stage->gemmlowp_shift; + // Quantize + in_value = ((in_value + _output_stage->gemmlowp_offset) * _output_stage->gemmlowp_multiplier) >> + _output_stage->gemmlowp_shift; - // Store the result - *(out.ptr() + x) = static_cast<T>(utility::clamp<int>(in_value, clamp_min, clamp_max)); - } - }, - in, out); + // Store the result + *(out.ptr() + x) = static_cast<T>(utility::clamp<int>(in_value, clamp_min, clamp_max)); + } + }, + in, out); } } -void CpuGemmLowpQuantizeDownInt32ScaleKernel::configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage) +void CpuGemmLowpQuantizeDownInt32ScaleKernel::configure(ITensorInfo *src, + ITensorInfo *bias, + ITensorInfo *dst, + const GEMMLowpOutputStageInfo *output_stage) { ARM_COMPUTE_UNUSED(bias); // Perform validate step @@ -268,10 +276,7 @@ void CpuGemmLowpQuantizeDownInt32ScaleKernel::configure(ITensorInfo *src, ITenso // Output auto inizialitation if not yet initialized auto_init_if_empty(*dst, src->clone()->set_data_type(output_stage->output_data_type)); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, - bias, - dst, - output_stage)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, output_stage)); _output_stage = output_stage; @@ -281,14 +286,17 @@ void CpuGemmLowpQuantizeDownInt32ScaleKernel::configure(ITensorInfo *src, ITenso ICpuKernel::configure(win); // Check if we need to clamp the result using min and max - _is_bounded_relu = ((_output_stage->gemmlowp_min_bound != _output_stage->gemmlowp_max_bound) - && !(_output_stage->gemmlowp_min_bound == std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)) - && _output_stage->gemmlowp_max_bound == std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)))); - if(_output_stage->output_data_type == DataType::QASYMM8) + _is_bounded_relu = + ((_output_stage->gemmlowp_min_bound != _output_stage->gemmlowp_max_bound) && + !(_output_stage->gemmlowp_min_bound == + std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)) && + _output_stage->gemmlowp_max_bound == + std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)))); + if (_output_stage->output_data_type == DataType::QASYMM8) { _func = &CpuGemmLowpQuantizeDownInt32ScaleKernel::run_internal<uint8_t>; } - else if(_output_stage->output_data_type == DataType::QASYMM8_SIGNED) + else if (_output_stage->output_data_type == DataType::QASYMM8_SIGNED) { _func = &CpuGemmLowpQuantizeDownInt32ScaleKernel::run_internal<int8_t>; } @@ -298,7 +306,10 @@ void CpuGemmLowpQuantizeDownInt32ScaleKernel::configure(ITensorInfo *src, ITenso } } -Status CpuGemmLowpQuantizeDownInt32ScaleKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage) +Status CpuGemmLowpQuantizeDownInt32ScaleKernel::validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo *output_stage) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, output_stage)); return Status{}; @@ -323,4 +334,4 @@ const 
char *CpuGemmLowpQuantizeDownInt32ScaleKernel::name() const } } // namespace kernels } // namespace cpu -} // namespace arm_compute
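For reference, the per-element requantization that CpuGemmLowpQuantizeDownInt32ScaleKernel applies (mirroring the scalar "left-over elements" loop shown above) is, as a minimal standalone sketch with made-up parameter values rather than a real GEMMLowpOutputStageInfo:

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    int main()
    {
        // Hypothetical stage parameters; in the kernel they come from GEMMLowpOutputStageInfo.
        const int32_t gemmlowp_offset     = 4;
        const int32_t gemmlowp_multiplier = 3;
        const int32_t gemmlowp_shift      = 8;
        const int32_t clamp_min           = 0;   // natural range of QASYMM8 output
        const int32_t clamp_max           = 255;

        const int32_t acc  = 23456; // one S32 GEMM accumulator
        const int32_t bias = 128;   // optional per-channel bias term

        // Same arithmetic as the kernel's scalar path: add bias and offset,
        // multiply by the integer multiplier, shift right, then clamp to the
        // output type's range before the narrowing cast.
        int32_t v = ((acc + bias + gemmlowp_offset) * gemmlowp_multiplier) >> gemmlowp_shift;
        v         = std::min(std::max(v, clamp_min), clamp_max);

        std::printf("quantized value: %d\n", v); // 23588 * 3 >> 8 = 276, clamped to 255
        return 0;
    }

The vectorised path in the kernel does the same computation 16 elements at a time with NEON loads/stores. Whether clamping is applied at all is decided at configure() time: the fixed-point kernels further down derive is_bounded_relu as, for example, !(min <= -128 && max >= 127) for QASYMM8_SIGNED output, i.e. the clamp is skipped when the requested [min, max] interval already covers the output type's full range.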
\ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h index c7813edcd7..33e296b251 100644 --- a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h +++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_KERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" @@ -71,10 +72,13 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage); + static Status validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo *output_stage); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; private: @@ -95,11 +99,14 @@ private: * @param[out] dst Output tensor info * @param[in] window Region on which to execute the kernel. */ - using QuantizeDownFunctionPtr = void (CpuGemmLowpQuantizeDownInt32ScaleKernel::*)(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window); + using QuantizeDownFunctionPtr = void (CpuGemmLowpQuantizeDownInt32ScaleKernel::*)(const ITensor *src, + const ITensor *bias, + ITensor *dst, + const Window &window); - QuantizeDownFunctionPtr _func{ nullptr }; - const GEMMLowpOutputStageInfo *_output_stage{ nullptr }; - bool _is_bounded_relu{ false }; + QuantizeDownFunctionPtr _func{nullptr}; + const GEMMLowpOutputStageInfo *_output_stage{nullptr}; + bool _is_bounded_relu{false}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp index 53ca991889..a5c09c9977 100644 --- a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp +++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp @@ -29,12 +29,13 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/NEON/NESymm.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NESymm.h" #include <arm_neon.h> @@ -53,14 +54,14 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ARM_COMPUTE_RETURN_ERROR_ON(min > max); // Check biases if exist - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0)); } - if(dst->total_size() != 0) + if (dst->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QSYMM16); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, src); @@ -71,7 +72,10 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const } // namespace template 
<bool is_bounded_relu> -void CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_internal(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window) +void CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_internal(const ITensor *src, + const ITensor *bias, + ITensor *dst, + const Window &window) { const int16x8_t min_s16 = vdupq_n_s16(static_cast<int16_t>(_min)); const int16x8_t max_s16 = vdupq_n_s16(static_cast<int16_t>(_max)); @@ -88,92 +92,92 @@ void CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_internal(co Iterator in(src, win_collapsed); Iterator out(dst, win_collapsed); - if(bias != nullptr) + if (bias != nullptr) { Window win_biases; win_biases.set(Window::DimX, Window::Dimension(0, 1, 1)); win_biases.set(Window::DimY, Window::Dimension(0, 1, 1)); Iterator bias_i(bias, win_biases); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - // Compute 16 elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { - int32x4x2_t in_s32 = + // Compute 16 elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0), - vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4) - } - }; + int32x4x2_t in_s32 = {{vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0), + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4)}}; - const int32x4x2_t bias_s32 = - { - { - vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 0), - vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 4) - } - }; + const int32x4x2_t bias_s32 = {{vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 0), + vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 4)}}; - // Add the bias to GEMM's result - in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]); - in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]); + // Add the bias to GEMM's result + in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]); + in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]); - vst1q_s16(reinterpret_cast<int16_t *>(out.ptr()) + x, finalize_quantization_int16<is_bounded_relu>(in_s32, _result_fixedpoint_multiplier, _result_shift, min_s16, max_s16)); - } + vst1q_s16(reinterpret_cast<int16_t *>(out.ptr()) + x, + finalize_quantization_int16<is_bounded_relu>(in_s32, _result_fixedpoint_multiplier, + _result_shift, min_s16, max_s16)); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const int32_t bias_value = *(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x); - int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x); - - // Add bias - in_value += bias_value; - // Finalize and store the result - *(reinterpret_cast<int16_t *>(out.ptr()) + x) = finalize_quantization_int16<is_bounded_relu>(in_value, _result_fixedpoint_multiplier, _result_shift, static_cast<int16_t>(_min), - static_cast<int16_t>(_max)); - } - }, - in, out, bias_i); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const int32_t bias_value = *(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x); + int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x); + + // Add bias + in_value += bias_value; + // Finalize and store the result + *(reinterpret_cast<int16_t *>(out.ptr()) + x) = 
finalize_quantization_int16<is_bounded_relu>( + in_value, _result_fixedpoint_multiplier, _result_shift, static_cast<int16_t>(_min), + static_cast<int16_t>(_max)); + } + }, + in, out, bias_i); } else { - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - // Compute 16 elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { - int32x4x2_t in_s32 = + // Compute 16 elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0), - vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4) - } - }; + int32x4x2_t in_s32 = {{vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0), + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4)}}; - vst1q_s16(reinterpret_cast<int16_t *>(out.ptr()) + x, finalize_quantization_int16<is_bounded_relu>(in_s32, _result_fixedpoint_multiplier, _result_shift, min_s16, max_s16)); - } + vst1q_s16(reinterpret_cast<int16_t *>(out.ptr()) + x, + finalize_quantization_int16<is_bounded_relu>(in_s32, _result_fixedpoint_multiplier, + _result_shift, min_s16, max_s16)); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x); - ARM_COMPUTE_UNUSED(in_value); - // Finalize and store the result - *(reinterpret_cast<int16_t *>(out.ptr()) + x) = finalize_quantization_int16<is_bounded_relu>(in_value, _result_fixedpoint_multiplier, _result_shift, static_cast<int16_t>(_min), - static_cast<int16_t>(_max)); - } - }, - in, out); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x); + ARM_COMPUTE_UNUSED(in_value); + // Finalize and store the result + *(reinterpret_cast<int16_t *>(out.ptr()) + x) = finalize_quantization_int16<is_bounded_relu>( + in_value, _result_fixedpoint_multiplier, _result_shift, static_cast<int16_t>(_min), + static_cast<int16_t>(_max)); + } + }, + in, out); } } -void CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, int result_fixedpoint_multiplier, int result_shift, - int min, int max) +void CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::configure(ITensorInfo *src, + ITensorInfo *bias, + ITensorInfo *dst, + int result_fixedpoint_multiplier, + int result_shift, + int min, + int max) { // Perform validate step ARM_COMPUTE_UNUSED(bias, dst); @@ -193,18 +197,21 @@ void CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::configure(ITens // Check if we need to clamp the result using min and max const bool is_bounded_relu = !(min <= -32768 && max >= 32767); - _func = is_bounded_relu ? &CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_internal<true> : - &CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_internal<false>; + _func = is_bounded_relu ? 
&CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_internal<true> + : &CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_internal<false>; } -Status CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max) +Status CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate( + const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, min, max)); return Status{}; } -void CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +void CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_op(ITensorPack &tensors, + const Window &window, + const ThreadInfo &info) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h index 681d099695..925788b680 100644 --- a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h +++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32TOINT16_SCALEBYFIXEDPOINT_KERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" @@ -48,7 +49,8 @@ namespace kernels * -# Clamp the resulting int32 values to the [-32768, 32767] range and cast to QSYMM16. * */ -class CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel : public ICpuKernel<CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel> +class CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel + : public ICpuKernel<CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel> { public: CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel() = default; @@ -65,17 +67,24 @@ public: * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QSYMM16. * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to 0. */ - void configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, int result_fixedpoint_multiplier, int result_shift, int min = 0, int max = 0); + void configure(ITensorInfo *src, + ITensorInfo *bias, + ITensorInfo *dst, + int result_fixedpoint_multiplier, + int result_shift, + int min = 0, + int max = 0); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min = 0, int max = 0); + static Status + validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min = 0, int max = 0); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; private: @@ -97,13 +106,13 @@ private: * @param[in] window Region on which to execute the kernel. 
*/ using QuantizeDownFunctionPtr = void (CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::*)( - const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window); + const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window); - QuantizeDownFunctionPtr _func{ nullptr }; - int _result_fixedpoint_multiplier{ 0 }; - int _result_shift{ 0 }; - int _min{ 0 }; - int _max{ 0 }; + QuantizeDownFunctionPtr _func{nullptr}; + int _result_fixedpoint_multiplier{0}; + int _result_shift{0}; + int _min{0}; + int _max{0}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp index 27214dcb5a..0e58097073 100644 --- a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp +++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp @@ -29,12 +29,13 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/NEON/NEAsymm.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NEAsymm.h" #include <arm_neon.h> @@ -53,14 +54,14 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ARM_COMPUTE_RETURN_ERROR_ON(min > max); // Check biases if exist - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0)); } - if(dst->total_size() != 0) + if (dst->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, src); @@ -71,7 +72,10 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const } // namespace template <bool is_bounded_relu> -void CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_internal(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window) +void CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_internal(const ITensor *src, + const ITensor *bias, + ITensor *dst, + const Window &window) { const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(_result_offset_after_shift); const int8x16_t min_s8 = vdupq_n_s8(static_cast<int8_t>(_min)); @@ -88,102 +92,102 @@ void CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_internal(con Iterator in(src, win_collapsed); Iterator out(dst, win_collapsed); - if(bias != nullptr) + if (bias != nullptr) { Window win_biases; win_biases.set(Window::DimX, Window::Dimension(0, 1, 1)); win_biases.set(Window::DimY, Window::Dimension(0, 1, 1)); Iterator bias_i(bias, win_biases); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - // Compute 16 elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { - int32x4x4_t in_s32 = + // Compute 16 elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - 
vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0), - vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4), - vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8), - vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12) - } - }; - - const int32x4x4_t bias_s32 = + int32x4x4_t in_s32 = {{vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0), + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4), + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8), + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)}}; + + const int32x4x4_t bias_s32 = { + {vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 0), + vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 4), + vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 8), + vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 12)}}; + + // Add the bias to GEMM's result + in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]); + in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]); + in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]); + in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]); + + vst1q_s8(reinterpret_cast<int8_t *>(out.ptr() + x), + finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, + result_offset_after_shift_s32, min_s8, max_s8, is_bounded_relu)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) { - { - vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 0), - vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 4), - vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 8), - vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 12) - } - }; - - // Add the bias to GEMM's result - in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]); - in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]); - in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]); - in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]); - - vst1q_s8(reinterpret_cast<int8_t *>(out.ptr() + x), - finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_s8, max_s8, is_bounded_relu)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const int32_t bias_value = *(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x); - int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x); - - // Add bias - in_value += bias_value; - // Finalize and store the result - *reinterpret_cast<int8_t *>(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, - static_cast<int8_t>(_min), static_cast<int8_t>(_max), is_bounded_relu); - } - }, - in, out, bias_i); + const int32_t bias_value = *(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x); + int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x); + + // Add bias + in_value += bias_value; + // Finalize and store the result + *reinterpret_cast<int8_t *>(out.ptr() + x) = finalize_quantization( + in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, + static_cast<int8_t>(_min), static_cast<int8_t>(_max), is_bounded_relu); + } + }, + in, out, bias_i); } else { - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - // Compute 16 elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win_collapsed, + 
[&](const Coordinates &) { - int32x4x4_t in_s32 = + // Compute 16 elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0), - vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4), - vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8), - vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12) - } - }; - - vst1q_s8(reinterpret_cast<int8_t *>(out.ptr() + x), - finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_s8, max_s8, is_bounded_relu)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x); - - // Finalize and store the result - *reinterpret_cast<int8_t *>(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, - static_cast<int8_t>(_min), static_cast<int8_t>(_max), is_bounded_relu); - } - }, - in, out); + int32x4x4_t in_s32 = {{vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0), + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4), + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8), + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)}}; + + vst1q_s8(reinterpret_cast<int8_t *>(out.ptr() + x), + finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, + result_offset_after_shift_s32, min_s8, max_s8, is_bounded_relu)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x); + + // Finalize and store the result + *reinterpret_cast<int8_t *>(out.ptr() + x) = finalize_quantization( + in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, + static_cast<int8_t>(_min), static_cast<int8_t>(_max), is_bounded_relu); + } + }, + in, out); } } -void CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, int result_fixedpoint_multiplier, int result_shift, - int result_offset_after_shift, int min, int max) +void CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::configure(ITensorInfo *src, + ITensorInfo *bias, + ITensorInfo *dst, + int result_fixedpoint_multiplier, + int result_shift, + int result_offset_after_shift, + int min, + int max) { ARM_COMPUTE_UNUSED(bias); // Perform validate step @@ -205,18 +209,21 @@ void CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::configure(ITenso // Check if we need to clamp the result using min and max const bool is_bounded_relu = !(min <= -128 && max >= 127); - _func = is_bounded_relu ? &CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_internal<true> : - &CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_internal<false>; + _func = is_bounded_relu ? 
&CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_internal<true> + : &CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_internal<false>; } -Status CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min, int max) +Status CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate( + const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min, int max) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, min, max)); return Status{}; } -void CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +void CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_op(ITensorPack &tensors, + const Window &window, + const ThreadInfo &info) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h index 3e615b935e..6a67ba4f19 100644 --- a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h +++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32TOINT8_SCALEBYFIXEDPOINT_KERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" @@ -49,7 +50,8 @@ namespace kernels * -# Clamp the resulting int32 values to the [-128..127] range and cast to QASYMM8_SIGNED. * */ -class CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel : public ICpuKernel<CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel> +class CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel + : public ICpuKernel<CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel> { public: CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel() = default; @@ -67,17 +69,25 @@ public: * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED, * Along with @p min, this value can be used to implement "rectified linear unit" activation functions */ - void configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, int min = 0, int max = 0); + void configure(ITensorInfo *src, + ITensorInfo *bias, + ITensorInfo *dst, + int result_fixedpoint_multiplier, + int result_shift, + int result_offset_after_shift, + int min = 0, + int max = 0); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min = 0, int max = 0); + static Status + validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min = 0, int max = 0); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; private: @@ -99,14 +109,14 @@ private: * @param[in] window Region on which to execute the kernel. 
*/ using QuantizeDownFunctionPtr = void (CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::*)( - const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window); + const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window); - QuantizeDownFunctionPtr _func{ nullptr }; - int _result_fixedpoint_multiplier{ 0 }; - int _result_shift{ 0 }; - int _result_offset_after_shift{ 0 }; - int _min{ 0 }; - int _max{ 0 }; + QuantizeDownFunctionPtr _func{nullptr}; + int _result_fixedpoint_multiplier{0}; + int _result_shift{0}; + int _result_offset_after_shift{0}; + int _min{0}; + int _max{0}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp index e49fd29115..e3dd2240ca 100644 --- a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp +++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp @@ -29,12 +29,13 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/NEON/NEAsymm.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NEAsymm.h" #include <arm_neon.h> @@ -53,14 +54,14 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ARM_COMPUTE_RETURN_ERROR_ON(min > max); // Check biases if exist - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0)); } - if(dst->total_size() != 0) + if (dst->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, src); @@ -71,7 +72,10 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const } // namespace template <bool is_bounded_relu> -void CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_internal(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window) +void CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_internal(const ITensor *src, + const ITensor *bias, + ITensor *dst, + const Window &window) { const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(_result_offset_after_shift); const uint8x16_t min_u8 = vdupq_n_u8(static_cast<uint8_t>(_min)); @@ -89,98 +93,102 @@ void CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_internal(co Iterator in(src, win_collapsed); Iterator out(dst, win_collapsed); - if(bias != nullptr) + if (bias != nullptr) { Window win_biases; win_biases.set(Window::DimX, Window::Dimension(0, 1, 1)); win_biases.set(Window::DimY, Window::Dimension(0, 1, 1)); Iterator bias_i(bias, win_biases); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - // Compute 16 elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { - int32x4x4_t in_s32 = + // Compute 16 elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - 
window_step_x); x += window_step_x) { - { - vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0), - vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4), - vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8), - vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12) - } - }; - - const int32x4x4_t bias_s32 = + int32x4x4_t in_s32 = {{vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0), + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4), + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8), + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)}}; + + const int32x4x4_t bias_s32 = { + {vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 0), + vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 4), + vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 8), + vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 12)}}; + + // Add the bias to GEMM's result + in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]); + in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]); + in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]); + in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]); + + vst1q_u8(out.ptr() + x, + finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, + result_offset_after_shift_s32, min_u8, max_u8, is_bounded_relu)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) { - { - vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 0), - vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 4), - vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 8), - vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 12) - } - }; - - // Add the bias to GEMM's result - in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]); - in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]); - in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]); - in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]); - - vst1q_u8(out.ptr() + x, finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_u8, max_u8, is_bounded_relu)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const int32_t bias_value = *(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x); - int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x); - - // Add bias - in_value += bias_value; - // Finalize and store the result - *(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, static_cast<uint8_t>(_min), static_cast<uint8_t>(_max), is_bounded_relu); - } - }, - in, out, bias_i); + const int32_t bias_value = *(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x); + int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x); + + // Add bias + in_value += bias_value; + // Finalize and store the result + *(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, + _result_offset_after_shift, static_cast<uint8_t>(_min), + static_cast<uint8_t>(_max), is_bounded_relu); + } + }, + in, out, bias_i); } else { - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - // Compute 16 elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { - int32x4x4_t in_s32 = + // Compute 16 
elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0), - vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4), - vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8), - vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12) - } - }; - - vst1q_u8(out.ptr() + x, finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_u8, max_u8, is_bounded_relu)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x); - - // Finalize and store the result - *(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, static_cast<uint8_t>(_min), static_cast<uint8_t>(_max), is_bounded_relu); - } - }, - in, out); + int32x4x4_t in_s32 = {{vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0), + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4), + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8), + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)}}; + + vst1q_u8(out.ptr() + x, + finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, + result_offset_after_shift_s32, min_u8, max_u8, is_bounded_relu)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x); + + // Finalize and store the result + *(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, + _result_offset_after_shift, static_cast<uint8_t>(_min), + static_cast<uint8_t>(_max), is_bounded_relu); + } + }, + in, out); } } -void CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, int result_fixedpoint_multiplier, int result_shift, - int result_offset_after_shift, int min, int max) +void CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::configure(ITensorInfo *src, + ITensorInfo *bias, + ITensorInfo *dst, + int result_fixedpoint_multiplier, + int result_shift, + int result_offset_after_shift, + int min, + int max) { ARM_COMPUTE_UNUSED(bias); // Perform validate step @@ -202,18 +210,21 @@ void CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::configure(ITens // Check if we need to clamp the result using min and max const bool is_bounded_relu = !(min <= 0 && max >= 255); - _func = is_bounded_relu ? &CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_internal<true> : - &CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_internal<false>; + _func = is_bounded_relu ? 
&CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_internal<true> + : &CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_internal<false>; } -Status CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min, int max) +Status CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate( + const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min, int max) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, min, max)); return Status{}; } -void CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +void CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_op(ITensorPack &tensors, + const Window &window, + const ThreadInfo &info) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); @@ -233,4 +244,4 @@ const char *CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::name() c } } // namespace kernels } // namespace cpu -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h index b773fdfdcf..45bd742a70 100644 --- a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h +++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32TOUINT8_SCALEBYFIXEDPOINT_KERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" @@ -49,7 +50,8 @@ namespace kernels * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8. * */ -class CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel : public ICpuKernel<CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel> +class CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel + : public ICpuKernel<CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel> { public: CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel() = default; @@ -67,17 +69,25 @@ public: * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8, * Along with @p min, this value can be used to implement "rectified linear unit" activation functions */ - void configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, int min = 0, int max = 0); + void configure(ITensorInfo *src, + ITensorInfo *bias, + ITensorInfo *dst, + int result_fixedpoint_multiplier, + int result_shift, + int result_offset_after_shift, + int min = 0, + int max = 0); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min = 0, int max = 0); + static Status + validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min = 0, int max = 0); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; private: @@ -93,14 +103,14 @@ private: * @param[in] window Region on which to execute the kernel. 
*/ using QuantizeDownFunctionPtr = void (CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::*)( - const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window); + const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window); - QuantizeDownFunctionPtr _func{ nullptr }; - int _result_fixedpoint_multiplier{ 0 }; - int _result_shift{ 0 }; - int _result_offset_after_shift{ 0 }; - int _min{ 0 }; - int _max{ 0 }; + QuantizeDownFunctionPtr _func{nullptr}; + int _result_fixedpoint_multiplier{0}; + int _result_shift{0}; + int _result_offset_after_shift{0}; + int _min{0}; + int _max{0}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuGemmMatrixAdditionKernel.cpp b/src/cpu/kernels/CpuGemmMatrixAdditionKernel.cpp index 6399ebbef4..fb1b70b91f 100644 --- a/src/cpu/kernels/CpuGemmMatrixAdditionKernel.cpp +++ b/src/cpu/kernels/CpuGemmMatrixAdditionKernel.cpp @@ -26,11 +26,12 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" -#include "src/core/CPP/Validate.h" -#include "src/core/NEON/NEFixedPoint.h" + #include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NEFixedPoint.h" #include "src/cpu/kernels/gemm_matrix_add/list.h" namespace arm_compute { @@ -40,24 +41,12 @@ namespace kernels { namespace { -static const std::vector<CpuGemmMatrixAdditionKernel::GemmMatrixAddKernel> available_kernels = -{ - { - "neon_fp32_gemm_matrix_add", - [](const DataTypeISASelectorData & data) - { - return (data.dt == DataType::F32); - }, - REGISTER_FP32_NEON(neon_fp32_gemm_matrix_add) - }, - { - "neon_fp16_gemm_matrix_add", - [](const DataTypeISASelectorData & data) - { - return (data.dt == DataType::F16) && data.isa.fp16; - }, - REGISTER_FP16_NEON(neon_fp16_gemm_matrix_add) - }, +static const std::vector<CpuGemmMatrixAdditionKernel::GemmMatrixAddKernel> available_kernels = { + {"neon_fp32_gemm_matrix_add", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32); }, + REGISTER_FP32_NEON(neon_fp32_gemm_matrix_add)}, + {"neon_fp16_gemm_matrix_add", + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.fp16; }, + REGISTER_FP16_NEON(neon_fp16_gemm_matrix_add)}, }; } // namespace @@ -71,7 +60,8 @@ void CpuGemmMatrixAdditionKernel::configure(const ITensorInfo *src, ITensorInfo ARM_COMPUTE_ERROR_THROW_ON(CpuGemmMatrixAdditionKernel::validate(src, dst, beta)); _beta = beta; - const auto uk = CpuGemmMatrixAdditionKernel::get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() }); + const auto uk = CpuGemmMatrixAdditionKernel::get_implementation( + DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()}); ARM_COMPUTE_ERROR_ON_NULLPTR(uk); _func = uk->ukernel; // Configure kernel window @@ -87,7 +77,7 @@ Status CpuGemmMatrixAdditionKernel::validate(const ITensorInfo *src, const ITens ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); - if(dst->total_size() > 0) + if (dst->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); @@ -105,7 +95,7 @@ void CpuGemmMatrixAdditionKernel::run_op(ITensorPack &tensors, const Window &win const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC); ITensor 
*dst = tensors.get_tensor(TensorType::ACL_DST); - if(_beta != 0.0f) + if (_beta != 0.0f) { (*_func)(src, dst, window, _beta); } @@ -116,7 +106,8 @@ const char *CpuGemmMatrixAdditionKernel::name() const return "CpuGemmMatrixAdditionKernel"; } -const std::vector<CpuGemmMatrixAdditionKernel::GemmMatrixAddKernel> &CpuGemmMatrixAdditionKernel::get_available_kernels() +const std::vector<CpuGemmMatrixAdditionKernel::GemmMatrixAddKernel> & +CpuGemmMatrixAdditionKernel::get_available_kernels() { return available_kernels; } diff --git a/src/cpu/kernels/CpuGemmMatrixAdditionKernel.h b/src/cpu/kernels/CpuGemmMatrixAdditionKernel.h index cbc5b53087..5e12f1dcbd 100644 --- a/src/cpu/kernels/CpuGemmMatrixAdditionKernel.h +++ b/src/cpu/kernels/CpuGemmMatrixAdditionKernel.h @@ -75,7 +75,7 @@ public: static Status validate(const ITensorInfo *src, const ITensorInfo *dst, float beta); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; static const std::vector<GemmMatrixAddKernel> &get_available_kernels(); @@ -89,8 +89,8 @@ private: * @param[in] beta Weight of matrix C */ /** Matrix addition function to use for the particular tensor types passed to configure() */ - GemmMatrixAddKernelPtr _func{ nullptr }; - float _beta{ 0.f }; + GemmMatrixAddKernelPtr _func{nullptr}; + float _beta{0.f}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.cpp b/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.cpp index 03b372efd4..beccd94844 100644 --- a/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.cpp +++ b/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.cpp @@ -26,10 +26,11 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/CPP/Validate.h" +#include "arm_compute/core/Validate.h" + #include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "src/cpu/kernels/gemm_matrix_mul/list.h" @@ -42,27 +43,20 @@ namespace kernels { namespace { -static const std::vector<CpuGemmMatrixMultiplyKernel::GemmMatrixMulKernel> available_kernels = -{ - { - "neon_fp32_gemm_matrix_mul", - [](const DataTypeISASelectorData & data) - { - return (data.dt == DataType::F32); - }, - REGISTER_FP32_NEON(neon_fp32_gemm_matrix_mul) - }, - { - "neon_fp16_gemm_matrix_mul", - [](const DataTypeISASelectorData & data) - { - return (data.dt == DataType::F16) && data.isa.fp16; - }, - REGISTER_FP16_NEON(neon_fp16_gemm_matrix_mul) - }, +static const std::vector<CpuGemmMatrixMultiplyKernel::GemmMatrixMulKernel> available_kernels = { + {"neon_fp32_gemm_matrix_mul", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32); }, + REGISTER_FP32_NEON(neon_fp32_gemm_matrix_mul)}, + {"neon_fp16_gemm_matrix_mul", + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.fp16; }, + REGISTER_FP16_NEON(neon_fp16_gemm_matrix_mul)}, }; -inline Status validate_arguments(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, float alpha, bool is_interleaved, const GEMMReshapeInfo &reshape_info) +inline Status validate_arguments(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const 
ITensorInfo *dst, + float alpha, + bool is_interleaved, + const GEMMReshapeInfo &reshape_info) { ARM_COMPUTE_UNUSED(alpha); @@ -70,11 +64,11 @@ inline Status validate_arguments(const ITensorInfo *lhs, const ITensorInfo *rhs, ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs, dst); - if(!is_interleaved) + if (!is_interleaved) { ARM_COMPUTE_RETURN_ERROR_ON(lhs->dimension(0) != rhs->dimension(1)); - if(dst->total_size() != 0) + if (dst->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON(rhs->dimension(0) != dst->dimension(0)); ARM_COMPUTE_RETURN_ERROR_ON(lhs->dimension(1) != dst->dimension(1)); @@ -90,28 +84,31 @@ inline Status validate_arguments(const ITensorInfo *lhs, const ITensorInfo *rhs, const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height(); /* Interleave */ - TensorShape tensor_shape0{ lhs->tensor_shape() }; + TensorShape tensor_shape0{lhs->tensor_shape()}; tensor_shape0.set(0, k); tensor_shape0.set(1, m); const TensorInfo tensor_info0 = lhs->clone()->set_tensor_shape(tensor_shape0); - const TensorInfo tensor_info_reshaped0 = lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_interleaved_shape(tensor_info0, mult_interleave4x4_height)); + const TensorInfo tensor_info_reshaped0 = lhs->clone()->set_tensor_shape( + misc::shape_calculator::compute_interleaved_shape(tensor_info0, mult_interleave4x4_height)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lhs, &tensor_info_reshaped0); - if(n != 0) /* Transpose */ + if (n != 0) /* Transpose */ { - TensorShape tensor_shape1{ rhs->tensor_shape() }; + TensorShape tensor_shape1{rhs->tensor_shape()}; tensor_shape1.set(0, n); tensor_shape1.set(1, k); - const TensorInfo tensor_info1 = rhs->clone()->set_tensor_shape(tensor_shape1); - const TensorInfo tensor_info_reshaped1 = rhs->clone()->set_tensor_shape(misc::shape_calculator::compute_transpose1xW_with_element_size_shape(tensor_info1, mult_transpose1xW_width)); + const TensorInfo tensor_info1 = rhs->clone()->set_tensor_shape(tensor_shape1); + const TensorInfo tensor_info_reshaped1 = + rhs->clone()->set_tensor_shape(misc::shape_calculator::compute_transpose1xW_with_element_size_shape( + tensor_info1, mult_transpose1xW_width)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(rhs, &tensor_info_reshaped1); } - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - if(n != 0) + if (n != 0) { ARM_COMPUTE_RETURN_ERROR_ON(dst->dimension(0) != static_cast<size_t>(n)); } @@ -125,12 +122,17 @@ inline Status validate_arguments(const ITensorInfo *lhs, const ITensorInfo *rhs, } // namespace -void CpuGemmMatrixMultiplyKernel::configure(const ITensorInfo *lhs, const ITensorInfo *rhs, ITensorInfo *dst, float alpha, bool is_interleaved, const GEMMReshapeInfo &reshape_info) +void CpuGemmMatrixMultiplyKernel::configure(const ITensorInfo *lhs, + const ITensorInfo *rhs, + ITensorInfo *dst, + float alpha, + bool is_interleaved, + const GEMMReshapeInfo &reshape_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst); // dst tensor auto inizialitation if not yet initialized - TensorShape tensor_shape{ lhs->tensor_shape() }; + TensorShape tensor_shape{lhs->tensor_shape()}; tensor_shape.set(0, is_interleaved ? reshape_info.n() : rhs->dimension(0)); tensor_shape.set(1, is_interleaved ? reshape_info.m() : lhs->dimension(1)); @@ -146,7 +148,7 @@ void CpuGemmMatrixMultiplyKernel::configure(const ITensorInfo *lhs, const ITenso // Check if the dst tensor is a vector. 
If so,the kernel runs the vector-matrix multiplication const bool is_dst_vector = (dst->dimension(1) == 1); - if(is_dst_vector) + if (is_dst_vector) { const unsigned int num_elems_processed_per_iteration_x = (lhs->data_type() == DataType::F32) ? 16 : 32; @@ -157,17 +159,23 @@ void CpuGemmMatrixMultiplyKernel::configure(const ITensorInfo *lhs, const ITenso constexpr unsigned int num_elems_processed_per_iteration_x = 8; constexpr unsigned int num_elems_processed_per_iteration_y = 4; - win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + win = + calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); } - const auto uk = CpuGemmMatrixMultiplyKernel::get_implementation(DataTypeISASelectorData{ lhs->data_type(), CPUInfo::get().get_isa() }); + const auto uk = CpuGemmMatrixMultiplyKernel::get_implementation( + DataTypeISASelectorData{lhs->data_type(), CPUInfo::get().get_isa()}); ARM_COMPUTE_ERROR_ON_NULLPTR(uk); _func = uk->ukernel; ICPPKernel::configure(win); } -Status CpuGemmMatrixMultiplyKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, float alpha, bool is_interleaved, +Status CpuGemmMatrixMultiplyKernel::validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *dst, + float alpha, + bool is_interleaved, const GEMMReshapeInfo &reshape_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(lhs, rhs, dst, alpha, is_interleaved, reshape_info)); @@ -195,7 +203,8 @@ const char *CpuGemmMatrixMultiplyKernel::name() const return "CpuGemmMatrixMultiplyKernel"; } -const std::vector<CpuGemmMatrixMultiplyKernel::GemmMatrixMulKernel> &CpuGemmMatrixMultiplyKernel::get_available_kernels() +const std::vector<CpuGemmMatrixMultiplyKernel::GemmMatrixMulKernel> & +CpuGemmMatrixMultiplyKernel::get_available_kernels() { return available_kernels; } diff --git a/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h b/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h index a7dfec87bd..765fcb8275 100644 --- a/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h +++ b/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h @@ -42,7 +42,8 @@ namespace kernels class CpuGemmMatrixMultiplyKernel : public ICpuKernel<CpuGemmMatrixMultiplyKernel> { private: - using GemmMatrixMulKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, ITensor *, const Window &, const ThreadInfo &, float, const bool)>::type; + using GemmMatrixMulKernelPtr = std::add_pointer<void( + const ITensor *, const ITensor *, ITensor *, const Window &, const ThreadInfo &, float, const bool)>::type; public: struct GemmMatrixMulKernel @@ -67,17 +68,27 @@ public: * @param[in] is_interleaved (Optional) True if lhs and rhs have been reshaped respectively using @ref CpuGemmInterleave4x4Kernel and @ref CpuGemmTranspose1xWKernel * @param[in] reshape_info (Optional) GEMM reshape info. 
If is_interleaved_transposed = true, this object must contain the information to understand how @p lhs and @p rhs have been reshaped */ - void configure(const ITensorInfo *lhs, const ITensorInfo *rhs, ITensorInfo *dst, float alpha, bool is_interleaved, const GEMMReshapeInfo &reshape_info = GEMMReshapeInfo()); + void configure(const ITensorInfo *lhs, + const ITensorInfo *rhs, + ITensorInfo *dst, + float alpha, + bool is_interleaved, + const GEMMReshapeInfo &reshape_info = GEMMReshapeInfo()); /** Static function to check if given info will lead to a valid configuration of @ref CpuGemmMatrixMultiplyKernel * * Similar to @ref CpuGemmMatrixMultiplyKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, float alpha, bool is_interleaved, const GEMMReshapeInfo &reshape_info); + static Status validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *dst, + float alpha, + bool is_interleaved, + const GEMMReshapeInfo &reshape_info); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; static const std::vector<GemmMatrixMulKernel> &get_available_kernels(); @@ -94,8 +105,8 @@ private: */ /** Matrix multiply function to use for the particular tensor types passed to configure() */ - GemmMatrixMulKernelPtr _func{ nullptr }; - float _alpha{ 1.f }; + GemmMatrixMulKernelPtr _func{nullptr}; + float _alpha{1.f}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuGemmTranspose1xWKernel.cpp b/src/cpu/kernels/CpuGemmTranspose1xWKernel.cpp index 62d5d5f5e9..c47746bc4b 100644 --- a/src/cpu/kernels/CpuGemmTranspose1xWKernel.cpp +++ b/src/cpu/kernels/CpuGemmTranspose1xWKernel.cpp @@ -24,9 +24,10 @@ #include "src/cpu/kernels/CpuGemmTranspose1xWKernel.h" #include "arm_compute/core/ITensor.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -63,9 +64,10 @@ Status CpuGemmTranspose1xWKernel::validate(const ITensorInfo *src, const ITensor ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions. 
- if(dst->total_size() != 0) + if (dst->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), compute_transpose1xW_with_element_size_shape(*src)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), + compute_transpose1xW_with_element_size_shape(*src)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); } @@ -107,25 +109,28 @@ void CpuGemmTranspose1xWKernel::run_op(ITensorPack &tensors, const Window &windo const size_t out_stride = dst->info()->strides_in_bytes()[1]; const size_t vector_size = 16 / element_size; - execute_window_loop(window, [&](const Coordinates & id) - { - const uint8_t *in_ptr = in.ptr(); - uint8_t *const out_ptr = out.ptr() + (id.y() * vector_size) * element_size + (id.x() / vector_size) * out_stride; - - for(size_t k = 0; k < vector_size; ++k) + execute_window_loop( + window, + [&](const Coordinates &id) { - // If the src width is not multiple of W, we fill the reference with 0s - if((id.x() + k) >= in_width) - { - std::memset(out_ptr + k * element_size, 0, element_size); - } - else + const uint8_t *in_ptr = in.ptr(); + uint8_t *const out_ptr = + out.ptr() + (id.y() * vector_size) * element_size + (id.x() / vector_size) * out_stride; + + for (size_t k = 0; k < vector_size; ++k) { - std::memcpy(out_ptr + k * element_size, in_ptr + k * element_size, element_size); + // If the src width is not multiple of W, we fill the reference with 0s + if ((id.x() + k) >= in_width) + { + std::memset(out_ptr + k * element_size, 0, element_size); + } + else + { + std::memcpy(out_ptr + k * element_size, in_ptr + k * element_size, element_size); + } } - } - }, - in, out); + }, + in, out); } const char *CpuGemmTranspose1xWKernel::name() const diff --git a/src/cpu/kernels/CpuGemmTranspose1xWKernel.h b/src/cpu/kernels/CpuGemmTranspose1xWKernel.h index 0ca92641b7..4b834b2cc6 100644 --- a/src/cpu/kernels/CpuGemmTranspose1xWKernel.h +++ b/src/cpu/kernels/CpuGemmTranspose1xWKernel.h @@ -88,7 +88,7 @@ public: static Status validate(const ITensorInfo *src, const ITensorInfo *dst); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; }; } // namespace kernels diff --git a/src/cpu/kernels/CpuIm2ColKernel.cpp b/src/cpu/kernels/CpuIm2ColKernel.cpp index 9ac291549b..55ac7c5192 100644 --- a/src/cpu/kernels/CpuIm2ColKernel.cpp +++ b/src/cpu/kernels/CpuIm2ColKernel.cpp @@ -29,13 +29,13 @@ #include "arm_compute/core/Size2D.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/Validate.h" + #include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" - #include <arm_neon.h> #include <cstddef> #include <cstdint> @@ -51,26 +51,34 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, - bool has_bias, const Size2D &dilation, unsigned int num_groups, unsigned int input_pad_right) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *output, + const Size2D &kernel_dims, + const PadStrideInfo &conv_info, + 
bool has_bias, + const Size2D &dilation, + unsigned int num_groups, + unsigned int input_pad_right) { ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::BFLOAT16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::BFLOAT16, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized(input->data_type()) && has_bias); ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1)); ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups > 1, "Number of groups greater than one are not supported on Neon"); // Since there's no implicit padding added, check the total input spatial dimensions (with conv paddings) are big enough for the kernel dimensions - const unsigned int width_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH); - const unsigned int height_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); - const unsigned total_width = input->dimension(width_idx) + conv_info.pad_left() + conv_info.pad_right(); + const unsigned int width_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH); + const unsigned int height_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); + const unsigned total_width = input->dimension(width_idx) + conv_info.pad_left() + conv_info.pad_right(); const unsigned total_height = input->dimension(height_idx) + conv_info.pad_top() + conv_info.pad_bottom(); ARM_COMPUTE_RETURN_ERROR_ON((total_width < kernel_dims.width) || (total_height < kernel_dims.height)); - if(output->total_size() > 0) + if (output->total_size() > 0) { - TensorInfo expected_output = output->clone()->set_tensor_shape(compute_im2col_conv_shape(input, kernel_dims, conv_info, has_bias, dilation, false, num_groups, input_pad_right)); + TensorInfo expected_output = output->clone()->set_tensor_shape(compute_im2col_conv_shape( + input, kernel_dims, conv_info, has_bias, dilation, false, num_groups, input_pad_right)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); @@ -106,14 +114,14 @@ inline void linearize_volume_nchw(const uint8_t *const in_ptr, // This for loop linearize a volume with 3 slices. 
This allows: // 1) to reduce the iterations of the outer for loop "d" // 2) to have an optimized im2col for the first convolution layer where usually we have 3 IFMs - for(; d <= (kernel_depth - 3); d += 3) + for (; d <= (kernel_depth - 3); d += 3) { - for(int y = top_left_y; y < y_e; y += dilation_y) + for (int y = top_left_y; y < y_e; y += dilation_y) { - if((y < 0 || y >= input_h) && has_pads) + if ((y < 0 || y >= input_h) && has_pads) { // All the values will be the offset (will be zeros when not quantized) - for(int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr) + for (int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr) { *(out_ptr + 0 * kernel_size2) = pad_value; *(out_ptr + 1 * kernel_size2) = pad_value; @@ -122,9 +130,9 @@ inline void linearize_volume_nchw(const uint8_t *const in_ptr, } else { - for(int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr) + for (int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr) { - if((x < 0 || x >= input_w) && has_pads) + if ((x < 0 || x >= input_w) && has_pads) { *(out_ptr + 0 * kernel_size2) = pad_value; *(out_ptr + 1 * kernel_size2) = pad_value; @@ -132,9 +140,12 @@ inline void linearize_volume_nchw(const uint8_t *const in_ptr, } else { - *(out_ptr + 0 * kernel_size2) = *(reinterpret_cast<const T *>(in_ptr + ((d + 0) * input_stride_z + y * input_stride_y + x * input_stride_x))); - *(out_ptr + 1 * kernel_size2) = *(reinterpret_cast<const T *>(in_ptr + ((d + 1) * input_stride_z + y * input_stride_y + x * input_stride_x))); - *(out_ptr + 2 * kernel_size2) = *(reinterpret_cast<const T *>(in_ptr + ((d + 2) * input_stride_z + y * input_stride_y + x * input_stride_x))); + *(out_ptr + 0 * kernel_size2) = *(reinterpret_cast<const T *>( + in_ptr + ((d + 0) * input_stride_z + y * input_stride_y + x * input_stride_x))); + *(out_ptr + 1 * kernel_size2) = *(reinterpret_cast<const T *>( + in_ptr + ((d + 1) * input_stride_z + y * input_stride_y + x * input_stride_x))); + *(out_ptr + 2 * kernel_size2) = *(reinterpret_cast<const T *>( + in_ptr + ((d + 2) * input_stride_z + y * input_stride_y + x * input_stride_x))); } } } @@ -143,11 +154,11 @@ inline void linearize_volume_nchw(const uint8_t *const in_ptr, } // Left over - for(; d < kernel_depth; d++) + for (; d < kernel_depth; d++) { - for(int y = top_left_y; y < y_e; y += dilation_y) + for (int y = top_left_y; y < y_e; y += dilation_y) { - if((y < 0 || y >= input_h) && has_pads) + if ((y < 0 || y >= input_h) && has_pads) { // All the values will be the offset (will be zeros when not quantized) memset(static_cast<void *>(out_ptr), pad_value, kernel_width * sizeof(T)); @@ -155,15 +166,16 @@ inline void linearize_volume_nchw(const uint8_t *const in_ptr, } else { - for(int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr) + for (int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr) { - if((x < 0 || x >= input_w) && has_pads) + if ((x < 0 || x >= input_w) && has_pads) { *out_ptr = pad_value; } else { - *out_ptr = *(reinterpret_cast<const T *>(in_ptr + (d * input_stride_z + y * input_stride_y + x * input_stride_x))); + *out_ptr = *(reinterpret_cast<const T *>( + in_ptr + (d * input_stride_z + y * input_stride_y + x * input_stride_x))); } } } @@ -171,7 +183,7 @@ inline void linearize_volume_nchw(const uint8_t *const in_ptr, } // Append 1 if the convolution layer has biases - if(has_bias) + if (has_bias) { *out_ptr = static_cast<T>(1); } @@ -198,36 +210,39 @@ inline void linearize_volume_nhwc(const uint8_t *const in_ptr, const int end_y = start_y + kernel_height * dilation_y; const int pad_quant 
= kernel_width * input_c; const int element_size = static_cast<int>(sizeof(T)); - if((start_y >= 0) && (end_y < input_h) && (start_x >= 0) && (end_x < input_w) && (dilation_x == 1) && (input_stride_y == input_c * element_size)) + if ((start_y >= 0) && (end_y < input_h) && (start_x >= 0) && (end_x < input_w) && (dilation_x == 1) && + (input_stride_y == input_c * element_size)) { - for(int y = start_y; y < end_y; y += dilation_y) + for (int y = start_y; y < end_y; y += dilation_y) { //optimized for no dilation and no boundary pixels - memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + start_x * input_stride_y)), input_c * kernel_width * element_size); + memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + start_x * input_stride_y)), + input_c * kernel_width * element_size); out_ptr += input_c * kernel_width; } } else { - for(int y = start_y; y < end_y; y += dilation_y) + for (int y = start_y; y < end_y; y += dilation_y) { - if(y < 0 || y >= input_h) + if (y < 0 || y >= input_h) { memset(static_cast<void *>(out_ptr), pad_value, pad_quant * element_size); out_ptr += pad_quant; } - else if(dilation_x > 1 || start_x < 0 || end_x >= input_w || input_stride_y != input_c * element_size) + else if (dilation_x > 1 || start_x < 0 || end_x >= input_w || input_stride_y != input_c * element_size) { - for(int x = start_x; x < end_x; x += dilation_x) + for (int x = start_x; x < end_x; x += dilation_x) { - if(x < 0 || x >= input_w) + if (x < 0 || x >= input_w) { memset(static_cast<void *>(out_ptr), pad_value, input_c * element_size); out_ptr += input_c; } else { - memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + x * input_stride_y)), input_c * element_size); + memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + x * input_stride_y)), + input_c * element_size); out_ptr += input_c; } } @@ -235,13 +250,14 @@ inline void linearize_volume_nhwc(const uint8_t *const in_ptr, else { //optimized for no dilation and no boundary pixels - memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + start_x * input_stride_y)), input_c * kernel_width * element_size); + memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + start_x * input_stride_y)), + input_c * kernel_width * element_size); out_ptr += input_c * kernel_width; } } } // Append 1 if the convolution layer has biases - if(has_bias) + if (has_bias) { *out_ptr = static_cast<T>(1); } @@ -271,12 +287,13 @@ inline void linearize_volume_nhwc(const uint8_t *const in_ptr, const int element_size = static_cast<int>(sizeof(T)); const int channel_chunk_size = input_c * element_size; - if((start_y >= 0) && (end_y < input_h) && (start_x >= 0) && (end_x < input_w) && (dilation_x == 1) && (input_stride_y == channel_chunk_size)) + if ((start_y >= 0) && (end_y < input_h) && (start_x >= 0) && (end_x < input_w) && (dilation_x == 1) && + (input_stride_y == channel_chunk_size)) { - for(int y = start_y; y < end_y; y += dilation_y) + for (int y = start_y; y < end_y; y += dilation_y) { const uint8_t *offset_ptr = in_ptr + (y * input_stride_z + start_x * input_stride_y); - for(int e = 0; e < kernel_width; e++) + for (int e = 0; e < kernel_width; e++) { memcpy(out_ptr, reinterpret_cast<const T *>(offset_ptr + e * channel_chunk_size), channel_chunk_size); out_ptr += input_c + pad_right; @@ -285,25 +302,26 @@ inline void linearize_volume_nhwc(const uint8_t *const in_ptr, } else { - for(int y = start_y; y < end_y; y += dilation_y) + for (int y = start_y; y < 
end_y; y += dilation_y) { - if(y < 0 || y >= input_h) + if (y < 0 || y >= input_h) { memset(static_cast<void *>(out_ptr), pad_value, pad_quant * element_size); out_ptr += pad_quant; } - else if(dilation_x > 1 || start_x < 0 || end_x >= input_w || input_stride_y != channel_chunk_size) + else if (dilation_x > 1 || start_x < 0 || end_x >= input_w || input_stride_y != channel_chunk_size) { - for(int x = start_x; x < end_x; x += dilation_x) + for (int x = start_x; x < end_x; x += dilation_x) { - if(x < 0 || x >= input_w) + if (x < 0 || x >= input_w) { memset(static_cast<void *>(out_ptr), pad_value, (input_c + pad_right) * element_size); out_ptr += input_c + pad_right; } else { - memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + x * input_stride_y)), channel_chunk_size); + memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + x * input_stride_y)), + channel_chunk_size); out_ptr += input_c + pad_right; } } @@ -311,16 +329,17 @@ inline void linearize_volume_nhwc(const uint8_t *const in_ptr, else { const uint8_t *offset_ptr = in_ptr + (y * input_stride_z + start_x * input_stride_y); - for(int e = 0; e < kernel_width; e++) + for (int e = 0; e < kernel_width; e++) { - memcpy(out_ptr, reinterpret_cast<const T *>(offset_ptr + e * channel_chunk_size), channel_chunk_size); + memcpy(out_ptr, reinterpret_cast<const T *>(offset_ptr + e * channel_chunk_size), + channel_chunk_size); out_ptr += input_c + pad_right; } } } } // Append 1 if the convolution layer has biases - if(has_bias) + if (has_bias) { *out_ptr = static_cast<T>(1); } @@ -348,7 +367,8 @@ void CpuIm2ColKernel::run_im2col(const ITensor *src, ITensor *dst, const Window const int pad_top = _conv_info.pad_top(); const int stride_x = _conv_info.stride().first; const int stride_y = _conv_info.stride().second; - const int pad_value = is_data_type_quantized(src->info()->data_type()) ? src->info()->quantization_info().uniform().offset : 0; + const int pad_value = + is_data_type_quantized(src->info()->data_type()) ? 
src->info()->quantization_info().uniform().offset : 0; Window window_in_out(window); // The first three dimensions of the input and output are increased by the inner loops @@ -361,84 +381,57 @@ void CpuIm2ColKernel::run_im2col(const ITensor *src, ITensor *dst, const Window Iterator out(dst, window_in_out); execute_window_loop( - window, [&](const Coordinates & id) - { - const int start_w = id[width_idx] * stride_x - pad_left; - const int start_h = id[height_idx] * stride_y - pad_top; + window, + [&](const Coordinates &id) + { + const int start_w = id[width_idx] * stride_x - pad_left; + const int start_h = id[height_idx] * stride_y - pad_top; - // Get pointers - const uint8_t *const input_ptr = in.ptr(); - auto output_ptr = reinterpret_cast<T *>(out.ptr() + (id[width_idx] + id[height_idx] * _convolved_dims.first) * dst->info()->strides_in_bytes().y()); + // Get pointers + const uint8_t *const input_ptr = in.ptr(); + auto output_ptr = + reinterpret_cast<T *>(out.ptr() + (id[width_idx] + id[height_idx] * _convolved_dims.first) * + dst->info()->strides_in_bytes().y()); - // Linearize volume - if(is_nchw) - { - linearize_volume_nchw<T, has_pads>(input_ptr, - output_ptr, - _has_bias, - start_w, - start_h, - _kernel_width, - _kernel_height, - input_c, - input_w, - input_h, - input_stride_x, - input_stride_y, - input_stride_z, - pad_value, - _dilation.x(), - _dilation.y()); - } - else - { - if(_input_pad_right > 0) + // Linearize volume + if (is_nchw) { - linearize_volume_nhwc<T, has_pads>(input_ptr, - output_ptr, - _has_bias, - start_w, - start_h, - _kernel_width, - _kernel_height, - input_w, - input_h, - input_c, - input_stride_y, - input_stride_z, - pad_value, - _dilation.x(), - _dilation.y(), - _input_pad_right); + linearize_volume_nchw<T, has_pads>( + input_ptr, output_ptr, _has_bias, start_w, start_h, _kernel_width, _kernel_height, input_c, input_w, + input_h, input_stride_x, input_stride_y, input_stride_z, pad_value, _dilation.x(), _dilation.y()); } else { - linearize_volume_nhwc<T, has_pads>(input_ptr, - output_ptr, - _has_bias, - start_w, - start_h, - _kernel_width, - _kernel_height, - input_w, - input_h, - input_c, - input_stride_y, - input_stride_z, - pad_value, - _dilation.x(), - _dilation.y()); + if (_input_pad_right > 0) + { + linearize_volume_nhwc<T, has_pads>(input_ptr, output_ptr, _has_bias, start_w, start_h, + _kernel_width, _kernel_height, input_w, input_h, input_c, + input_stride_y, input_stride_z, pad_value, _dilation.x(), + _dilation.y(), _input_pad_right); + } + else + { + linearize_volume_nhwc<T, has_pads>( + input_ptr, output_ptr, _has_bias, start_w, start_h, _kernel_width, _kernel_height, input_w, + input_h, input_c, input_stride_y, input_stride_z, pad_value, _dilation.x(), _dilation.y()); + } } - } - }, - in, out); + }, + in, out); } -void CpuIm2ColKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, - bool has_bias, const Size2D &dilation, unsigned int num_groups, unsigned int input_pad_right) +void CpuIm2ColKernel::configure(const ITensorInfo *src, + ITensorInfo *dst, + const Size2D &kernel_dims, + const PadStrideInfo &conv_info, + bool has_bias, + const Size2D &dilation, + unsigned int num_groups, + unsigned int input_pad_right) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups, input_pad_right)); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, 
num_groups, input_pad_right)); ARM_COMPUTE_UNUSED(num_groups); _data_layout = src->data_layout(); @@ -451,31 +444,34 @@ void CpuIm2ColKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const _kernel_height = kernel_dims.height; _input_pad_right = input_pad_right; _dilation = dilation; - _convolved_dims = scaled_dimensions(src->dimension(width_idx), dst->dimension(height_idx), - _kernel_width, _kernel_height, - _conv_info, _dilation); + _convolved_dims = scaled_dimensions(src->dimension(width_idx), dst->dimension(height_idx), _kernel_width, + _kernel_height, _conv_info, _dilation); _has_bias = has_bias; - if(_data_layout == DataLayout::NCHW) + if (_data_layout == DataLayout::NCHW) { - switch(src->data_type()) + switch (src->data_type()) { case DataType::F32: - _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<float, false, true> : &CpuIm2ColKernel::run_im2col<float, true, true>; + _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<float, false, true> + : &CpuIm2ColKernel::run_im2col<float, true, true>; break; #if defined(ARM_COMPUTE_ENABLE_BF16) case DataType::BFLOAT16: - _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<bfloat16, false, true> : &CpuIm2ColKernel::run_im2col<bfloat16, true, true>; + _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<bfloat16, false, true> + : &CpuIm2ColKernel::run_im2col<bfloat16, true, true>; break; #endif /* defined(ARM_COMPUTE_ENABLE_BF16) */ #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: - _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<float16_t, false, true> : &CpuIm2ColKernel::run_im2col<float16_t, true, true>; + _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<float16_t, false, true> + : &CpuIm2ColKernel::run_im2col<float16_t, true, true>; break; #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ case DataType::QASYMM8_SIGNED: case DataType::QASYMM8: - _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<qasymm8_t, false, true> : &CpuIm2ColKernel::run_im2col<qasymm8_t, true, true>; + _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<qasymm8_t, false, true> + : &CpuIm2ColKernel::run_im2col<qasymm8_t, true, true>; break; default: ARM_COMPUTE_ERROR("Data type not supported"); @@ -484,26 +480,31 @@ void CpuIm2ColKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const } else { - switch(src->data_type()) + switch (src->data_type()) { case DataType::F32: - _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<float, false, false> : &CpuIm2ColKernel::run_im2col<float, true, false>; + _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<float, false, false> + : &CpuIm2ColKernel::run_im2col<float, true, false>; break; #if defined(ARM_COMPUTE_ENABLE_BF16) case DataType::BFLOAT16: - _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<bfloat16, false, false> : &CpuIm2ColKernel::run_im2col<bfloat16, true, false>; + _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<bfloat16, false, false> + : &CpuIm2ColKernel::run_im2col<bfloat16, true, false>; break; #endif /* defined(ARM_COMPUTE_ENABLE_BF16) */ #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: - _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<float16_t, false, false> : &CpuIm2ColKernel::run_im2col<float16_t, true, false>; + _func = (!conv_info.has_padding()) ? 
&CpuIm2ColKernel::run_im2col<float16_t, false, false> + : &CpuIm2ColKernel::run_im2col<float16_t, true, false>; break; #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ case DataType::QASYMM8: - _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<uint8_t, false, false> : &CpuIm2ColKernel::run_im2col<qasymm8_t, true, false>; + _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<uint8_t, false, false> + : &CpuIm2ColKernel::run_im2col<qasymm8_t, true, false>; break; case DataType::QASYMM8_SIGNED: - _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<int8_t, false, false> : &CpuIm2ColKernel::run_im2col<qasymm8_t, true, false>; + _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<int8_t, false, false> + : &CpuIm2ColKernel::run_im2col<qasymm8_t, true, false>; break; default: ARM_COMPUTE_ERROR("Data type not supported"); @@ -512,11 +513,13 @@ void CpuIm2ColKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const } // Output tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_im2col_conv_shape(src, kernel_dims, conv_info, has_bias, dilation, false, num_groups, _input_pad_right))); + auto_init_if_empty( + *dst, src->clone()->set_tensor_shape(compute_im2col_conv_shape(src, kernel_dims, conv_info, has_bias, dilation, + false, num_groups, _input_pad_right))); - std::pair<unsigned int, unsigned int> convolved_dims = scaled_dimensions(src->dimension(width_idx), src->dimension(height_idx), - kernel_dims.width, kernel_dims.height, - conv_info, dilation); + std::pair<unsigned int, unsigned int> convolved_dims = + scaled_dimensions(src->dimension(width_idx), src->dimension(height_idx), kernel_dims.width, kernel_dims.height, + conv_info, dilation); Window win = calculate_max_window(*src, Steps()); win.set(width_idx, Window::Dimension(0, convolved_dims.first, 1)); @@ -526,10 +529,17 @@ void CpuIm2ColKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const ICpuKernel::configure(win); } -Status CpuIm2ColKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, - bool has_bias, const Size2D &dilation, unsigned int num_groups, unsigned int input_pad_right) +Status CpuIm2ColKernel::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const Size2D &kernel_dims, + const PadStrideInfo &conv_info, + bool has_bias, + const Size2D &dilation, + unsigned int num_groups, + unsigned int input_pad_right) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups, input_pad_right)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups, input_pad_right)); return Status{}; } diff --git a/src/cpu/kernels/CpuIm2ColKernel.h b/src/cpu/kernels/CpuIm2ColKernel.h index d133f8dc2d..2cb26179ce 100644 --- a/src/cpu/kernels/CpuIm2ColKernel.h +++ b/src/cpu/kernels/CpuIm2ColKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_IM2COL_KERNEL_H #include "arm_compute/core/Size2D.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" @@ -78,16 +79,28 @@ public: * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. 
num_groups != 1 is not supported * @param[in] input_pad_right (Optional) When fast-math is selected, per element padding for the im2col matrix may be necessary */ - void configure(const ITensorInfo *src, ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, - bool has_bias, const Size2D &dilation = Size2D(1U, 1U), unsigned int num_groups = 1, unsigned int input_pad_right = 0); + void configure(const ITensorInfo *src, + ITensorInfo *dst, + const Size2D &kernel_dims, + const PadStrideInfo &conv_info, + bool has_bias, + const Size2D &dilation = Size2D(1U, 1U), + unsigned int num_groups = 1, + unsigned int input_pad_right = 0); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuIm2ColKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, - bool has_bias, const Size2D &dilation = Size2D(1U, 1U), unsigned int num_groups = 1, unsigned int input_pad_right = 0); + static Status validate(const ITensorInfo *src, + const ITensorInfo *dst, + const Size2D &kernel_dims, + const PadStrideInfo &conv_info, + bool has_bias, + const Size2D &dilation = Size2D(1U, 1U), + unsigned int num_groups = 1, + unsigned int input_pad_right = 0); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; @@ -117,15 +130,15 @@ private: */ using Im2ColFunctionPtr = void (CpuIm2ColKernel::*)(const ITensor *src, ITensor *dst, const Window &window); - Im2ColFunctionPtr _func{ nullptr }; + Im2ColFunctionPtr _func{nullptr}; std::pair<unsigned int, unsigned int> _convolved_dims{}; PadStrideInfo _conv_info{}; - unsigned int _kernel_width{ 0 }; - unsigned int _kernel_height{ 0 }; - unsigned int _input_pad_right{ 0 }; - bool _has_bias{ false }; - Size2D _dilation{ 1U, 1U }; - DataLayout _data_layout{ DataLayout::UNKNOWN }; + unsigned int _kernel_width{0}; + unsigned int _kernel_height{0}; + unsigned int _input_pad_right{0}; + bool _has_bias{false}; + Size2D _dilation{1U, 1U}; + DataLayout _data_layout{DataLayout::UNKNOWN}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuKernelSelectionTypes.h b/src/cpu/kernels/CpuKernelSelectionTypes.h index 39adc9af7c..b7daa4d583 100644 --- a/src/cpu/kernels/CpuKernelSelectionTypes.h +++ b/src/cpu/kernels/CpuKernelSelectionTypes.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_KERNEL_SELECTION_TYPES_H #include "arm_compute/core/Types.h" + #include "src/common/cpuinfo/CpuIsaInfo.h" namespace arm_compute @@ -78,10 +79,10 @@ struct DepthwiseConv2dNativeDataTypeISASelectorData struct ActivationDataTypeISASelectorData { - DataType dt; - const CPUModel &cpumodel; - const cpuinfo::CpuIsaInfo &isa; - const ActivationFunction f; + DataType dt; + const CPUModel &cpumodel; + const cpuinfo::CpuIsaInfo &isa; + const ActivationFunction f; }; struct CpuAddKernelDataTypeISASelectorData @@ -99,15 +100,19 @@ struct ScaleKernelDataTypeISASelectorData }; // Selector pointer types -using DataTypeISASelectorPtr = std::add_pointer<bool(const DataTypeISASelectorData &data)>::type; -using DataTypeDataLayoutSelectorPtr = std::add_pointer<bool(const DataTypeDataLayoutISASelectorData &data)>::type; -using PoolDataTypeISASelectorPtr = std::add_pointer<bool(const PoolDataTypeISASelectorData &data)>::type; -using ElementwiseDataTypeISASelectorPtr = std::add_pointer<bool(const ElementwiseDataTypeISASelectorData &data)>::type; -using 
DepthwiseConv2dNativeDataTypeISASelectorPtr = std::add_pointer<bool(const DepthwiseConv2dNativeDataTypeISASelectorData &data)>::type; -using CastDataTypeISASelectorDataPtr = std::add_pointer<bool(const CastDataTypeISASelectorData &data)>::type; -using ActivationDataTypeISASelectorDataPtr = std::add_pointer<bool(const ActivationDataTypeISASelectorData &data)>::type; -using CpuAddKernelDataTypeISASelectorDataPtr = std::add_pointer<bool(const CpuAddKernelDataTypeISASelectorData &data)>::type; -using ScaleKernelDataTypeISASelectorDataPtr = std::add_pointer<bool(const ScaleKernelDataTypeISASelectorData &data)>::type; +using DataTypeISASelectorPtr = std::add_pointer<bool(const DataTypeISASelectorData &data)>::type; +using DataTypeDataLayoutSelectorPtr = std::add_pointer<bool(const DataTypeDataLayoutISASelectorData &data)>::type; +using PoolDataTypeISASelectorPtr = std::add_pointer<bool(const PoolDataTypeISASelectorData &data)>::type; +using ElementwiseDataTypeISASelectorPtr = std::add_pointer<bool(const ElementwiseDataTypeISASelectorData &data)>::type; +using DepthwiseConv2dNativeDataTypeISASelectorPtr = + std::add_pointer<bool(const DepthwiseConv2dNativeDataTypeISASelectorData &data)>::type; +using CastDataTypeISASelectorDataPtr = std::add_pointer<bool(const CastDataTypeISASelectorData &data)>::type; +using ActivationDataTypeISASelectorDataPtr = + std::add_pointer<bool(const ActivationDataTypeISASelectorData &data)>::type; +using CpuAddKernelDataTypeISASelectorDataPtr = + std::add_pointer<bool(const CpuAddKernelDataTypeISASelectorData &data)>::type; +using ScaleKernelDataTypeISASelectorDataPtr = + std::add_pointer<bool(const ScaleKernelDataTypeISASelectorData &data)>::type; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuMaxUnpoolingLayerKernel.cpp b/src/cpu/kernels/CpuMaxUnpoolingLayerKernel.cpp index 7d077c75bf..bcaa76b99b 100644 --- a/src/cpu/kernels/CpuMaxUnpoolingLayerKernel.cpp +++ b/src/cpu/kernels/CpuMaxUnpoolingLayerKernel.cpp @@ -24,11 +24,12 @@ #include "src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h" #include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/CPP/Validate.h" + #include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "src/cpu/kernels/maxunpool/list.h" @@ -43,50 +44,43 @@ using namespace misc::shape_calculator; namespace { -static const std::vector<CpuMaxUnpoolingLayerKernel::MaxUnpoolingKernel> available_kernels = -{ - { - "neon_fp32_maxunpooling", - [](const DataTypeISASelectorData & data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON(neon_fp32_maxunpooling) - }, - { - "neon_fp16_maxunpooling", - [](const DataTypeISASelectorData & data) { return data.dt == DataType::F16 && data.isa.fp16; }, - REGISTER_FP16_NEON(neon_fp16_maxunpooling) - }, - { - "neon_qu8_maxunpooling", - [](const DataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8; }, - REGISTER_QASYMM8_NEON(neon_qs8_maxunpooling) - }, - { - "neon_qs8_maxunpooling", - [](const DataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; }, - REGISTER_QASYMM8_SIGNED_NEON(neon_qu8_maxunpooling) - }, +static const std::vector<CpuMaxUnpoolingLayerKernel::MaxUnpoolingKernel> available_kernels = { + {"neon_fp32_maxunpooling", [](const 
DataTypeISASelectorData &data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(neon_fp32_maxunpooling)}, + {"neon_fp16_maxunpooling", + [](const DataTypeISASelectorData &data) { return data.dt == DataType::F16 && data.isa.fp16; }, + REGISTER_FP16_NEON(neon_fp16_maxunpooling)}, + {"neon_qu8_maxunpooling", [](const DataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8; }, + REGISTER_QASYMM8_NEON(neon_qs8_maxunpooling)}, + {"neon_qs8_maxunpooling", [](const DataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8_SIGNED; }, + REGISTER_QASYMM8_SIGNED_NEON(neon_qu8_maxunpooling)}, }; -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *indices, const ITensorInfo *dst, const PoolingLayerInfo &pool_info) +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *indices, + const ITensorInfo *dst, + const PoolingLayerInfo &pool_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, indices, dst); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, indices); - int pool_stride_x = 0; - int pool_stride_y = 0; - PoolingType pool_type = pool_info.pool_type; - const PadStrideInfo pad_stride_info = pool_info.pad_stride_info; + int pool_stride_x = 0; + int pool_stride_y = 0; + PoolingType pool_type = pool_info.pool_type; + const PadStrideInfo pad_stride_info = pool_info.pad_stride_info; std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride(); - const int pool_size_x = pool_info.pool_size.width; - const int pool_size_y = pool_info.pool_size.height; + const int pool_size_x = pool_info.pool_size.width; + const int pool_size_y = pool_info.pool_size.height; const Size2D pool_size(pool_size_x, pool_size_y); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX, + "Pooling indices only supported for MAX pooling method"); ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_size != Size2D(2, 2)), "Pooling indices only supported for pool size 2x2"); - if(dst->total_size() != 0) + if (dst->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst); @@ -96,13 +90,17 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *indices, co } } // namespace -void CpuMaxUnpoolingLayerKernel::configure(const ITensorInfo *src, const ITensorInfo *indices, ITensorInfo *dst, const PoolingLayerInfo &pool_info) +void CpuMaxUnpoolingLayerKernel::configure(const ITensorInfo *src, + const ITensorInfo *indices, + ITensorInfo *dst, + const PoolingLayerInfo &pool_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst, indices); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, indices, dst, pool_info)); ARM_COMPUTE_UNUSED(indices); - const auto uk = CpuMaxUnpoolingLayerKernel::get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() }); + const auto uk = CpuMaxUnpoolingLayerKernel::get_implementation( + DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()}); ARM_COMPUTE_ERROR_ON_NULLPTR(uk); _run_method = 
uk->ukernel; @@ -113,7 +111,10 @@ void CpuMaxUnpoolingLayerKernel::configure(const ITensorInfo *src, const ITensor ICpuKernel::configure(window); } -Status CpuMaxUnpoolingLayerKernel::validate(const ITensorInfo *src, const ITensorInfo *indices, const ITensorInfo *dst, const PoolingLayerInfo &pool_info) +Status CpuMaxUnpoolingLayerKernel::validate(const ITensorInfo *src, + const ITensorInfo *indices, + const ITensorInfo *dst, + const PoolingLayerInfo &pool_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, indices, dst); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, indices, dst, pool_info)); diff --git a/src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h b/src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h index d0c13471c8..5a641a2bea 100644 --- a/src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h +++ b/src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h @@ -37,7 +37,8 @@ namespace kernels class CpuMaxUnpoolingLayerKernel : public ICpuKernel<CpuMaxUnpoolingLayerKernel> { private: - using MaxUnpoolingUKernelPtr = std::add_pointer<void(const ITensor *input, const ITensor *indices, ITensor *output, const Window &window)>::type; + using MaxUnpoolingUKernelPtr = std::add_pointer<void( + const ITensor *input, const ITensor *indices, ITensor *output, const Window &window)>::type; public: /** Default constructor */ @@ -56,7 +57,8 @@ public: * @param[out] dst Destination tensor. Data types supported: Same as @p src * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. */ - void configure(const ITensorInfo *src, const ITensorInfo *indices, ITensorInfo *dst, const PoolingLayerInfo &pool_info); + void + configure(const ITensorInfo *src, const ITensorInfo *indices, ITensorInfo *dst, const PoolingLayerInfo &pool_info); /** Static function to check if given info will lead to a valid configuration of @ref CpuMaxUnpoolingLayerKernel * * @param[in] src Source tensor to permute. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. 
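For context on the available_kernels table in CpuMaxUnpoolingLayerKernel.cpp above: each entry pairs a name, a selector predicate over the requested data type and ISA features, and a micro-kernel pointer; configure() then asks get_implementation() for the first entry whose predicate accepts the current DataTypeISASelectorData and caches its ukernel in _run_method. A minimal, self-contained sketch of that dispatch pattern (all type and kernel names below are simplified stand-ins, not the library's own types):

#include <cstdio>
#include <functional>
#include <vector>

// Simplified stand-ins for the library's selector types; names are illustrative only.
enum class DataType
{
    F32,
    F16,
    QASYMM8,
    QASYMM8_SIGNED
};

struct SelectorData
{
    DataType dt;
    bool     fp16_supported; // stands in for the CPU ISA fp16 flag
};

struct KernelEntry
{
    const char                               *name;
    std::function<bool(const SelectorData &)> is_selected;
    void (*ukernel)();
};

void neon_fp32_kernel() { std::puts("running fp32 micro-kernel"); }
void neon_fp16_kernel() { std::puts("running fp16 micro-kernel"); }

// Returns the first table entry whose predicate accepts the selector data and
// that has a usable micro-kernel, mirroring how the table above is consumed.
const KernelEntry *get_implementation(const std::vector<KernelEntry> &table, const SelectorData &data)
{
    for (const auto &entry : table)
    {
        if (entry.is_selected(data) && entry.ukernel != nullptr)
        {
            return &entry;
        }
    }
    return nullptr;
}

int main()
{
    const std::vector<KernelEntry> available_kernels = {
        {"neon_fp32_kernel", [](const SelectorData &d) { return d.dt == DataType::F32; }, neon_fp32_kernel},
        {"neon_fp16_kernel", [](const SelectorData &d) { return d.dt == DataType::F16 && d.fp16_supported; },
         neon_fp16_kernel},
    };

    const KernelEntry *uk = get_implementation(available_kernels, SelectorData{DataType::F32, true});
    if (uk != nullptr)
    {
        uk->ukernel(); // prints "running fp32 micro-kernel"
    }
    return 0;
}

Keeping the predicate next to the REGISTER_* macro means an entry whose kernel is not built appears to collapse to a null pointer (per Registrars.h) and is simply skipped, so the same table covers all build configurations.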
@@ -66,7 +68,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *indices, const ITensorInfo *dst, const PoolingLayerInfo &pool_info); + static Status validate(const ITensorInfo *src, + const ITensorInfo *indices, + const ITensorInfo *dst, + const PoolingLayerInfo &pool_info); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; @@ -83,7 +88,7 @@ public: const char *name() const override; private: - MaxUnpoolingUKernelPtr _run_method{ nullptr }; + MaxUnpoolingUKernelPtr _run_method{nullptr}; }; } // namespace kernels diff --git a/src/cpu/kernels/CpuMulKernel.cpp b/src/cpu/kernels/CpuMulKernel.cpp index b73d2bdf73..ba086e3ac6 100644 --- a/src/cpu/kernels/CpuMulKernel.cpp +++ b/src/cpu/kernels/CpuMulKernel.cpp @@ -25,23 +25,24 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" + #include "src/core/CPP/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "src/core/NEON/NEAsymm.h" #include "src/core/NEON/NESymm.h" #include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" #include <arm_neon.h> namespace { #if defined(ENABLE_FP32_KERNELS) - static constexpr size_t default_mws_N1_fp32_neon = 22447; - static constexpr size_t default_mws_V1_fp32_neon = 38982; +static constexpr size_t default_mws_N1_fp32_neon = 22447; +static constexpr size_t default_mws_V1_fp32_neon = 38982; #endif /* ENABLE_FP32_KERNELS */ - static constexpr size_t default_mws_other_platforms_1d_tensor = 10240; -} +static constexpr size_t default_mws_other_platforms_1d_tensor = 10240; +} // namespace namespace arm_compute { namespace cpu @@ -54,29 +55,38 @@ const float scale255_constant = 1.f / 255.f; const float32x4_t scale255_constant_f32q = vdupq_n_f32(scale255_constant); const float32x4_t positive_round_f32q = vdupq_n_f32(0.5f); -inline Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy) +inline Status validate_arguments(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy) { ARM_COMPUTE_UNUSED(overflow_policy); ARM_COMPUTE_UNUSED(rounding_policy); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src1); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32, DataType::QSYMM16, DataType::F16, - DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32, DataType::QSYMM16, DataType::F16, - DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, - DataType::S16, DataType::QSYMM16, + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::U8, DataType::QASYMM8, + DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32, + DataType::QSYMM16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 1, DataType::U8, DataType::QASYMM8, + DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32, + DataType::QSYMM16, DataType::F16, DataType::F32); + 
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::U8, DataType::QASYMM8, + DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16, DataType::S32, DataType::F16, DataType::F32); - if(is_data_type_quantized(src1->data_type()) || is_data_type_quantized(src2->data_type())) + if (is_data_type_quantized(src1->data_type()) || is_data_type_quantized(src2->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(overflow_policy == ConvertPolicy::WRAP, "ConvertPolicy cannot be WRAP if datatype is quantized"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(overflow_policy == ConvertPolicy::WRAP, + "ConvertPolicy cannot be WRAP if datatype is quantized"); } - if(dst->total_size() > 0) + if (dst->total_size() > 0) { const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape()); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), + "Wrong shape for dst"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); // clang-format off ARM_COMPUTE_RETURN_ERROR_ON_MSG( @@ -88,13 +98,17 @@ inline Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src !(src1->data_type() == DataType::QSYMM16 && src2->data_type() == DataType::QSYMM16 && dst->data_type() == DataType::S32) , "Invalid data type combination"); // clang-format on - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->data_type() == DataType::S16 && dst->data_type() == DataType::S32 && scale != 1.f, "Unsupported scale for QSYMM16 inputs and S32 dst"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->data_type() == DataType::S16 && dst->data_type() == DataType::S32 && + scale != 1.f, + "Unsupported scale for QSYMM16 inputs and S32 dst"); } - if(std::abs(scale - scale255_constant) < 0.00001f) + if (std::abs(scale - scale255_constant) < 0.00001f) { - ARM_COMPUTE_RETURN_ERROR_ON(rounding_policy != RoundingPolicy::TO_NEAREST_UP && rounding_policy != RoundingPolicy::TO_NEAREST_EVEN); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->data_type() == DataType::S32 && src2->data_type() == DataType::S32 && dst->data_type() == DataType::S32, + ARM_COMPUTE_RETURN_ERROR_ON(rounding_policy != RoundingPolicy::TO_NEAREST_UP && + rounding_policy != RoundingPolicy::TO_NEAREST_EVEN); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->data_type() == DataType::S32 && src2->data_type() == DataType::S32 && + dst->data_type() == DataType::S32, "Scale == 1/255 is not supported if input and dst are of data type S32"); } else @@ -107,7 +121,8 @@ inline Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src // Use int scaling if factor is equal to 1/2^n for 0 <= n <= 15 // frexp returns 0.5 as mantissa which means that the exponent will be in the range of -1 <= e <= 14 // Moreover, it will be negative as we deal with 1/2^n - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1)), "Scale value not supported (Should be 1/(2^n) or 1/255"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1)), + "Scale value not supported (Should be 1/(2^n) or 1/255"); } return Status{}; @@ -168,9 +183,9 @@ void mul_saturate_quantized_8(const ITensor *src1, const ITensor *src2, ITensor const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != 
src2->info()->tensor_shape().x(); const UniformQuantizationInfo output_qua_info = out->info()->quantization_info().uniform(); - const UniformQuantizationInfo tmp_qua_info = { output_qua_info.scale / scale, output_qua_info.offset }; + const UniformQuantizationInfo tmp_qua_info = {output_qua_info.scale / scale, output_qua_info.offset}; - if(is_broadcast_across_x) + if (is_broadcast_across_x) { const bool is_broadcast_input_2 = input2_win.x().step() == 0; Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; @@ -178,7 +193,7 @@ void mul_saturate_quantized_8(const ITensor *src1, const ITensor *src2, ITensor const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1; const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1; const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform(); - const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); + const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); // Clear X Dimension on execution window as we handle manually non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); @@ -190,52 +205,52 @@ void mul_saturate_quantized_8(const ITensor *src1, const ITensor *src2, ITensor using ExactTagType = typename wrapper::traits::neon_vector<T, window_step_x>::tag_type; execute_window_loop( - win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast<const T *>(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast<T *>(dst.ptr()); - - const auto broadcast_value = *reinterpret_cast<const T *>(broadcast_input.ptr()); - const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{}); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + win, + [&](const Coordinates &) { - const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x); + const auto non_broadcast_input_ptr = reinterpret_cast<const T *>(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast<T *>(dst.ptr()); - // Dequantize inputs - const float32x4x4_t in1_f32x4x4 = vdequantize(non_broadcast_v, non_broadcast_qinfo); - const float32x4x4_t in2_f32x4x4 = vdequantize(broadcast_value_vec, broadcast_qinfo); + const auto broadcast_value = *reinterpret_cast<const T *>(broadcast_input.ptr()); + const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{}); - const float32x4x4_t out_f32x4x4 = + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]), - vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]), - vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]), - vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]), - }; - - // Quantize dst - const auto result = vquantize<T>(out_f32x4x4, tmp_qua_info); - wrapper::vstore(output_ptr + x, result); - } + const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x); + + // Dequantize inputs + const float32x4x4_t in1_f32x4x4 = vdequantize(non_broadcast_v, non_broadcast_qinfo); + const float32x4x4_t in2_f32x4x4 = vdequantize(broadcast_value_vec, broadcast_qinfo); + + const float32x4x4_t out_f32x4x4 = { + vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]), + vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]), + 
vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]), + vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]), + }; + + // Quantize dst + const auto result = vquantize<T>(out_f32x4x4, tmp_qua_info); + wrapper::vstore(output_ptr + x, result); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - // Dequantize inputs - const T src1 = *(non_broadcast_input_ptr + x); - const float tmp_in1 = Qasymm8QuantizationHelper<T>::dequantize(src1, non_broadcast_qinfo); - const float tmp_in2 = Qasymm8QuantizationHelper<T>::dequantize(broadcast_value, broadcast_qinfo); - const float tmp_f = tmp_in1 * tmp_in2; - - // Quantize dst - const auto tmp_qua = Qasymm8QuantizationHelper<T>::quantize(tmp_f, tmp_qua_info); - *(output_ptr + x) = tmp_qua; - } - }, - broadcast_input, non_broadcast_input, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + // Dequantize inputs + const T src1 = *(non_broadcast_input_ptr + x); + const float tmp_in1 = Qasymm8QuantizationHelper<T>::dequantize(src1, non_broadcast_qinfo); + const float tmp_in2 = Qasymm8QuantizationHelper<T>::dequantize(broadcast_value, broadcast_qinfo); + const float tmp_f = tmp_in1 * tmp_in2; + + // Quantize dst + const auto tmp_qua = Qasymm8QuantizationHelper<T>::quantize(tmp_f, tmp_qua_info); + *(output_ptr + x) = tmp_qua; + } + }, + broadcast_input, non_broadcast_input, dst); } else { @@ -251,56 +266,59 @@ void mul_saturate_quantized_8(const ITensor *src1, const ITensor *src2, ITensor Iterator dst(out, win); execute_window_loop( - win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast<const T *>(input1.ptr()); - const auto input2_ptr = reinterpret_cast<const T *>(input2.ptr()); - const auto output_ptr = reinterpret_cast<T *>(dst.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + win, + [&](const Coordinates &) { - const auto input1_q = wrapper::vloadq(input1_ptr + x); - const auto input2_q = wrapper::vloadq(input2_ptr + x); - - // Dequantize inputs - const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info); - const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info); + const auto input1_ptr = reinterpret_cast<const T *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const T *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<T *>(dst.ptr()); - const float32x4x4_t out_f32x4x4 = + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]), - vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]), - vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]), - vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]), - }; - - // Quantize dst - const auto result = vquantize<T>(out_f32x4x4, tmp_qua_info); - wrapper::vstore(output_ptr + x, result); - } + const auto input1_q = wrapper::vloadq(input1_ptr + x); + const auto input2_q = wrapper::vloadq(input2_ptr + x); + + // Dequantize inputs + const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info); + const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info); + + const float32x4x4_t out_f32x4x4 = { + vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]), + vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]), + vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]), + vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]), + }; + + // Quantize dst + const auto result = 
vquantize<T>(out_f32x4x4, tmp_qua_info); + wrapper::vstore(output_ptr + x, result); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - // Dequantize inputs - const T src1 = *(input1_ptr + x); - const T src2 = *(input2_ptr + x); - const float tmp_in1 = Qasymm8QuantizationHelper<T>::dequantize(src1, input1_qua_info); - const float tmp_in2 = Qasymm8QuantizationHelper<T>::dequantize(src2, input2_qua_info); - const float tmp_f = tmp_in1 * tmp_in2; - - // Quantize dst - const auto tmp_qua = Qasymm8QuantizationHelper<T>::quantize(tmp_f, tmp_qua_info); - *(output_ptr + x) = tmp_qua; - } - }, - input1, input2, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + // Dequantize inputs + const T src1 = *(input1_ptr + x); + const T src2 = *(input2_ptr + x); + const float tmp_in1 = Qasymm8QuantizationHelper<T>::dequantize(src1, input1_qua_info); + const float tmp_in2 = Qasymm8QuantizationHelper<T>::dequantize(src2, input2_qua_info); + const float tmp_f = tmp_in1 * tmp_in2; + + // Quantize dst + const auto tmp_qua = Qasymm8QuantizationHelper<T>::quantize(tmp_f, tmp_qua_info); + *(output_ptr + x) = tmp_qua; + } + }, + input1, input2, dst); } } -bool mul_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, float scale) +bool mul_q8_neon_fixedpoint_possible(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + float scale) { const auto iq0 = src0->quantization_info().uniform(); const auto iq1 = src1->quantization_info().uniform(); @@ -308,7 +326,7 @@ bool mul_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo const auto multiplier = ((iq0.scale * iq1.scale) / oq.scale) * scale; - if(multiplier < -8191.f || multiplier > 8191.f) + if (multiplier < -8191.f || multiplier > 8191.f) { //The multiplier cannot be stored as a 14.18 signed fixed-point number return false; @@ -318,7 +336,7 @@ bool mul_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo const auto max_result = multiplier * (256) * (256) + offset_out; - if(max_result > 8191.f) + if (max_result > 8191.f) { //It might not be possible to store the result as a 14.18 signed fixed-point number. return false; @@ -366,7 +384,7 @@ void mul_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *d const auto out_offset_14p18 = static_cast<int32_t>(out_offset * two_pwr18i); const auto multiplier_14p18 = static_cast<int32_t>(multiplier * two_pwr18f); - if(is_broadcast_across_x) + if (is_broadcast_across_x) { // Prefix: a = non-broadcast, b = broadcast. 
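For context on the hunks that follow: mul_q8_neon_fixedpoint keeps the combined rescale factor ((iq0.scale * iq1.scale) / oq.scale) * scale as a signed 14.18 fixed-point value, which is why mul_q8_neon_fixedpoint_possible rejects multipliers outside roughly +/-8191, and the later 8 + 8 + 2 bit shifts undo the 2^18 scaling. A scalar sketch of that arithmetic, with made-up quantization parameters purely for illustration:

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Scalar model of the signed 14.18 fixed-point multiply path.
// All quantization parameters below are illustrative assumptions, not values from the diff.
int8_t mul_q8_fixedpoint_scalar(int8_t a, int8_t b)
{
    const float   in0_scale = 0.5f, in1_scale = 0.25f, out_scale = 0.75f, op_scale = 1.0f;
    const int32_t in0_offset = 10, in1_offset = -5, out_offset = 3;

    // Combined rescale factor; it must stay within roughly +/-8191 to be representable
    // as a signed 14.18 fixed-point number (the check done in mul_q8_neon_fixedpoint_possible).
    const float   multiplier       = (in0_scale * in1_scale / out_scale) * op_scale;
    const int32_t multiplier_14p18 = static_cast<int32_t>(multiplier * (1 << 18));
    const int32_t out_offset_14p18 = out_offset * (1 << 18);

    // Remove the input offsets, multiply in plain 32-bit integers,
    // then rescale and add the output offset in 14.18.
    const int32_t inner = (static_cast<int32_t>(a) - in0_offset) * (static_cast<int32_t>(b) - in1_offset);
    const int32_t acc   = multiplier_14p18 * inner + out_offset_14p18;

    // Rounding shift right by 18 reverts the 2^18 scaling.
    const int32_t result = (acc + (1 << 17)) >> 18;

    // Saturate to the signed 8-bit output range.
    return static_cast<int8_t>(std::max(-128, std::min(127, result)));
}

int main()
{
    std::printf("%d\n", static_cast<int>(mul_q8_fixedpoint_scalar(40, 20)));
    return 0;
}

The NEON path spreads the final shift over vshrq_n<8>, vqrshrn_ex<8> and vqrshrn<2>, so each narrowing step also saturates; the sketch collapses this into a single rounding shift plus a clamp.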
@@ -392,78 +410,76 @@ void mul_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *d Iterator out_it(dst, win); execute_window_loop( - win, [&](const Coordinates &) - { - const auto a_ptr = reinterpret_cast<const ScalarType *>(a_input_it.ptr()); - const auto b_ptr = reinterpret_cast<const ScalarType *>(b_input_it.ptr()); - const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr()); - - const auto b_val = *b_ptr; - const auto b_offseted_32p0 = static_cast<int32_t>(b_val - b_offset_16p0); - const auto b_voffseted_32p0 = wrapper::vdup_n(b_offseted_32p0, wrapper::traits::vector_128_tag()); + win, + [&](const Coordinates &) + { + const auto a_ptr = reinterpret_cast<const ScalarType *>(a_input_it.ptr()); + const auto b_ptr = reinterpret_cast<const ScalarType *>(b_input_it.ptr()); + const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr()); - const auto vmultiplier_14p18 = wrapper::vdup_n(multiplier_14p18, wrapper::traits::vector_128_tag()); - const auto voffsetout_14p18 = wrapper::vdup_n(out_offset_14p18, wrapper::traits::vector_128_tag()); + const auto b_val = *b_ptr; + const auto b_offseted_32p0 = static_cast<int32_t>(b_val - b_offset_16p0); + const auto b_voffseted_32p0 = wrapper::vdup_n(b_offseted_32p0, wrapper::traits::vector_128_tag()); - int x = window_start_x; + const auto vmultiplier_14p18 = wrapper::vdup_n(multiplier_14p18, wrapper::traits::vector_128_tag()); + const auto voffsetout_14p18 = wrapper::vdup_n(out_offset_14p18, wrapper::traits::vector_128_tag()); - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - // Load the inputs. - const auto a_vin_8p0 = wrapper::vloadq(a_ptr + x); - - // Widen the non-broadcast elements to signed 16-bit regardless of the input signedness. - const auto a_vin_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(a_vin_8p0))); - const auto a_vin_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(a_vin_8p0))); - - const auto voffseted_32p0_00 = wrapper::vsubl(wrapper::vgetlow(a_vin_16p0_0), a_voffset_16p0); - const auto voffseted_32p0_01 = wrapper::vsubl(wrapper::vgethigh(a_vin_16p0_0), a_voffset_16p0); - const auto voffseted_32p0_10 = wrapper::vsubl(wrapper::vgetlow(a_vin_16p0_1), a_voffset_16p0); - const auto voffseted_32p0_11 = wrapper::vsubl(wrapper::vgethigh(a_vin_16p0_1), a_voffset_16p0); - - const auto vinnermul_32p0_00 = wrapper::vmul(voffseted_32p0_00, b_voffseted_32p0); - const auto vinnermul_32p0_01 = wrapper::vmul(voffseted_32p0_01, b_voffseted_32p0); - const auto vinnermul_32p0_10 = wrapper::vmul(voffseted_32p0_10, b_voffseted_32p0); - const auto vinnermul_32p0_11 = wrapper::vmul(voffseted_32p0_11, b_voffseted_32p0); - - const auto vout_14p18_00 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_00, vmultiplier_14p18); - const auto vout_14p18_01 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_01, vmultiplier_14p18); - const auto vout_14p18_10 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_10, vmultiplier_14p18); - const auto vout_14p18_11 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_11, vmultiplier_14p18); - - // These shift rights are to revert the multiplication by twopwr18. Hard limit of a maximum shift by 8 requires multiple shift instructions to achieve this. 
- const auto vout_15p1_00 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_00)); - const auto vout_15p1_01 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_01)); - const auto vout_15p1_10 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_10)); - const auto vout_15p1_11 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_11)); - - const auto vout_15p1_0 = wrapper::vcombine( - vout_15p1_00, - vout_15p1_01); - - const auto vout_15p1_1 = wrapper::vcombine( - vout_15p1_10, - vout_15p1_11); - const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr()); + int x = window_start_x; - const auto vout_8p0 = wrapper::vcombine( - wrapper::vqrshrn<2>(vout_15p1_0), - wrapper::vqrshrn<2>(vout_15p1_1)); - wrapper::vstore(out_ptr + x, vout_8p0); - } + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + // Load the inputs. + const auto a_vin_8p0 = wrapper::vloadq(a_ptr + x); + + // Widen the non-broadcast elements to signed 16-bit regardless of the input signedness. + const auto a_vin_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(a_vin_8p0))); + const auto a_vin_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(a_vin_8p0))); + + const auto voffseted_32p0_00 = wrapper::vsubl(wrapper::vgetlow(a_vin_16p0_0), a_voffset_16p0); + const auto voffseted_32p0_01 = wrapper::vsubl(wrapper::vgethigh(a_vin_16p0_0), a_voffset_16p0); + const auto voffseted_32p0_10 = wrapper::vsubl(wrapper::vgetlow(a_vin_16p0_1), a_voffset_16p0); + const auto voffseted_32p0_11 = wrapper::vsubl(wrapper::vgethigh(a_vin_16p0_1), a_voffset_16p0); + + const auto vinnermul_32p0_00 = wrapper::vmul(voffseted_32p0_00, b_voffseted_32p0); + const auto vinnermul_32p0_01 = wrapper::vmul(voffseted_32p0_01, b_voffseted_32p0); + const auto vinnermul_32p0_10 = wrapper::vmul(voffseted_32p0_10, b_voffseted_32p0); + const auto vinnermul_32p0_11 = wrapper::vmul(voffseted_32p0_11, b_voffseted_32p0); + + const auto vout_14p18_00 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_00, vmultiplier_14p18); + const auto vout_14p18_01 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_01, vmultiplier_14p18); + const auto vout_14p18_10 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_10, vmultiplier_14p18); + const auto vout_14p18_11 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_11, vmultiplier_14p18); + + // These shift rights are to revert the multiplication by twopwr18. Hard limit of a maximum shift by 8 requires multiple shift instructions to achieve this. + const auto vout_15p1_00 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_00)); + const auto vout_15p1_01 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_01)); + const auto vout_15p1_10 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_10)); + const auto vout_15p1_11 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_11)); + + const auto vout_15p1_0 = wrapper::vcombine(vout_15p1_00, vout_15p1_01); + + const auto vout_15p1_1 = wrapper::vcombine(vout_15p1_10, vout_15p1_11); + const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr()); + + const auto vout_8p0 = + wrapper::vcombine(wrapper::vqrshrn<2>(vout_15p1_0), wrapper::vqrshrn<2>(vout_15p1_1)); + wrapper::vstore(out_ptr + x, vout_8p0); + } - //Process the left-over elements. - for(; x < window_end_x; ++x) - { + //Process the left-over elements. 
+ for (; x < window_end_x; ++x) + { #ifdef __aarch64__ - out_ptr[x] = wrapper::vqrshrn<2>(wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>((multiplier_14p18 * (int32_t(a_ptr[x]) - a_offset_16p0) * (int32_t( - b_val) - b_offset_16p0)) + out_offset_14p18))); + out_ptr[x] = wrapper::vqrshrn<2>(wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>( + (multiplier_14p18 * (int32_t(a_ptr[x]) - a_offset_16p0) * (int32_t(b_val) - b_offset_16p0)) + + out_offset_14p18))); #else //__aarch64__ - out_ptr[x] = utility::clamp<int32_t, ScalarType>(support::cpp11::lround(multiplier * ((float(a_ptr[x]) - a_offset) * (float(b_val) - b_offset)) + float(out_offset))); + out_ptr[x] = utility::clamp<int32_t, ScalarType>(support::cpp11::lround( + multiplier * ((float(a_ptr[x]) - a_offset) * (float(b_val) - b_offset)) + float(out_offset))); #endif //__aarch64__ - } - }, - a_input_it, b_input_it, out_it); + } + }, + a_input_it, b_input_it, out_it); } else { @@ -481,82 +497,83 @@ void mul_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *d Iterator out_it(dst, win); execute_window_loop( - win, [&](const Coordinates &) - { - const auto in0_ptr = reinterpret_cast<const ScalarType *>(in0_it.ptr()); - const auto in1_ptr = reinterpret_cast<const ScalarType *>(in1_it.ptr()); - const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr()); + win, + [&](const Coordinates &) + { + const auto in0_ptr = reinterpret_cast<const ScalarType *>(in0_it.ptr()); + const auto in1_ptr = reinterpret_cast<const ScalarType *>(in1_it.ptr()); + const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr()); - int x = window_start_x; + int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - // Load the inputs. - const auto vin0_8p0 = wrapper::vloadq(in0_ptr + x); - const auto vin1_8p0 = wrapper::vloadq(in1_ptr + x); - - // Widen the input elements to signed 16-bit regardless of the input signedness. 
- const auto vin0_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin0_8p0))); - const auto vin0_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin0_8p0))); - const auto vin1_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin1_8p0))); - const auto vin1_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin1_8p0))); - - const auto voffseted0_32p0_00 = wrapper::vsubl(wrapper::vgetlow(vin0_16p0_0), voffset0_16p0); - const auto voffseted0_32p0_01 = wrapper::vsubl(wrapper::vgethigh(vin0_16p0_0), voffset0_16p0); - const auto voffseted0_32p0_10 = wrapper::vsubl(wrapper::vgetlow(vin0_16p0_1), voffset0_16p0); - const auto voffseted0_32p0_11 = wrapper::vsubl(wrapper::vgethigh(vin0_16p0_1), voffset0_16p0); - - const auto voffseted1_32p0_00 = wrapper::vsubl(wrapper::vgetlow(vin1_16p0_0), voffset1_16p0); - const auto voffseted1_32p0_01 = wrapper::vsubl(wrapper::vgethigh(vin1_16p0_0), voffset1_16p0); - const auto voffseted1_32p0_10 = wrapper::vsubl(wrapper::vgetlow(vin1_16p0_1), voffset1_16p0); - const auto voffseted1_32p0_11 = wrapper::vsubl(wrapper::vgethigh(vin1_16p0_1), voffset1_16p0); - - const auto vinnermul_32p0_00 = wrapper::vmul(voffseted0_32p0_00, voffseted1_32p0_00); - const auto vinnermul_32p0_01 = wrapper::vmul(voffseted0_32p0_01, voffseted1_32p0_01); - const auto vinnermul_32p0_10 = wrapper::vmul(voffseted0_32p0_10, voffseted1_32p0_10); - const auto vinnermul_32p0_11 = wrapper::vmul(voffseted0_32p0_11, voffseted1_32p0_11); - - const auto vout_14p18_00 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_00, vmultiplier_14p18); - const auto vout_14p18_01 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_01, vmultiplier_14p18); - const auto vout_14p18_10 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_10, vmultiplier_14p18); - const auto vout_14p18_11 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_11, vmultiplier_14p18); - - // These shift rights are to revert the multiplication by twopwr18. Hard limit of a maximum shift by 8 requires multiple shift instructions to achieve this. - const auto vout_14p2_00 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_00)); - const auto vout_14p2_01 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_01)); - const auto vout_14p2_10 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_10)); - const auto vout_14p2_11 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_11)); - - const auto vout_14p2_0 = wrapper::vcombine( - vout_14p2_00, - vout_14p2_01); - - const auto vout_14p2_1 = wrapper::vcombine( - vout_14p2_10, - vout_14p2_11); - - const auto vout_8p0 = wrapper::vcombine( - wrapper::vqrshrn<2>(vout_14p2_0), - wrapper::vqrshrn<2>(vout_14p2_1)); - wrapper::vstore(out_ptr + x, vout_8p0); - } + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + // Load the inputs. + const auto vin0_8p0 = wrapper::vloadq(in0_ptr + x); + const auto vin1_8p0 = wrapper::vloadq(in1_ptr + x); + + // Widen the input elements to signed 16-bit regardless of the input signedness. 
+ const auto vin0_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin0_8p0))); + const auto vin0_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin0_8p0))); + const auto vin1_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin1_8p0))); + const auto vin1_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin1_8p0))); + + const auto voffseted0_32p0_00 = wrapper::vsubl(wrapper::vgetlow(vin0_16p0_0), voffset0_16p0); + const auto voffseted0_32p0_01 = wrapper::vsubl(wrapper::vgethigh(vin0_16p0_0), voffset0_16p0); + const auto voffseted0_32p0_10 = wrapper::vsubl(wrapper::vgetlow(vin0_16p0_1), voffset0_16p0); + const auto voffseted0_32p0_11 = wrapper::vsubl(wrapper::vgethigh(vin0_16p0_1), voffset0_16p0); + + const auto voffseted1_32p0_00 = wrapper::vsubl(wrapper::vgetlow(vin1_16p0_0), voffset1_16p0); + const auto voffseted1_32p0_01 = wrapper::vsubl(wrapper::vgethigh(vin1_16p0_0), voffset1_16p0); + const auto voffseted1_32p0_10 = wrapper::vsubl(wrapper::vgetlow(vin1_16p0_1), voffset1_16p0); + const auto voffseted1_32p0_11 = wrapper::vsubl(wrapper::vgethigh(vin1_16p0_1), voffset1_16p0); + + const auto vinnermul_32p0_00 = wrapper::vmul(voffseted0_32p0_00, voffseted1_32p0_00); + const auto vinnermul_32p0_01 = wrapper::vmul(voffseted0_32p0_01, voffseted1_32p0_01); + const auto vinnermul_32p0_10 = wrapper::vmul(voffseted0_32p0_10, voffseted1_32p0_10); + const auto vinnermul_32p0_11 = wrapper::vmul(voffseted0_32p0_11, voffseted1_32p0_11); + + const auto vout_14p18_00 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_00, vmultiplier_14p18); + const auto vout_14p18_01 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_01, vmultiplier_14p18); + const auto vout_14p18_10 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_10, vmultiplier_14p18); + const auto vout_14p18_11 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_11, vmultiplier_14p18); + + // These shift rights are to revert the multiplication by twopwr18. Hard limit of a maximum shift by 8 requires multiple shift instructions to achieve this. + const auto vout_14p2_00 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_00)); + const auto vout_14p2_01 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_01)); + const auto vout_14p2_10 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_10)); + const auto vout_14p2_11 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_11)); + + const auto vout_14p2_0 = wrapper::vcombine(vout_14p2_00, vout_14p2_01); + + const auto vout_14p2_1 = wrapper::vcombine(vout_14p2_10, vout_14p2_11); + + const auto vout_8p0 = + wrapper::vcombine(wrapper::vqrshrn<2>(vout_14p2_0), wrapper::vqrshrn<2>(vout_14p2_1)); + wrapper::vstore(out_ptr + x, vout_8p0); + } - //Process the left-over elements. - for(; x < window_end_x; ++x) - { + //Process the left-over elements. 
+ for (; x < window_end_x; ++x) + { #ifdef __aarch64__ - out_ptr[x] = wrapper::vqrshrn<2>(wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>((multiplier_14p18 * (int32_t(in0_ptr[x]) - in0_offset_16p0) * (int32_t( - in1_ptr[x]) - in1_offset_16p0)) + out_offset_14p18))); + out_ptr[x] = wrapper::vqrshrn<2>(wrapper::vqrshrn_ex<8, ScalarType>( + wrapper::vshrq_n<8>((multiplier_14p18 * (int32_t(in0_ptr[x]) - in0_offset_16p0) * + (int32_t(in1_ptr[x]) - in1_offset_16p0)) + + out_offset_14p18))); #else //__aarch64__ - out_ptr[x] = utility::clamp<int32_t, ScalarType>(support::cpp11::lround(multiplier * ((float(in0_ptr[x]) - in0_offset) * (float(in1_ptr[x]) - in1_offset)) + float(out_offset))); + out_ptr[x] = utility::clamp<int32_t, ScalarType>(support::cpp11::lround( + multiplier * ((float(in0_ptr[x]) - in0_offset) * (float(in1_ptr[x]) - in1_offset)) + + float(out_offset))); #endif //__aarch64__ - } - }, - in0_it, in1_it, out_it); + } + }, + in0_it, in1_it, out_it); } } -void mul_saturate_QSYMM16_QSYMM16_QSYMM16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale) +void mul_saturate_QSYMM16_QSYMM16_QSYMM16( + const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale) { const UniformQuantizationInfo input1_qua_info = src1->info()->quantization_info().uniform(); const UniformQuantizationInfo input2_qua_info = src2->info()->quantization_info().uniform(); @@ -580,66 +597,61 @@ void mul_saturate_QSYMM16_QSYMM16_QSYMM16(const ITensor *src1, const ITensor *sr const auto window_start_x = static_cast<int>(window.x().start()); const auto window_end_x = static_cast<int>(window.x().end()); - const UniformQuantizationInfo tmp_qua_info = { output_qua_info.scale / scale, output_qua_info.offset }; + const UniformQuantizationInfo tmp_qua_info = {output_qua_info.scale / scale, output_qua_info.offset}; execute_window_loop( - win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast<const qsymm16_t *>(input1.ptr()); - const auto input2_ptr = reinterpret_cast<const qsymm16_t *>(input2.ptr()); - const auto output_ptr = reinterpret_cast<qsymm16_t *>(dst.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + win, + [&](const Coordinates &) { - const qsymm16x8x2_t input1_q = + const auto input1_ptr = reinterpret_cast<const qsymm16_t *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const qsymm16_t *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<qsymm16_t *>(dst.ptr()); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { + const qsymm16x8x2_t input1_q = {{ vld1q_s16(input1_ptr + x), vld1q_s16(input1_ptr + x + 8), - } - }; - const qsymm16x8x2_t input2_q = - { - { + }}; + const qsymm16x8x2_t input2_q = {{ vld1q_s16(input2_ptr + x), vld1q_s16(input2_ptr + x + 8), - } - }; + }}; - // Dequantize inputs - const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info); - const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info); + // Dequantize inputs + const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info); + const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info); - const float32x4x4_t out_f32x4x4 = - { - vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]), - vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]), - vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]), - 
vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]), - }; - - const qsymm16x8x2_t result = vquantize_qsymm16(out_f32x4x4, tmp_qua_info); - vst1q_s16(output_ptr + x, result.val[0]); - vst1q_s16(output_ptr + x + 8, result.val[1]); - } + const float32x4x4_t out_f32x4x4 = { + vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]), + vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]), + vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]), + vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]), + }; - // Compute left-over elements - for(; x < window_end_x; ++x) - { - // Dequantize inputs - float tmp_in1 = static_cast<float>(*(input1_ptr + x)) * input1_qua_info.scale; - float tmp_in2 = static_cast<float>(*(input2_ptr + x)) * input2_qua_info.scale; - float tmp_f = tmp_in1 * tmp_in2; - - // Quantize dst, lrintf() has same rounding mode as vcombine_s16 - int32_t tmp = lrintf(tmp_f / tmp_qua_info.scale); - qsymm16_t tmp_qua = static_cast<qsymm16_t>(tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp); - *(output_ptr + x) = tmp_qua; - } - }, - input1, input2, dst); + const qsymm16x8x2_t result = vquantize_qsymm16(out_f32x4x4, tmp_qua_info); + vst1q_s16(output_ptr + x, result.val[0]); + vst1q_s16(output_ptr + x + 8, result.val[1]); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + // Dequantize inputs + float tmp_in1 = static_cast<float>(*(input1_ptr + x)) * input1_qua_info.scale; + float tmp_in2 = static_cast<float>(*(input2_ptr + x)) * input2_qua_info.scale; + float tmp_f = tmp_in1 * tmp_in2; + + // Quantize dst, lrintf() has same rounding mode as vcombine_s16 + int32_t tmp = lrintf(tmp_f / tmp_qua_info.scale); + qsymm16_t tmp_qua = + static_cast<qsymm16_t>(tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp); + *(output_ptr + x) = tmp_qua; + } + }, + input1, input2, dst); } void mul_QSYMM16_QSYMM16_S32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int scale) @@ -665,74 +677,60 @@ void mul_QSYMM16_QSYMM16_S32(const ITensor *src1, const ITensor *src2, ITensor * const auto window_end_x = static_cast<int>(window.x().end()); execute_window_loop( - win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast<const qsymm16_t *>(input1.ptr()); - const auto input2_ptr = reinterpret_cast<const qsymm16_t *>(input2.ptr()); - const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + win, + [&](const Coordinates &) { - const qsymm16x8x2_t input1_q = + const auto input1_ptr = reinterpret_cast<const qsymm16_t *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const qsymm16_t *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr()); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { + const qsymm16x8x2_t input1_q = {{ vld1q_s16(input1_ptr + x), vld1q_s16(input1_ptr + x + 8), - } - }; - const qsymm16x8x2_t input2_q = - { - { + }}; + const qsymm16x8x2_t input2_q = {{ vld1q_s16(input2_ptr + x), vld1q_s16(input2_ptr + x + 8), - } - }; + }}; - const int32x4x4_t in1_s32 = - { - { + const int32x4x4_t in1_s32 = {{ vmovl_s16(vget_low_s16(input1_q.val[0])), vmovl_s16(vget_high_s16(input1_q.val[0])), vmovl_s16(vget_low_s16(input1_q.val[1])), vmovl_s16(vget_high_s16(input1_q.val[1])), - } - }; - const int32x4x4_t in2_s32 = - { - { + }}; + const int32x4x4_t in2_s32 
= {{ vmovl_s16(vget_low_s16(input2_q.val[0])), vmovl_s16(vget_high_s16(input2_q.val[0])), vmovl_s16(vget_low_s16(input2_q.val[1])), vmovl_s16(vget_high_s16(input2_q.val[1])), - } - }; + }}; - const int32x4x4_t result = - { - { + const int32x4x4_t result = {{ vmulq_s32(in1_s32.val[0], in2_s32.val[0]), vmulq_s32(in1_s32.val[1], in2_s32.val[1]), vmulq_s32(in1_s32.val[2], in2_s32.val[2]), vmulq_s32(in1_s32.val[3], in2_s32.val[3]), - } - }; + }}; - vst1q_s32(output_ptr + x, result.val[0]); - vst1q_s32(output_ptr + x + 4, result.val[1]); - vst1q_s32(output_ptr + x + 8, result.val[2]); - vst1q_s32(output_ptr + x + 12, result.val[3]); - } + vst1q_s32(output_ptr + x, result.val[0]); + vst1q_s32(output_ptr + x + 4, result.val[1]); + vst1q_s32(output_ptr + x + 8, result.val[2]); + vst1q_s32(output_ptr + x + 12, result.val[3]); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x)); - *(output_ptr + x) = tmp; - } - }, - input1, input2, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x)); + *(output_ptr + x) = tmp; + } + }, + input1, input2, dst); } template <bool is_scale255, bool is_sat> @@ -757,79 +755,80 @@ void mul_U8_U8_U8(const ITensor *src1, const ITensor *src2, ITensor *out, const const auto window_end_x = static_cast<int>(window.x().end()); execute_window_loop( - win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr()); - const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr()); - const auto output_ptr = reinterpret_cast<uint8_t *>(dst.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + win, + [&](const Coordinates &) { - const uint8x16_t ta1 = wrapper::vloadq(input1_ptr + x); - const uint8x16_t ta2 = wrapper::vloadq(input2_ptr + x); + const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<uint8_t *>(dst.ptr()); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t ta1 = wrapper::vloadq(input1_ptr + x); + const uint8x16_t ta2 = wrapper::vloadq(input2_ptr + x); - uint16x8_t tmp1_high = vmovl_u8(vget_high_u8(ta1)); - const uint16x8_t tmp2_high = vmovl_u8(vget_high_u8(ta2)); - uint16x8_t tmp1_low = vmovl_u8(vget_low_u8(ta1)); - const uint16x8_t tmp2_low = vmovl_u8(vget_low_u8(ta2)); + uint16x8_t tmp1_high = vmovl_u8(vget_high_u8(ta1)); + const uint16x8_t tmp2_high = vmovl_u8(vget_high_u8(ta2)); + uint16x8_t tmp1_low = vmovl_u8(vget_low_u8(ta1)); + const uint16x8_t tmp2_low = vmovl_u8(vget_low_u8(ta2)); - tmp1_high = vmulq_u16(tmp1_high, tmp2_high); - tmp1_low = vmulq_u16(tmp1_low, tmp2_low); + tmp1_high = vmulq_u16(tmp1_high, tmp2_high); + tmp1_low = vmulq_u16(tmp1_low, tmp2_low); - if(is_scale255) - { - tmp1_high = scale255_U16_U16(tmp1_high); - tmp1_low = scale255_U16_U16(tmp1_low); - } - else - { - const int16x8_t vn = vdupq_n_s16(-n); + if (is_scale255) + { + tmp1_high = scale255_U16_U16(tmp1_high); + tmp1_low = scale255_U16_U16(tmp1_low); + } + else + { + const int16x8_t vn = vdupq_n_s16(-n); - if(is_sat) + if (is_sat) + { + tmp1_high = 
vqshlq_u16(tmp1_high, vn); + tmp1_low = vqshlq_u16(tmp1_low, vn); + } + else + { + tmp1_high = vshlq_u16(tmp1_high, vn); + tmp1_low = vshlq_u16(tmp1_low, vn); + } + } + if (is_sat) { - tmp1_high = vqshlq_u16(tmp1_high, vn); - tmp1_low = vqshlq_u16(tmp1_low, vn); + vst1q_u8(output_ptr + x, vcombine_u8(vqmovn_u16(tmp1_low), vqmovn_u16(tmp1_high))); } else { - tmp1_high = vshlq_u16(tmp1_high, vn); - tmp1_low = vshlq_u16(tmp1_low, vn); + vst1q_u8(output_ptr + x, vcombine_u8(vmovn_u16(tmp1_low), vmovn_u16(tmp1_high))); } } - if(is_sat) - { - vst1q_u8(output_ptr + x, vcombine_u8(vqmovn_u16(tmp1_low), vqmovn_u16(tmp1_high))); - } - else - { - vst1q_u8(output_ptr + x, vcombine_u8(vmovn_u16(tmp1_low), vmovn_u16(tmp1_high))); - } - } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - uint16_t tmp = static_cast<uint16_t>(*(input1_ptr + x)) * static_cast<uint16_t>(*(input2_ptr + x)); - - if(is_scale255) - { - float tmp_f = static_cast<float>(tmp) * scale255_constant; - tmp = static_cast<uint16_t>(tmp_f + 0.5f); - } - else - { - tmp >>= n; - } - if(is_sat && tmp > 255) + // Compute left-over elements + for (; x < window_end_x; ++x) { - tmp = 255; + uint16_t tmp = static_cast<uint16_t>(*(input1_ptr + x)) * static_cast<uint16_t>(*(input2_ptr + x)); + + if (is_scale255) + { + float tmp_f = static_cast<float>(tmp) * scale255_constant; + tmp = static_cast<uint16_t>(tmp_f + 0.5f); + } + else + { + tmp >>= n; + } + if (is_sat && tmp > 255) + { + tmp = 255; + } + *(output_ptr + x) = static_cast<uint8_t>(tmp); } - *(output_ptr + x) = static_cast<uint8_t>(tmp); - } - }, - input1, input2, dst); + }, + input1, input2, dst); } template <bool is_scale255, bool is_sat> @@ -843,7 +842,7 @@ inline int16x8_t mul_S16_S16_S16_n_loop(const int16x8_t &src1, const int16x8_t & tmp1_high = vmulq_s32(tmp1_high, tmp2_high); tmp1_low = vmulq_s32(tmp1_low, tmp2_low); - if(is_scale255) + if (is_scale255) { tmp1_high = scale255_S32_S32(tmp1_high); tmp1_low = scale255_S32_S32(tmp1_low); @@ -863,7 +862,7 @@ inline int16x8_t mul_S16_S16_S16_n_loop(const int16x8_t &src1, const int16x8_t & const int32x4_t sign_low_s = vreinterpretq_s32_u32(sign_low); const int32x4_t convert_high = vsubq_s32(vshlq_s32(sign_high_s, vnl), sign_high_s); const int32x4_t convert_low = vsubq_s32(vshlq_s32(sign_low_s, vnl), sign_low_s); - if(is_sat) + if (is_sat) { tmp1_high = vqshlq_s32(vaddq_s32(tmp1_high, convert_high), vn); tmp1_low = vqshlq_s32(vaddq_s32(tmp1_low, convert_low), vn); @@ -875,7 +874,7 @@ inline int16x8_t mul_S16_S16_S16_n_loop(const int16x8_t &src1, const int16x8_t & } } - if(is_sat) + if (is_sat) { return vcombine_s16(vqmovn_s32(tmp1_low), vqmovn_s32(tmp1_high)); } @@ -888,15 +887,10 @@ inline int16x8_t mul_S16_S16_S16_n_loop(const int16x8_t &src1, const int16x8_t & template <bool is_scale255, bool is_sat> inline int16x8x2_t mul_S16_S16_S16_n_k(const int16x8x2_t &src1, const int16x8x2_t &src2, int n) { - const int16x8x2_t result = - { - { - // First 8 elements - mul_S16_S16_S16_n_loop<is_scale255, is_sat>(src1.val[0], src2.val[0], n), - // Second 8 elements - mul_S16_S16_S16_n_loop<is_scale255, is_sat>(src1.val[1], src2.val[1], n) - } - }; + const int16x8x2_t result = {{// First 8 elements + mul_S16_S16_S16_n_loop<is_scale255, is_sat>(src1.val[0], src2.val[0], n), + // Second 8 elements + mul_S16_S16_S16_n_loop<is_scale255, is_sat>(src1.val[1], src2.val[1], n)}}; return result; } @@ -923,67 +917,62 @@ void mul_S16_S16_S16(const ITensor *src1, const ITensor *src2, ITensor *out, con const auto window_end_x = 
static_cast<int>(window.x().end()); execute_window_loop( - win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr()); - const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr()); - const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + win, + [&](const Coordinates &) { - const int16x8x2_t ta1 = - { - { - vld1q_s16(input1_ptr + x), - vld1q_s16(input1_ptr + x + 8), - } - }; - const int16x8x2_t ta2 = - { - { - vld1q_s16(input2_ptr + x), - vld1q_s16(input2_ptr + x + 8), - } - }; - const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n); - - vst1q_s16(output_ptr + x, result.val[0]); - vst1q_s16(output_ptr + x + 8, result.val[1]); - } + const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr()); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x)); + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int16x8x2_t ta1 = {{ + vld1q_s16(input1_ptr + x), + vld1q_s16(input1_ptr + x + 8), + }}; + const int16x8x2_t ta2 = {{ + vld1q_s16(input2_ptr + x), + vld1q_s16(input2_ptr + x + 8), + }}; + const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n); + + vst1q_s16(output_ptr + x, result.val[0]); + vst1q_s16(output_ptr + x + 8, result.val[1]); + } - if(is_scale255) + // Compute left-over elements + for (; x < window_end_x; ++x) { - float tmp_f = static_cast<float>(tmp) * scale255_constant; + int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x)); - tmp = static_cast<int32_t>(tmp_f + 0.5f); - } - else - { - if(tmp >= 0) + if (is_scale255) { - tmp >>= n; + float tmp_f = static_cast<float>(tmp) * scale255_constant; + + tmp = static_cast<int32_t>(tmp_f + 0.5f); } else { - uint32_t mask = (1u << n) - 1; - tmp = (tmp + static_cast<int32_t>(mask)) >> n; + if (tmp >= 0) + { + tmp >>= n; + } + else + { + uint32_t mask = (1u << n) - 1; + tmp = (tmp + static_cast<int32_t>(mask)) >> n; + } } + if (is_sat) + { + tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp); + } + *(output_ptr + x) = static_cast<int16_t>(tmp); } - if(is_sat) - { - tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? 
SHRT_MIN : tmp); - } - *(output_ptr + x) = static_cast<int16_t>(tmp); - } - }, - input1, input2, dst); + }, + input1, input2, dst); } template <bool is_sat> @@ -1012,7 +1001,7 @@ inline int32x4_t mul_S32_S32_S32_n_loop(const int32x4_t &src1, const int32x4_t & const uint64x2_t sign_2 = vshrq_n_u64(tmp_2_u, 63); const int64x2_t sign_2_s = vreinterpretq_s64_u64(sign_2); const int64x2_t convert_2 = vsubq_s64(vshlq_s64(sign_2_s, vnl), sign_2_s); - if(is_sat) + if (is_sat) { tmp_1 = vqshlq_s64(vaddq_s64(tmp_1, convert_1), vn); tmp_2 = vqshlq_s64(vaddq_s64(tmp_2, convert_2), vn); @@ -1029,15 +1018,10 @@ inline int32x4_t mul_S32_S32_S32_n_loop(const int32x4_t &src1, const int32x4_t & template <bool is_sat> inline int32x4x2_t mul_S32_S32_S32_n_k(const int32x4x2_t &src1, const int32x4x2_t &src2, int n) { - const int32x4x2_t result = - { - { - // First 4 elements - mul_S32_S32_S32_n_loop<is_sat>(src1.val[0], src2.val[0], n), - // Second 4 elements - mul_S32_S32_S32_n_loop<is_sat>(src1.val[1], src2.val[1], n) - } - }; + const int32x4x2_t result = {{// First 4 elements + mul_S32_S32_S32_n_loop<is_sat>(src1.val[0], src2.val[0], n), + // Second 4 elements + mul_S32_S32_S32_n_loop<is_sat>(src1.val[1], src2.val[1], n)}}; return result; } @@ -1058,7 +1042,7 @@ void mul_S32_S32_S32(const ITensor *src1, const ITensor *src2, ITensor *out, con const auto window_end_x = static_cast<int>(window.x().end()); const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x(); - if(is_broadcast_across_x) + if (is_broadcast_across_x) { const bool is_broadcast_input_2 = input2_win.x().step() == 0; Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; @@ -1074,60 +1058,56 @@ void mul_S32_S32_S32(const ITensor *src1, const ITensor *src2, ITensor *out, con Iterator dst(out, win); execute_window_loop( - win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast<const int32_t *>(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr()); + win, + [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast<const int32_t *>(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr()); - const int32_t broadcast_value = *reinterpret_cast<const int32_t *>(broadcast_input.ptr()); - const auto broadcast_value_vec = vdupq_n_s32(broadcast_value); + const int32_t broadcast_value = *reinterpret_cast<const int32_t *>(broadcast_input.ptr()); + const auto broadcast_value_vec = vdupq_n_s32(broadcast_value); - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const int32x4x2_t broadcast_v = - { - { - broadcast_value_vec, - broadcast_value_vec, - } - }; - const int32x4x2_t non_broadcast_v = + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { + const int32x4x2_t broadcast_v = {{ + broadcast_value_vec, + broadcast_value_vec, + }}; + const int32x4x2_t non_broadcast_v = {{ vld1q_s32(non_broadcast_input_ptr + x), vld1q_s32(non_broadcast_input_ptr + x + 4), - } - }; - const int32x4x2_t result = mul_S32_S32_S32_n_k<is_sat>(broadcast_v, non_broadcast_v, n); - - vst1q_s32(output_ptr + x, result.val[0]); - vst1q_s32(output_ptr + x + 4, result.val[1]); - } + }}; + const int32x4x2_t result = mul_S32_S32_S32_n_k<is_sat>(broadcast_v, non_broadcast_v, n); - // Compute left-over elements - for(; 
x < window_end_x; ++x) - { - int64_t tmp = static_cast<int64_t>(broadcast_value) * static_cast<int64_t>(*(non_broadcast_input_ptr + x)); - - if(tmp >= 0) - { - tmp >>= n; - } - else - { - uint64_t mask = ((uint64_t)1u << n) - 1; - tmp = (tmp + static_cast<int64_t>(mask)) >> n; + vst1q_s32(output_ptr + x, result.val[0]); + vst1q_s32(output_ptr + x + 4, result.val[1]); } - if(is_sat) + + // Compute left-over elements + for (; x < window_end_x; ++x) { - tmp = utility::clamp<int64_t, int32_t>(tmp); + int64_t tmp = + static_cast<int64_t>(broadcast_value) * static_cast<int64_t>(*(non_broadcast_input_ptr + x)); + + if (tmp >= 0) + { + tmp >>= n; + } + else + { + uint64_t mask = ((uint64_t)1u << n) - 1; + tmp = (tmp + static_cast<int64_t>(mask)) >> n; + } + if (is_sat) + { + tmp = utility::clamp<int64_t, int32_t>(tmp); + } + *(output_ptr + x) = static_cast<int32_t>(tmp); } - *(output_ptr + x) = static_cast<int32_t>(tmp); - } - }, - broadcast_input, non_broadcast_input, dst); + }, + broadcast_input, non_broadcast_input, dst); } else { @@ -1140,58 +1120,53 @@ void mul_S32_S32_S32(const ITensor *src1, const ITensor *src2, ITensor *out, con Iterator dst(out, win); execute_window_loop( - win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast<const int32_t *>(input1.ptr()); - const auto input2_ptr = reinterpret_cast<const int32_t *>(input2.ptr()); - const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + win, + [&](const Coordinates &) { - const int32x4x2_t ta1 = + const auto input1_ptr = reinterpret_cast<const int32_t *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const int32_t *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr()); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int32x4x2_t ta1 = {{ + vld1q_s32(input1_ptr + x), + vld1q_s32(input1_ptr + x + 4), + }}; + const int32x4x2_t ta2 = {{ + vld1q_s32(input2_ptr + x), + vld1q_s32(input2_ptr + x + 4), + }}; + const int32x4x2_t result = mul_S32_S32_S32_n_k<is_sat>(ta1, ta2, n); + + vst1q_s32(output_ptr + x, result.val[0]); + vst1q_s32(output_ptr + x + 4, result.val[1]); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) { + int64_t tmp = static_cast<int64_t>(*(input1_ptr + x)) * static_cast<int64_t>(*(input2_ptr + x)); + + if (tmp >= 0) { - vld1q_s32(input1_ptr + x), - vld1q_s32(input1_ptr + x + 4), + tmp >>= n; } - }; - const int32x4x2_t ta2 = - { + else { - vld1q_s32(input2_ptr + x), - vld1q_s32(input2_ptr + x + 4), + uint64_t mask = ((uint64_t)1u << n) - 1; + tmp = (tmp + static_cast<int64_t>(mask)) >> n; } - }; - const int32x4x2_t result = mul_S32_S32_S32_n_k<is_sat>(ta1, ta2, n); - - vst1q_s32(output_ptr + x, result.val[0]); - vst1q_s32(output_ptr + x + 4, result.val[1]); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - int64_t tmp = static_cast<int64_t>(*(input1_ptr + x)) * static_cast<int64_t>(*(input2_ptr + x)); - - if(tmp >= 0) - { - tmp >>= n; - } - else - { - uint64_t mask = ((uint64_t)1u << n) - 1; - tmp = (tmp + static_cast<int64_t>(mask)) >> n; - } - if(is_sat) - { - tmp = utility::clamp<int64_t, int32_t>(tmp); + if (is_sat) + { + tmp = utility::clamp<int64_t, int32_t>(tmp); + } + *(output_ptr + x) = static_cast<int32_t>(tmp); } - *(output_ptr + x) = 
static_cast<int32_t>(tmp); - } - }, - input1, input2, dst); + }, + input1, input2, dst); } } @@ -1212,7 +1187,7 @@ void mul_F32_F32_F32(const ITensor *src1, const ITensor *src2, ITensor *out, con using ExactTagType = typename wrapper::traits::neon_vector<float, window_step_x>::tag_type; - if(is_broadcast_across_x) + if (is_broadcast_across_x) { const bool is_broadcast_input_2 = input2_win.x().step() == 0; Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; @@ -1228,32 +1203,33 @@ void mul_F32_F32_F32(const ITensor *src1, const ITensor *src2, ITensor *out, con Iterator dst(out, win); execute_window_loop( - win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast<const float *>(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast<float *>(dst.ptr()); + win, + [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast<const float *>(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast<float *>(dst.ptr()); - const float broadcast_value = *reinterpret_cast<const float *>(broadcast_input.ptr()); - const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{}); - const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{}); + const float broadcast_value = *reinterpret_cast<const float *>(broadcast_input.ptr()); + const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{}); + const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{}); - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x); - auto res = wrapper::vmul(wrapper::vmul(broadcast_value_vec, non_broadcast_v), scale_vec); - wrapper::vstore(output_ptr + x, res); - } + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x); + auto res = wrapper::vmul(wrapper::vmul(broadcast_value_vec, non_broadcast_v), scale_vec); + wrapper::vstore(output_ptr + x, res); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto non_broadcast_v = *(non_broadcast_input_ptr + x); - *(output_ptr + x) = broadcast_value * non_broadcast_v * scale; - } - }, - broadcast_input, non_broadcast_input, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto non_broadcast_v = *(non_broadcast_input_ptr + x); + *(output_ptr + x) = broadcast_value * non_broadcast_v * scale; + } + }, + broadcast_input, non_broadcast_input, dst); } else { @@ -1266,32 +1242,33 @@ void mul_F32_F32_F32(const ITensor *src1, const ITensor *src2, ITensor *out, con Iterator dst(out, win); execute_window_loop( - win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast<const float *>(input1.ptr()); - const auto input2_ptr = reinterpret_cast<const float *>(input2.ptr()); - const auto output_ptr = reinterpret_cast<float *>(dst.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + win, + [&](const Coordinates &) { - const auto ta1 = wrapper::vloadq(input1_ptr + x); - const auto ta2 = wrapper::vloadq(input2_ptr + x); - const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{}); - const auto res = wrapper::vmul(wrapper::vmul(ta1, ta2), scale_vec); - 
wrapper::vstore(output_ptr + x, res); - } + const auto input1_ptr = reinterpret_cast<const float *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const float *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<float *>(dst.ptr()); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto ta1 = *(input1_ptr + x); - const auto ta2 = *(input2_ptr + x); - *(output_ptr + x) = ta1 * ta2 * scale; - } - }, - input1, input2, dst); + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto ta1 = wrapper::vloadq(input1_ptr + x); + const auto ta2 = wrapper::vloadq(input2_ptr + x); + const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{}); + const auto res = wrapper::vmul(wrapper::vmul(ta1, ta2), scale_vec); + wrapper::vstore(output_ptr + x, res); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto ta1 = *(input1_ptr + x); + const auto ta2 = *(input2_ptr + x); + *(output_ptr + x) = ta1 * ta2 * scale; + } + }, + input1, input2, dst); } } @@ -1312,7 +1289,7 @@ void c_mul_F32_F32_F32_n(const ITensor *src1, const ITensor *src2, ITensor *out, using ExactTagType = typename wrapper::traits::neon_vector<float, 2>::tag_type; - if(is_broadcast_across_x) + if (is_broadcast_across_x) { const bool is_broadcast_input_2 = input2_win.x().step() == 0; Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; @@ -1328,48 +1305,49 @@ void c_mul_F32_F32_F32_n(const ITensor *src1, const ITensor *src2, ITensor *out, Iterator dst(out, win); execute_window_loop( - win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast<const float *>(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast<float *>(dst.ptr()); + win, + [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast<const float *>(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast<float *>(dst.ptr()); - const float broadcast_value = *reinterpret_cast<const float *>(broadcast_input.ptr()); + const float broadcast_value = *reinterpret_cast<const float *>(broadcast_input.ptr()); - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto a = wrapper::vloadq(non_broadcast_input_ptr + 2 * x); - float32x4_t b = vdupq_n_f32(broadcast_value); + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto a = wrapper::vloadq(non_broadcast_input_ptr + 2 * x); + float32x4_t b = vdupq_n_f32(broadcast_value); - const float32x4_t mask = { -1.0f, 1.0f, -1.0f, 1.0f }; - const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{}); - const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{}); - const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{}); - const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{}); + const float32x4_t mask = {-1.0f, 1.0f, -1.0f, 1.0f}; + const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{}); + const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{}); + const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{}); + const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{}); - const 
float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10); - const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11); + const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10); + const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11); - float32x4_t res = wrapper::vmul(tmp0, b); - b = wrapper::vmul(b, mask); + float32x4_t res = wrapper::vmul(tmp0, b); + b = wrapper::vmul(b, mask); - res = wrapper::vmla(res, tmp1, b); - wrapper::vstore(output_ptr + 2 * x, res); - } + res = wrapper::vmla(res, tmp1, b); + wrapper::vstore(output_ptr + 2 * x, res); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto non_broadcast_value0 = *(non_broadcast_input_ptr + 2 * x); - const auto non_broadcast_value1 = *(non_broadcast_input_ptr + 2 * x + 1); - auto res1 = broadcast_value * (non_broadcast_value0 - non_broadcast_value1); - auto res2 = broadcast_value * (non_broadcast_value1 + non_broadcast_value0); - *(output_ptr + 2 * x) = res1; - *(output_ptr + 2 * x + 1) = res2; - } - }, - broadcast_input, non_broadcast_input, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto non_broadcast_value0 = *(non_broadcast_input_ptr + 2 * x); + const auto non_broadcast_value1 = *(non_broadcast_input_ptr + 2 * x + 1); + auto res1 = broadcast_value * (non_broadcast_value0 - non_broadcast_value1); + auto res2 = broadcast_value * (non_broadcast_value1 + non_broadcast_value0); + *(output_ptr + 2 * x) = res1; + *(output_ptr + 2 * x + 1) = res2; + } + }, + broadcast_input, non_broadcast_input, dst); } else { @@ -1382,51 +1360,52 @@ void c_mul_F32_F32_F32_n(const ITensor *src1, const ITensor *src2, ITensor *out, Iterator dst(out, win); execute_window_loop( - win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast<const float *>(input1.ptr()); - const auto input2_ptr = reinterpret_cast<const float *>(input2.ptr()); - const auto output_ptr = reinterpret_cast<float *>(dst.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + win, + [&](const Coordinates &) { - const float32x4_t a = wrapper::vloadq(input1_ptr + 2 * x); - float32x4_t b = wrapper::vloadq(input2_ptr + 2 * x); + const auto input1_ptr = reinterpret_cast<const float *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const float *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<float *>(dst.ptr()); - const float32x4_t mask = { -1.0f, 1.0f, -1.0f, 1.0f }; - const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{}); - const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{}); - const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{}); - const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{}); + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const float32x4_t a = wrapper::vloadq(input1_ptr + 2 * x); + float32x4_t b = wrapper::vloadq(input2_ptr + 2 * x); - const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10); - const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11); + const float32x4_t mask = {-1.0f, 1.0f, -1.0f, 1.0f}; + const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{}); + const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{}); + const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{}); + 
const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{}); - float32x4_t res = wrapper::vmul(tmp0, b); + const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10); + const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11); - b = wrapper::vrev64(b); - b = wrapper::vmul(b, mask); + float32x4_t res = wrapper::vmul(tmp0, b); - res = wrapper::vmla(res, tmp1, b); - wrapper::vstore(output_ptr + 2 * x, res); - } + b = wrapper::vrev64(b); + b = wrapper::vmul(b, mask); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto a0 = *(input1_ptr + 2 * x); - const auto a1 = *(input1_ptr + 2 * x + 1); - const auto b0 = *(input2_ptr + 2 * x); - const auto b1 = *(input2_ptr + 2 * x + 1); - auto res1 = a0 * b0 - a1 * b1; - auto res2 = a0 * b1 + a1 * b0; - *(output_ptr + 2 * x) = res1; - *(output_ptr + 2 * x + 1) = res2; - } - }, - input1, input2, dst); + res = wrapper::vmla(res, tmp1, b); + wrapper::vstore(output_ptr + 2 * x, res); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto a0 = *(input1_ptr + 2 * x); + const auto a1 = *(input1_ptr + 2 * x + 1); + const auto b0 = *(input2_ptr + 2 * x); + const auto b1 = *(input2_ptr + 2 * x + 1); + auto res1 = a0 * b0 - a1 * b1; + auto res2 = a0 * b1 + a1 * b0; + *(output_ptr + 2 * x) = res1; + *(output_ptr + 2 * x + 1) = res2; + } + }, + input1, input2, dst); } } @@ -1444,7 +1423,7 @@ void mul_F16_F16_F16(const ITensor *src1, const ITensor *src2, ITensor *out, con const auto window_start_x = static_cast<int>(window.x().start()); const auto window_end_x = static_cast<int>(window.x().end()); const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x(); - if(is_broadcast_across_x) + if (is_broadcast_across_x) { const bool is_broadcast_input_2 = input2_win.x().step() == 0; Window broadcast_win = is_broadcast_input_2 ? 
input2_win : input1_win; @@ -1457,48 +1436,40 @@ void mul_F16_F16_F16(const ITensor *src1, const ITensor *src2, ITensor *out, con Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); Iterator dst(out, win); execute_window_loop( - win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast<const float16_t *>(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast<float16_t *>(dst.ptr()); - const auto broadcast_value = *reinterpret_cast<const float16_t *>(broadcast_input.ptr()); - const float16x8x2_t broadcast_value_vec = + win, + [&](const Coordinates &) { - { + const auto non_broadcast_input_ptr = reinterpret_cast<const float16_t *>(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast<float16_t *>(dst.ptr()); + const auto broadcast_value = *reinterpret_cast<const float16_t *>(broadcast_input.ptr()); + const float16x8x2_t broadcast_value_vec = {{ vdupq_n_f16(broadcast_value), vdupq_n_f16(broadcast_value), - } - }; - const auto scale_vec = vdupq_n_f16(scale); - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const float16x8x2_t non_broadcast_v = + }}; + const auto scale_vec = vdupq_n_f16(scale); + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { + const float16x8x2_t non_broadcast_v = {{ vld1q_f16(non_broadcast_input_ptr + x), vld1q_f16(non_broadcast_input_ptr + x + 8), - } - }; - const float16x8x2_t result = + }}; + const float16x8x2_t result = {{ + vmulq_f16(vmulq_f16(broadcast_value_vec.val[0], non_broadcast_v.val[0]), scale_vec), + vmulq_f16(vmulq_f16(broadcast_value_vec.val[1], non_broadcast_v.val[1]), scale_vec), + }}; + vst1q_f16(output_ptr + x, result.val[0]); + vst1q_f16(output_ptr + x + 8, result.val[1]); + } + // Compute left-over elements + for (; x < window_end_x; ++x) { - { - vmulq_f16(vmulq_f16(broadcast_value_vec.val[0], non_broadcast_v.val[0]), scale_vec), - vmulq_f16(vmulq_f16(broadcast_value_vec.val[1], non_broadcast_v.val[1]), scale_vec), - } - }; - vst1q_f16(output_ptr + x, result.val[0]); - vst1q_f16(output_ptr + x + 8, result.val[1]); - } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto non_broadcast_v = *(non_broadcast_input_ptr + x); - *(output_ptr + x) = broadcast_value * non_broadcast_v * scale; - } - }, - broadcast_input, non_broadcast_input, dst); + const auto non_broadcast_v = *(non_broadcast_input_ptr + x); + *(output_ptr + x) = broadcast_value * non_broadcast_v * scale; + } + }, + broadcast_input, non_broadcast_input, dst); } else { @@ -1508,49 +1479,41 @@ void mul_F16_F16_F16(const ITensor *src1, const ITensor *src2, ITensor *out, con Iterator input2(src2, input2_win); Iterator dst(out, win); execute_window_loop( - win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast<const float16_t *>(input1.ptr()); - const auto input2_ptr = reinterpret_cast<const float16_t *>(input2.ptr()); - const auto output_ptr = reinterpret_cast<float16_t *>(dst.ptr()); - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const float16x8x2_t ta1 = + win, + [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast<const float16_t *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const float16_t *>(input2.ptr()); + const auto output_ptr = 
reinterpret_cast<float16_t *>(dst.ptr()); + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_f16(input1_ptr + x), - vld1q_f16(input1_ptr + x + 8), - } - }; - const float16x8x2_t ta2 = - { - { - vld1q_f16(input2_ptr + x), - vld1q_f16(input2_ptr + x + 8), - } - }; - const float16x8_t scale_vec = vdupq_n_f16(scale); - const float16x8x2_t result = + const float16x8x2_t ta1 = {{ + vld1q_f16(input1_ptr + x), + vld1q_f16(input1_ptr + x + 8), + }}; + const float16x8x2_t ta2 = {{ + vld1q_f16(input2_ptr + x), + vld1q_f16(input2_ptr + x + 8), + }}; + const float16x8_t scale_vec = vdupq_n_f16(scale); + const float16x8x2_t result = {{ + vmulq_f16(vmulq_f16(ta1.val[0], ta2.val[0]), scale_vec), + vmulq_f16(vmulq_f16(ta1.val[1], ta2.val[1]), scale_vec), + }}; + vst1q_f16(output_ptr + x, result.val[0]); + vst1q_f16(output_ptr + x + 8, result.val[1]); + } + // Compute left-over elements + for (; x < window_end_x; ++x) { - { - vmulq_f16(vmulq_f16(ta1.val[0], ta2.val[0]), scale_vec), - vmulq_f16(vmulq_f16(ta1.val[1], ta2.val[1]), scale_vec), - } - }; - vst1q_f16(output_ptr + x, result.val[0]); - vst1q_f16(output_ptr + x + 8, result.val[1]); - } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto ta1 = *(input1_ptr + x); - const auto ta2 = *(input2_ptr + x); - *(output_ptr + x) = ta1 * ta2 * scale; - } - }, - input1, input2, dst); + const auto ta1 = *(input1_ptr + x); + const auto ta2 = *(input2_ptr + x); + *(output_ptr + x) = ta1 * ta2 * scale; + } + }, + input1, input2, dst); } } #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ @@ -1577,81 +1540,82 @@ void mul_U8_U8_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const const auto window_end_x = static_cast<int>(window.x().end()); execute_window_loop( - win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr()); - const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr()); - const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + win, + [&](const Coordinates &) { - const uint8x16_t bv = wrapper::vloadq(input2_ptr + x); - const uint8x16_t av = wrapper::vloadq(input1_ptr + x); - - uint16x8_t tmp_low = vmovl_u8(vget_low_u8(av)); - uint16x8_t tmp_high = vmovl_u8(vget_high_u8(av)); - tmp_low = vmulq_u16(tmp_low, vmovl_u8(vget_low_u8(bv))); - tmp_high = vmulq_u16(tmp_high, vmovl_u8(vget_high_u8(bv))); + const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr()); - if(is_scale255) - { - tmp_low = scale255_U16_U16(tmp_low); - tmp_high = scale255_U16_U16(tmp_high); - } - else + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - const int16x8_t vn = vdupq_n_s16(-n); + const uint8x16_t bv = wrapper::vloadq(input2_ptr + x); + const uint8x16_t av = wrapper::vloadq(input1_ptr + x); - if(is_sat) + uint16x8_t tmp_low = vmovl_u8(vget_low_u8(av)); + uint16x8_t tmp_high = vmovl_u8(vget_high_u8(av)); + tmp_low = vmulq_u16(tmp_low, vmovl_u8(vget_low_u8(bv))); + tmp_high = vmulq_u16(tmp_high, vmovl_u8(vget_high_u8(bv))); + + if (is_scale255) { - tmp_low = vqshlq_u16(tmp_low, vn); - 
tmp_high = vqshlq_u16(tmp_high, vn); + tmp_low = scale255_U16_U16(tmp_low); + tmp_high = scale255_U16_U16(tmp_high); } else { - tmp_low = vshlq_u16(tmp_low, vn); - tmp_high = vshlq_u16(tmp_high, vn); + const int16x8_t vn = vdupq_n_s16(-n); + + if (is_sat) + { + tmp_low = vqshlq_u16(tmp_low, vn); + tmp_high = vqshlq_u16(tmp_high, vn); + } + else + { + tmp_low = vshlq_u16(tmp_low, vn); + tmp_high = vshlq_u16(tmp_high, vn); + } } - } - if(is_sat) - { - static const uint16x8_t max = vdupq_n_u16(SHRT_MAX); + if (is_sat) + { + static const uint16x8_t max = vdupq_n_u16(SHRT_MAX); - tmp_low = vminq_u16(tmp_low, max); - tmp_high = vminq_u16(tmp_high, max); + tmp_low = vminq_u16(tmp_low, max); + tmp_high = vminq_u16(tmp_high, max); + } + + vst1q_s16(output_ptr + x, vreinterpretq_s16_u16(tmp_low)); + vst1q_s16(output_ptr + x + 8, vreinterpretq_s16_u16(tmp_high)); } - vst1q_s16(output_ptr + x, vreinterpretq_s16_u16(tmp_low)); - vst1q_s16(output_ptr + x + 8, vreinterpretq_s16_u16(tmp_high)); - } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x)); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x)); + if (is_scale255) + { + float tmp_f = static_cast<float>(tmp) * scale255_constant; + tmp = static_cast<int32_t>(tmp_f + 0.5f); + } + else + { + tmp >>= n; + } - if(is_scale255) - { - float tmp_f = static_cast<float>(tmp) * scale255_constant; - tmp = static_cast<int32_t>(tmp_f + 0.5f); - } - else - { - tmp >>= n; - } + if (is_sat) + { + tmp = (tmp > SHRT_MAX) ? SHRT_MAX : tmp; + } - if(is_sat) - { - tmp = (tmp > SHRT_MAX) ? SHRT_MAX : tmp; + *(output_ptr + x) = static_cast<int16_t>(tmp); } - - *(output_ptr + x) = static_cast<int16_t>(tmp); - } - }, - input1, input2, dst); + }, + input1, input2, dst); } template <bool is_scale255, bool is_sat> @@ -1676,75 +1640,65 @@ void mul_S16_U8_S16(const ITensor *src1, const ITensor *src2, ITensor *out, cons const auto window_end_x = static_cast<int>(window.x().end()); execute_window_loop( - win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr()); - const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr()); - const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + win, + [&](const Coordinates &) { - const int16x8x2_t ta1 = - { - { - vld1q_s16(input1_ptr + x), - vld1q_s16(input1_ptr + x + 8), - } - }; - const uint8x8x2_t ta2u = + const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr()); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { + const int16x8x2_t ta1 = {{ + vld1q_s16(input1_ptr + x), + vld1q_s16(input1_ptr + x + 8), + }}; + const uint8x8x2_t ta2u = {{ vld1_u8(input2_ptr + x), vld1_u8(input2_ptr + x + 8), - } - }; - const int16x8x2_t ta2 = - { - { - vreinterpretq_s16_u16(vmovl_u8(ta2u.val[0])), - vreinterpretq_s16_u16(vmovl_u8(ta2u.val[1])) - } - }; - - const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n); + }}; + const int16x8x2_t ta2 = { + 
{vreinterpretq_s16_u16(vmovl_u8(ta2u.val[0])), vreinterpretq_s16_u16(vmovl_u8(ta2u.val[1]))}}; - vst1q_s16(output_ptr + x, result.val[0]); - vst1q_s16(output_ptr + x + 8, result.val[1]); - } + const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x)); + vst1q_s16(output_ptr + x, result.val[0]); + vst1q_s16(output_ptr + x + 8, result.val[1]); + } - if(is_scale255) + // Compute left-over elements + for (; x < window_end_x; ++x) { - float tmp_f = static_cast<float>(tmp) * scale255_constant; + int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x)); - tmp = static_cast<int32_t>(tmp_f + 0.5f); - } - else - { - if(tmp >= 0) + if (is_scale255) { - tmp >>= n; + float tmp_f = static_cast<float>(tmp) * scale255_constant; + + tmp = static_cast<int32_t>(tmp_f + 0.5f); } else { - uint32_t mask = (1u << n) - 1; - tmp = (tmp + static_cast<int32_t>(mask)) >> n; + if (tmp >= 0) + { + tmp >>= n; + } + else + { + uint32_t mask = (1u << n) - 1; + tmp = (tmp + static_cast<int32_t>(mask)) >> n; + } + } + if (is_sat) + { + tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp); } + *(output_ptr + x) = static_cast<int16_t>(tmp); } - if(is_sat) - { - tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp); - } - *(output_ptr + x) = static_cast<int16_t>(tmp); - } - }, - input1, input2, dst); + }, + input1, input2, dst); } template <bool is_scale255, bool is_sat> @@ -1755,7 +1709,12 @@ void mul_U8_S16_S16(const ITensor *src1, const ITensor *src2, ITensor *out, cons } } // namespace -void CpuMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy) +void CpuMulKernel::configure(ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy) { ARM_COMPUTE_UNUSED(rounding_policy); ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); @@ -1775,7 +1734,7 @@ void CpuMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo * bool is_scale_255 = false; // Check and validate scaling factor - if(std::abs(scale - scale255_constant) < 0.00001f) + if (std::abs(scale - scale255_constant) < 0.00001f) { is_scale_255 = true; } @@ -1795,12 +1754,12 @@ void CpuMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo * const DataType dt_output = dst->data_type(); const bool is_sat = (overflow_policy == ConvertPolicy::SATURATE); - switch(dt_input1) + switch (dt_input1) { case DataType::QASYMM8: - if(dt_input2 == DataType::QASYMM8 && dt_output == DataType::QASYMM8) + if (dt_input2 == DataType::QASYMM8 && dt_output == DataType::QASYMM8) { - if(mul_q8_neon_fixedpoint_possible(src1, src2, dst, scale)) + if (mul_q8_neon_fixedpoint_possible(src1, src2, dst, scale)) { _func_quantized = &mul_q8_neon_fixedpoint<uint8_t>; } @@ -1811,9 +1770,9 @@ void CpuMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo * } break; case DataType::QASYMM8_SIGNED: - if(dt_input2 == DataType::QASYMM8_SIGNED) + if (dt_input2 == DataType::QASYMM8_SIGNED) { - if(mul_q8_neon_fixedpoint_possible(src1, src2, dst, scale)) + if (mul_q8_neon_fixedpoint_possible(src1, src2, dst, scale)) { _func_quantized = &mul_q8_neon_fixedpoint<int8_t>; } @@ -1824,19 +1783,19 @@ void CpuMulKernel::configure(ITensorInfo 
*src1, ITensorInfo *src2, ITensorInfo * } break; case DataType::QSYMM16: - if(dt_input2 == DataType::QSYMM16 && dt_output == DataType::QSYMM16) + if (dt_input2 == DataType::QSYMM16 && dt_output == DataType::QSYMM16) { _func_quantized = &mul_saturate_QSYMM16_QSYMM16_QSYMM16; } - else if(dt_input2 == DataType::QSYMM16 && dt_output == DataType::S32) + else if (dt_input2 == DataType::QSYMM16 && dt_output == DataType::S32) { _func_int = &mul_QSYMM16_QSYMM16_S32; } break; case DataType::S16: - if(DataType::U8 == dt_input2 && DataType::S16 == dt_output) + if (DataType::U8 == dt_input2 && DataType::S16 == dt_output) { - if(is_scale_255) + if (is_scale_255) { _func_int = is_sat ? &mul_S16_U8_S16<true, true> : &mul_S16_U8_S16<true, false>; } @@ -1845,9 +1804,9 @@ void CpuMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo * _func_int = is_sat ? &mul_S16_U8_S16<false, true> : &mul_S16_U8_S16<false, false>; } } - if(DataType::S16 == dt_input2 && DataType::S16 == dt_output) + if (DataType::S16 == dt_input2 && DataType::S16 == dt_output) { - if(is_scale_255) + if (is_scale_255) { _func_int = is_sat ? &mul_S16_S16_S16<true, true> : &mul_S16_S16_S16<true, false>; } @@ -1858,15 +1817,15 @@ void CpuMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo * } break; case DataType::S32: - if(DataType::S32 == dt_input2 && DataType::S32 == dt_output) + if (DataType::S32 == dt_input2 && DataType::S32 == dt_output) { _func_int = is_sat ? &mul_S32_S32_S32<true> : &mul_S32_S32_S32<false>; } break; case DataType::U8: - if(DataType::U8 == dt_input2 && DataType::U8 == dt_output) + if (DataType::U8 == dt_input2 && DataType::U8 == dt_output) { - if(is_scale_255) + if (is_scale_255) { _func_int = is_sat ? &mul_U8_U8_U8<true, true> : &mul_U8_U8_U8<true, false>; } @@ -1875,9 +1834,9 @@ void CpuMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo * _func_int = is_sat ? &mul_U8_U8_U8<false, true> : &mul_U8_U8_U8<false, false>; } } - else if(DataType::U8 == dt_input2 && DataType::S16 == dt_output) + else if (DataType::U8 == dt_input2 && DataType::S16 == dt_output) { - if(is_scale_255) + if (is_scale_255) { _func_int = is_sat ? &mul_U8_U8_S16<true, true> : &mul_U8_U8_S16<true, false>; } @@ -1886,9 +1845,9 @@ void CpuMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo * _func_int = is_sat ? &mul_U8_U8_S16<false, true> : &mul_U8_U8_S16<false, false>; } } - else if(DataType::S16 == dt_input2 && DataType::S16 == dt_output) + else if (DataType::S16 == dt_input2 && DataType::S16 == dt_output) { - if(is_scale_255) + if (is_scale_255) { _func_int = is_sat ? &mul_U8_S16_S16<true, true> : &mul_U8_S16_S16<true, false>; } @@ -1922,20 +1881,20 @@ size_t CpuMulKernel::get_mws(const CPUInfo &platform, size_t thread_count) const ARM_COMPUTE_UNUSED(thread_count); #if defined(ENABLE_FP32_KERNELS) - if(this->_func_float == &mul_F32_F32_F32) + if (this->_func_float == &mul_F32_F32_F32) { size_t mws = ICPPKernel::default_mws; - if(platform.get_cpu_model() == CPUModel::N1) + if (platform.get_cpu_model() == CPUModel::N1) { mws = default_mws_N1_fp32_neon; } - else if(platform.get_cpu_model() == CPUModel::V1) + else if (platform.get_cpu_model() == CPUModel::V1) { mws = default_mws_V1_fp32_neon; } else { - if(_split_dimension == Window::DimX) + if (_split_dimension == Window::DimX) { // Don't split the work load too small if the tensor has been reinterpreted as 1D. // This number is loosely chosen as threading overhead in each platform varies wildly. 
@@ -1945,7 +1904,7 @@ size_t CpuMulKernel::get_mws(const CPUInfo &platform, size_t thread_count) const } // tensor is 1D or was re-interpreted as 1D - if(this->window().shape().num_dimensions() == 1) + if (this->window().shape().num_dimensions() == 1) { return mws; } @@ -1958,10 +1917,10 @@ size_t CpuMulKernel::get_mws(const CPUInfo &platform, size_t thread_count) const return std::max(static_cast<size_t>(1), mws); } } -#else /* ENABLE_FP32_KERNELS */ +#else /* ENABLE_FP32_KERNELS */ ARM_COMPUTE_UNUSED(platform); #endif /* ENABLE_FP32_KERNELS */ - if(_split_dimension == Window::DimX) + if (_split_dimension == Window::DimX) { // Don't split the work load too small if the tensor has been reinterpreted as 1D. // This number is loosely chosen as threading overhead in each platform varies wildly. @@ -1970,8 +1929,12 @@ size_t CpuMulKernel::get_mws(const CPUInfo &platform, size_t thread_count) const return default_mws; } -Status CpuMulKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, - RoundingPolicy rounding_policy) +Status CpuMulKernel::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy) { ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy)); @@ -1989,11 +1952,11 @@ void CpuMulKernel::run_op(ITensorPack &tensors, const Window &window, const Thre auto src2 = tensors.get_const_tensor(TensorType::ACL_SRC_1); auto dst = tensors.get_tensor(TensorType::ACL_DST); - if(_func_quantized != nullptr) + if (_func_quantized != nullptr) { (*_func_quantized)(src1, src2, dst, window, _scale); } - else if(_func_int != nullptr) + else if (_func_int != nullptr) { (*_func_int)(src1, src2, dst, window, _scale_exponent); } @@ -2021,10 +1984,11 @@ Status validate_arguments_complex(const ITensorInfo *src1, const ITensorInfo *sr ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); // Validate in case of configured dst - if(dst->total_size() > 0) + if (dst->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 2, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), + "Wrong shape for dst"); } return Status{}; diff --git a/src/cpu/kernels/CpuMulKernel.h b/src/cpu/kernels/CpuMulKernel.h index 9e4a37110b..7eaf287507 100644 --- a/src/cpu/kernels/CpuMulKernel.h +++ b/src/cpu/kernels/CpuMulKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_MUL_KERNEL_H #include "arm_compute/core/Rounding.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" @@ -68,17 +69,27 @@ public: * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if any of the inputs is of quantized datatype * @param[in] rounding_policy Rounding policy. 
*/ - void configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy); + void configure(ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref CpuMulKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy); + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy); // Inherited methods overridden - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; /** Return minimum workload size of the relevant kernel @@ -108,7 +119,8 @@ private: * @param[in] window Region on which to execute the kernel * @param[in] scale Integer scale factor. */ - using MulFunctionInt = void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, int scale); + using MulFunctionInt = + void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, int scale); /** Common signature for all the specialised multiplication functions with float scaling factor * * @param[in] src1 Src1 tensor object. @@ -117,7 +129,8 @@ private: * @param[in] window Region on which to execute the kernel * @param[in] scale Float scale factor. */ - using MulFunctionFloat = void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, float scale); + using MulFunctionFloat = + void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, float scale); /** Common signature for all the specialised QASYMM8 multiplication functions with float scaling factor * * @param[in] src1 Src1 tensor object. @@ -127,14 +140,15 @@ private: * @param[in] scale Float scale factor. * */ - using MulFunctionQuantized = void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, float scale); + using MulFunctionQuantized = + void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, float scale); - MulFunctionFloat *_func_float{ nullptr }; - MulFunctionInt *_func_int{ nullptr }; - MulFunctionQuantized *_func_quantized{ nullptr }; - float _scale{ 0 }; - int _scale_exponent{ 0 }; - size_t _split_dimension{ Window::DimY }; + MulFunctionFloat *_func_float{nullptr}; + MulFunctionInt *_func_int{nullptr}; + MulFunctionQuantized *_func_quantized{nullptr}; + float _scale{0}; + int _scale_exponent{0}; + size_t _split_dimension{Window::DimY}; }; /** Interface for the complex pixelwise multiplication kernel. 
*/ @@ -159,7 +173,7 @@ public: static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; }; } // namespace kernels diff --git a/src/cpu/kernels/CpuPermuteKernel.cpp b/src/cpu/kernels/CpuPermuteKernel.cpp index d65e011032..b444a25ff7 100644 --- a/src/cpu/kernels/CpuPermuteKernel.cpp +++ b/src/cpu/kernels/CpuPermuteKernel.cpp @@ -28,8 +28,9 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -48,56 +49,31 @@ namespace { inline bool is_permutation_supported(const PermutationVector &v) { - static const std::array<PermutationVector, 2> permutations2 = - { - { - PermutationVector(0U, 1U), - PermutationVector(1U, 0U), - } - }; - static const std::array<PermutationVector, 6> permutations3 = - { - { - PermutationVector(2U, 0U, 1U), - PermutationVector(1U, 2U, 0U), - PermutationVector(0U, 1U, 2U), - PermutationVector(0U, 2U, 1U), - PermutationVector(1U, 0U, 2U), - PermutationVector(2U, 1U, 0U), - } - }; - static const std::array<PermutationVector, 24> permutations4 = - { - { - PermutationVector(0U, 1U, 2U, 3U), - PermutationVector(1U, 0U, 2U, 3U), - PermutationVector(2U, 0U, 1U, 3U), - PermutationVector(0U, 2U, 1U, 3U), - PermutationVector(1U, 2U, 0U, 3U), - PermutationVector(2U, 1U, 0U, 3U), - PermutationVector(2U, 1U, 3U, 0U), - PermutationVector(1U, 2U, 3U, 0U), - PermutationVector(3U, 2U, 1U, 0U), - PermutationVector(2U, 3U, 1U, 0U), - PermutationVector(1U, 3U, 2U, 0U), - PermutationVector(3U, 1U, 2U, 0U), - PermutationVector(3U, 0U, 2U, 1U), - PermutationVector(0U, 3U, 2U, 1U), - PermutationVector(2U, 3U, 0U, 1U), - PermutationVector(3U, 2U, 0U, 1U), - PermutationVector(0U, 2U, 3U, 1U), - PermutationVector(2U, 0U, 3U, 1U), - PermutationVector(1U, 0U, 3U, 2U), - PermutationVector(0U, 1U, 3U, 2U), - PermutationVector(3U, 1U, 0U, 2U), - PermutationVector(1U, 3U, 0U, 2U), - PermutationVector(0U, 3U, 1U, 2U), - PermutationVector(3U, 0U, 1U, 2U) - } - }; + static const std::array<PermutationVector, 2> permutations2 = {{ + PermutationVector(0U, 1U), + PermutationVector(1U, 0U), + }}; + static const std::array<PermutationVector, 6> permutations3 = {{ + PermutationVector(2U, 0U, 1U), + PermutationVector(1U, 2U, 0U), + PermutationVector(0U, 1U, 2U), + PermutationVector(0U, 2U, 1U), + PermutationVector(1U, 0U, 2U), + PermutationVector(2U, 1U, 0U), + }}; + static const std::array<PermutationVector, 24> permutations4 = { + {PermutationVector(0U, 1U, 2U, 3U), PermutationVector(1U, 0U, 2U, 3U), PermutationVector(2U, 0U, 1U, 3U), + PermutationVector(0U, 2U, 1U, 3U), PermutationVector(1U, 2U, 0U, 3U), PermutationVector(2U, 1U, 0U, 3U), + PermutationVector(2U, 1U, 3U, 0U), PermutationVector(1U, 2U, 3U, 0U), PermutationVector(3U, 2U, 1U, 0U), + PermutationVector(2U, 3U, 1U, 0U), PermutationVector(1U, 3U, 2U, 0U), PermutationVector(3U, 1U, 2U, 0U), + PermutationVector(3U, 0U, 2U, 1U), PermutationVector(0U, 3U, 2U, 1U), PermutationVector(2U, 3U, 0U, 1U), + PermutationVector(3U, 2U, 0U, 1U), PermutationVector(0U, 2U, 3U, 1U), 
PermutationVector(2U, 0U, 3U, 1U), + PermutationVector(1U, 0U, 3U, 2U), PermutationVector(0U, 1U, 3U, 2U), PermutationVector(3U, 1U, 0U, 2U), + PermutationVector(1U, 3U, 0U, 2U), PermutationVector(0U, 3U, 1U, 2U), PermutationVector(3U, 0U, 1U, 2U)}}; - return (permutations2.end() != std::find(permutations2.begin(), permutations2.end(), v)) || (permutations3.end() != std::find(permutations3.begin(), permutations3.end(), v)) - || (permutations4.end() != std::find(permutations4.begin(), permutations4.end(), v)); + return (permutations2.end() != std::find(permutations2.begin(), permutations2.end(), v)) || + (permutations3.end() != std::find(permutations3.begin(), permutations3.end(), v)) || + (permutations4.end() != std::find(permutations4.begin(), permutations4.end(), v)); } Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm) @@ -108,7 +84,7 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const const TensorShape dst_shape = misc::shape_calculator::compute_permutation_output_shape(*src, perm); // Validate configured destination - if(dst->total_size() != 0) + if (dst->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); @@ -128,18 +104,22 @@ void run_permute(const Window &window, const ITensor *src, const ITensor *dst, c // we only support these two configs in src/core/NEON/kernels/convolution/common/shims.hpp, for all others // we have to fall back to C++ - if((src_layout == DataLayout::NCHW && perm == PermutationVector{ 2U, 0U, 1U }) || (src_layout == DataLayout::NHWC && perm == PermutationVector{ 1U, 2U, 0U })) + if ((src_layout == DataLayout::NCHW && perm == PermutationVector{2U, 0U, 1U}) || + (src_layout == DataLayout::NHWC && perm == PermutationVector{1U, 2U, 0U})) { - window_src.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), window.x().end() - window.x().start())); - window_src.set(Window::DimY, Window::Dimension(window.y().start(), window.y().end(), window.y().end() - window.y().start())); - window_src.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), window.z().end() - window.z().start())); + window_src.set(Window::DimX, + Window::Dimension(window.x().start(), window.x().end(), window.x().end() - window.x().start())); + window_src.set(Window::DimY, + Window::Dimension(window.y().start(), window.y().end(), window.y().end() - window.y().start())); + window_src.set(Window::DimZ, + Window::Dimension(window.z().start(), window.z().end(), window.z().end() - window.z().start())); window_src.set(3, Window::Dimension(window[3].start(), window[3].end(), window[3].end() - window[3].start())); } // Destination window Window window_dst(window); const Window::Dimension zero_window = Window::Dimension(0, 0, 0); - for(size_t d = 0; d <= dst->info()->num_dimensions(); ++d) + for (size_t d = 0; d <= dst->info()->num_dimensions(); ++d) { window_dst.set(d, zero_window); } @@ -157,7 +137,7 @@ void run_permute(const Window &window, const ITensor *src, const ITensor *dst, c int n_channels = 0; int n_batches = 0; - switch(src_layout) + switch (src_layout) { case DataLayout::NCHW: { @@ -189,38 +169,42 @@ void run_permute(const Window &window, const ITensor *src, const ITensor *dst, c } // CHW -> HWC - if(src_layout == DataLayout::NCHW && perm == PermutationVector{ 2U, 0U, 1U }) + if (src_layout == DataLayout::NCHW && perm == PermutationVector{2U, 0U, 1U}) { 
const int out_channel_stride = dst->info()->strides_in_bytes().x() / sizeof(T); const int out_col_stride = dst->info()->strides_in_bytes().y() / sizeof(T); const int out_row_stride = dst->info()->strides_in_bytes().z() / sizeof(T); const int out_batch_stride = dst->info()->strides_in_bytes()[3] / sizeof(T); - execute_window_loop(window_src, [&](const Coordinates & id) - { - const int idx = id[0] * out_col_stride + id[1] * out_row_stride + id[2] * out_channel_stride; - reorder::nchw_to_nhwc(reinterpret_cast<const T *>(src_it.ptr()), reinterpret_cast<T *>(dst_it.ptr()) + idx, - n_batches, n_channels, n_rows, n_cols, - in_batch_stride, in_channel_stride, in_row_stride, - out_batch_stride, out_row_stride, out_col_stride); - }, - src_it, dst_it); + execute_window_loop( + window_src, + [&](const Coordinates &id) + { + const int idx = id[0] * out_col_stride + id[1] * out_row_stride + id[2] * out_channel_stride; + reorder::nchw_to_nhwc(reinterpret_cast<const T *>(src_it.ptr()), + reinterpret_cast<T *>(dst_it.ptr()) + idx, n_batches, n_channels, n_rows, n_cols, + in_batch_stride, in_channel_stride, in_row_stride, out_batch_stride, + out_row_stride, out_col_stride); + }, + src_it, dst_it); } // HWC -> CHW - else if(src_layout == DataLayout::NHWC && perm == PermutationVector{ 1U, 2U, 0U }) + else if (src_layout == DataLayout::NHWC && perm == PermutationVector{1U, 2U, 0U}) { const int out_col_stride = dst->info()->strides_in_bytes().x() / sizeof(T); const int out_row_stride = dst->info()->strides_in_bytes().y() / sizeof(T); const int out_channel_stride = dst->info()->strides_in_bytes().z() / sizeof(T); const int out_batch_stride = dst->info()->strides_in_bytes()[3] / sizeof(T); - execute_window_loop(window_src, [&](const Coordinates & id) - { - const int idx = id[0] * out_channel_stride + id[1] * out_col_stride + id[2] * out_row_stride; - reorder::nhwc_to_nchw(reinterpret_cast<const T *>(src_it.ptr()), reinterpret_cast<T *>(dst_it.ptr()) + idx, - n_batches, n_rows, n_cols, n_channels, - in_batch_stride, in_row_stride, in_col_stride, - out_batch_stride, out_channel_stride, out_row_stride); - }, - src_it, dst_it); + execute_window_loop( + window_src, + [&](const Coordinates &id) + { + const int idx = id[0] * out_channel_stride + id[1] * out_col_stride + id[2] * out_row_stride; + reorder::nhwc_to_nchw(reinterpret_cast<const T *>(src_it.ptr()), + reinterpret_cast<T *>(dst_it.ptr()) + idx, n_batches, n_rows, n_cols, n_channels, + in_batch_stride, in_row_stride, in_col_stride, out_batch_stride, + out_channel_stride, out_row_stride); + }, + src_it, dst_it); } else { @@ -230,12 +214,15 @@ void run_permute(const Window &window, const ITensor *src, const ITensor *dst, c Strides perm_strides = strides; permute_strides(perm_strides, perm); const int perm_stride_3 = src->info()->num_dimensions() >= 4 ? 
perm_strides[3] : 0; - execute_window_loop(window, [&](const Coordinates & id) - { - const int idx = id[0] * perm_strides[0] + id[1] * perm_strides[1] + id[2] * perm_strides[2] + id[3] * perm_stride_3; - *(reinterpret_cast<T *>(dst_it.ptr() + idx)) = *(reinterpret_cast<const T *>(src_it.ptr())); - }, - src_it, dst_it); + execute_window_loop( + window, + [&](const Coordinates &id) + { + const int idx = + id[0] * perm_strides[0] + id[1] * perm_strides[1] + id[2] * perm_strides[2] + id[3] * perm_stride_3; + *(reinterpret_cast<T *>(dst_it.ptr() + idx)) = *(reinterpret_cast<const T *>(src_it.ptr())); + }, + src_it, dst_it); } } } // namespace @@ -275,7 +262,7 @@ void CpuPermuteKernel::run_op(ITensorPack &tensors, const Window &window, const const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); auto dst = tensors.get_tensor(TensorType::ACL_DST); - switch(src->info()->element_size()) + switch (src->info()->element_size()) { case 1: run_permute<uint8_t>(window, src, dst, _perm); diff --git a/src/cpu/kernels/CpuPermuteKernel.h b/src/cpu/kernels/CpuPermuteKernel.h index 9e1b93318e..0cb2faf223 100644 --- a/src/cpu/kernels/CpuPermuteKernel.h +++ b/src/cpu/kernels/CpuPermuteKernel.h @@ -57,7 +57,7 @@ public: static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; private: diff --git a/src/cpu/kernels/CpuPool2dKernel.cpp b/src/cpu/kernels/CpuPool2dKernel.cpp index d72a41cbbe..9308d860d1 100644 --- a/src/cpu/kernels/CpuPool2dKernel.cpp +++ b/src/cpu/kernels/CpuPool2dKernel.cpp @@ -25,15 +25,17 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/CPP/Validate.h" + #include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" -#include "src/cpu/kernels/pool2d/neon/list.h" #include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/kernels/pool2d/neon/list.h" + #include <arm_neon.h> namespace arm_compute @@ -46,99 +48,111 @@ namespace { using namespace misc::shape_calculator; -static const std::vector<CpuPool2dKernel::PoolingKernel> available_kernels = -{ - { - "neon_qu8_nhwc_poolMxN", - [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::QASYMM8)); }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::poolingMxN_qasymm8_neon_nhwc) - }, - { - "neon_qs8_nhwc_poolMxN", - [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::QASYMM8_SIGNED)); }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::poolingMxN_qasymm8_signed_neon_nhwc) - }, - { - "neon_f16_nhwc_poolMxN", - [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::F16)) && data.isa.fp16; }, - REGISTER_FP16_NEON(arm_compute::cpu::poolingMxN_fp16_neon_nhwc) - }, - { - "neon_fp32_nhwc_poolMxN", - [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::F32)); }, - 
REGISTER_FP32_NEON(arm_compute::cpu::poolingMxN_fp32_neon_nhwc) - }, +static const std::vector<CpuPool2dKernel::PoolingKernel> available_kernels = { + {"neon_qu8_nhwc_poolMxN", + [](const PoolDataTypeISASelectorData &data) + { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::QASYMM8)); }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::poolingMxN_qasymm8_neon_nhwc)}, + {"neon_qs8_nhwc_poolMxN", + [](const PoolDataTypeISASelectorData &data) + { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::QASYMM8_SIGNED)); }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::poolingMxN_qasymm8_signed_neon_nhwc)}, + {"neon_f16_nhwc_poolMxN", + [](const PoolDataTypeISASelectorData &data) + { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::F16)) && data.isa.fp16; }, + REGISTER_FP16_NEON(arm_compute::cpu::poolingMxN_fp16_neon_nhwc)}, + {"neon_fp32_nhwc_poolMxN", + [](const PoolDataTypeISASelectorData &data) + { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::F32)); }, + REGISTER_FP32_NEON(arm_compute::cpu::poolingMxN_fp32_neon_nhwc)}, #if defined(ENABLE_NCHW_KERNELS) - { - "neon_qu8_nchw_pool2", - [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2) && (data.pool_stride_x < 3)); }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::pooling2_quantized_neon_nchw<uint8_t>) - }, - { - "neon_qu8_nchw_pool3", - [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3) && (data.pool_stride_x < 3)); }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::pooling3_quantized_neon_nchw<uint8_t>) - }, - { - "neon_qu8_nchw_poolMxN", - [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8)); }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::poolingMxN_quantized_neon_nchw<uint8_t>) - }, - { - "neon_qs8_nchw_pool2", - [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2) && (data.pool_stride_x < 3)); }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::pooling2_quantized_neon_nchw<int8_t>) - }, - { - "neon_qs8_nchw_pool3", - [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3) && (data.pool_stride_x < 3)); }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::pooling3_quantized_neon_nchw<int8_t>) - }, - { - "neon_qs8_nchw_poolMxN", - [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED)); }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::poolingMxN_quantized_neon_nchw<int8_t>) - }, - { - "neon_fp16_nchw_pool2", - [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16 && data.isa.fp16) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2)); }, - REGISTER_FP16_NEON(arm_compute::cpu::pooling2_fp16_neon_nchw) - }, - { - "neon_fp16_nchw_pool3", - [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16 && data.isa.fp16) && (data.pool_size.x() == data.pool_size.y()) && 
(data.pool_size.x() == 3)); }, - REGISTER_FP16_NEON(arm_compute::cpu::pooling3_fp16_neon_nchw) - }, - { - "neon_fp16_nchw_poolMxN", - [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16 && data.isa.fp16)); }, - REGISTER_FP16_NEON(arm_compute::cpu::poolingMxN_fp16_neon_nchw) - }, - { - "neon_fp32_nchw_pool2", - [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2)); }, - REGISTER_FP32_NEON(arm_compute::cpu::pooling2_fp32_neon_nchw) - }, - { - "neon_fp32_nchw_pool3", - [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3)); }, - REGISTER_FP32_NEON(arm_compute::cpu::pooling3_fp32_neon_nchw) - }, - { - "neon_fp32_nchw_pool7", - [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 7)); }, - REGISTER_FP32_NEON(arm_compute::cpu::pooling7_fp32_neon_nchw) - }, - { - "neon_fp32_nchw_poolMxN", - [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32)); }, - REGISTER_FP32_NEON(arm_compute::cpu::poolingMxN_fp32_neon_nchw) - }, + {"neon_qu8_nchw_pool2", + [](const PoolDataTypeISASelectorData &data) + { + return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8) && + (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2) && (data.pool_stride_x < 3)); + }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::pooling2_quantized_neon_nchw<uint8_t>)}, + {"neon_qu8_nchw_pool3", + [](const PoolDataTypeISASelectorData &data) + { + return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8) && + (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3) && (data.pool_stride_x < 3)); + }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::pooling3_quantized_neon_nchw<uint8_t>)}, + {"neon_qu8_nchw_poolMxN", + [](const PoolDataTypeISASelectorData &data) + { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8)); }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::poolingMxN_quantized_neon_nchw<uint8_t>)}, + {"neon_qs8_nchw_pool2", + [](const PoolDataTypeISASelectorData &data) + { + return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED) && + (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2) && (data.pool_stride_x < 3)); + }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::pooling2_quantized_neon_nchw<int8_t>)}, + {"neon_qs8_nchw_pool3", + [](const PoolDataTypeISASelectorData &data) + { + return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED) && + (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3) && (data.pool_stride_x < 3)); + }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::pooling3_quantized_neon_nchw<int8_t>)}, + {"neon_qs8_nchw_poolMxN", + [](const PoolDataTypeISASelectorData &data) + { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED)); }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::poolingMxN_quantized_neon_nchw<int8_t>)}, + {"neon_fp16_nchw_pool2", + [](const PoolDataTypeISASelectorData &data) + { + return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16 && data.isa.fp16) && + (data.pool_size.x() == 
data.pool_size.y()) && (data.pool_size.x() == 2)); + }, + REGISTER_FP16_NEON(arm_compute::cpu::pooling2_fp16_neon_nchw)}, + {"neon_fp16_nchw_pool3", + [](const PoolDataTypeISASelectorData &data) + { + return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16 && data.isa.fp16) && + (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3)); + }, + REGISTER_FP16_NEON(arm_compute::cpu::pooling3_fp16_neon_nchw)}, + {"neon_fp16_nchw_poolMxN", + [](const PoolDataTypeISASelectorData &data) + { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16 && data.isa.fp16)); }, + REGISTER_FP16_NEON(arm_compute::cpu::poolingMxN_fp16_neon_nchw)}, + {"neon_fp32_nchw_pool2", + [](const PoolDataTypeISASelectorData &data) + { + return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) && + (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2)); + }, + REGISTER_FP32_NEON(arm_compute::cpu::pooling2_fp32_neon_nchw)}, + {"neon_fp32_nchw_pool3", + [](const PoolDataTypeISASelectorData &data) + { + return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) && + (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3)); + }, + REGISTER_FP32_NEON(arm_compute::cpu::pooling3_fp32_neon_nchw)}, + {"neon_fp32_nchw_pool7", + [](const PoolDataTypeISASelectorData &data) + { + return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) && + (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 7)); + }, + REGISTER_FP32_NEON(arm_compute::cpu::pooling7_fp32_neon_nchw)}, + {"neon_fp32_nchw_poolMxN", + [](const PoolDataTypeISASelectorData &data) + { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32)); }, + REGISTER_FP32_NEON(arm_compute::cpu::poolingMxN_fp32_neon_nchw)}, #endif /* defined(ENABLE_NCHW_KERNELS) */ }; -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, - const ITensorInfo *indices, Size2D pool_size) +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *dst, + const PoolingLayerInfo &pool_info, + const ITensorInfo *indices, + Size2D pool_size) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ERROR_ON(pool_size.x() == 0); @@ -150,65 +164,78 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const int output_height = 0; PoolingType pool_type = pool_info.pool_type; const PadStrideInfo pad_stride_info = pool_info.pad_stride_info; - const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout; - const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? 
src->data_layout() : pool_info.data_layout; + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((!is_data_type_float(src->data_type())) - && (is_pool_region_entirely_outside_input(pool_info)), - "Pooling region that is entirely outside input tensor is unsupported for non-float types"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + (!is_data_type_float(src->data_type())) && (is_pool_region_entirely_outside_input(pool_info)), + "Pooling region that is entirely outside input tensor is unsupported for non-float types"); - std::tie(output_width, output_height) = scaled_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height], - pool_size.x(), pool_size.y(), pool_info.pad_stride_info); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1), "Calculated output dimension size is invalid"); + std::tie(output_width, output_height) = + scaled_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height], pool_size.x(), + pool_size.y(), pool_info.pad_stride_info); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1), + "Calculated output dimension size is invalid"); TensorInfo out_info(TensorInfo(compute_pool_shape(*src, pool_info), 1, dst->data_type())); std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride(); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); - if(indices) + if (indices) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32, DataType::F16); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX, + "Pooling indices only supported for MAX pooling method"); } - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(pool_type == PoolingType::L2 && is_data_type_quantized(src->data_type())); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(src->data_type()) && !pool_info.exclude_padding && (pool_info.pool_type == PoolingType::AVG) && pool_info.pad_stride_info.has_padding() - && (src->data_layout() == DataLayout::NHWC), - "exclude_padding equal false is not supported for AVG Pooling with padding on quantized types"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + is_data_type_quantized(src->data_type()) && !pool_info.exclude_padding && + (pool_info.pool_type == PoolingType::AVG) && pool_info.pad_stride_info.has_padding() && + (src->data_layout() == DataLayout::NHWC), + "exclude_padding equal false is not supported for AVG Pooling with padding on quantized types"); - if(dst->total_size() != 0) + if (dst->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &out_info); - if(indices) + if (indices) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((pool_size != Size2D(2, 2)) && !pool_info.use_kernel_indices), "Pooling indices returning source tensor coordinates is only supported for pool size 2x2"); - 
ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_info.use_kernel_indices && (src->data_layout() != DataLayout::NHWC), "Pooling kernel indices only supported for NHWC"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + ((pool_size != Size2D(2, 2)) && !pool_info.use_kernel_indices), + "Pooling indices returning source tensor coordinates is only supported for pool size 2x2"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_info.use_kernel_indices && (src->data_layout() != DataLayout::NHWC), + "Pooling kernel indices only supported for NHWC"); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(indices, &out_info); } } - const auto *uk = CpuPool2dKernel::get_implementation(PoolDataTypeISASelectorData{ src->data_type(), src->data_layout(), pool_stride_x, pool_size, CPUInfo::get().get_isa() }); + const auto *uk = CpuPool2dKernel::get_implementation(PoolDataTypeISASelectorData{ + src->data_type(), src->data_layout(), pool_stride_x, pool_size, CPUInfo::get().get_isa()}); ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); return Status{}; } -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, ITensorInfo *indices, const PoolingLayerInfo &pool_info, - unsigned int &num_elems_processed_per_iteration, - int pool_size_x, int pool_size_y) +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, + ITensorInfo *dst, + ITensorInfo *indices, + const PoolingLayerInfo &pool_info, + unsigned int &num_elems_processed_per_iteration, + int pool_size_x, + int pool_size_y) { // dst auto inizialitation if not yet initialized auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_pool_shape(*src, pool_info))); - if(indices) + if (indices) { // Indices auto inizialitation if not yet initialized - auto_init_if_empty(*indices, (src->clone()->set_tensor_shape(compute_pool_shape(*src, - pool_info))) - .set_data_type(DataType::U32) /* we store the offset to the element */); + auto_init_if_empty(*indices, (src->clone()->set_tensor_shape(compute_pool_shape(*src, pool_info))) + .set_data_type(DataType::U32) /* we store the offset to the element */); } const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout; @@ -219,20 +246,20 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITenso const PadStrideInfo pad_stride_info = pool_info.pad_stride_info; std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride(); - const bool is_square = pool_size_x == pool_size_y; - const unsigned int pooled_w = dst->dimension(idx_width); - const unsigned int pooled_h = dst->dimension(idx_height); + const bool is_square = pool_size_x == pool_size_y; + const unsigned int pooled_w = dst->dimension(idx_width); + const unsigned int pooled_h = dst->dimension(idx_height); //If it's not squared and optimized will be executed the MxN num_elems_processed_per_iteration = 1; - if(is_square) + if (is_square) { - switch(src->data_type()) + switch (src->data_type()) { case DataType::QASYMM8: case DataType::QASYMM8_SIGNED: - switch(pool_size_x) + switch (pool_size_x) { case 2: num_elems_processed_per_iteration = (pool_stride_x == 2) ? 
8 : 15; @@ -261,18 +288,22 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITenso bool window_changed = false; Window win{}; // Upper limit for the number of right/bottom border elements that are accessed - TensorShape dst_shape{ src->tensor_shape() }; + TensorShape dst_shape{src->tensor_shape()}; dst_shape.set(0, pooled_w); dst_shape.set(1, pooled_h); TensorInfo dst_info(src->clone()->set_tensor_shape(dst_shape)); win = calculate_max_window(dst_info, Steps(num_elems_processed_per_iteration)); - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, win); } } // namespace -void CpuPool2dKernel::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices) +void CpuPool2dKernel::configure(ITensorInfo *src, + ITensorInfo *dst, + const PoolingLayerInfo &pool_info, + ITensorInfo *indices) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); const PadStrideInfo pad_stride_info = pool_info.pad_stride_info; @@ -284,14 +315,15 @@ void CpuPool2dKernel::configure(ITensorInfo *src, ITensorInfo *dst, const Poolin const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); // Update pool size in case of global pooling - const Size2D pool_size( - is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width, - is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height); + const Size2D pool_size(is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width, + is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height); // Perform validation step ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, pool_info, indices, pool_size)); - const auto *uk = CpuPool2dKernel::get_implementation(PoolDataTypeISASelectorData{ src->data_type(), src->data_layout(), (int)pad_stride_info.stride().first, pool_size, CPUInfo::get().get_isa() }); + const auto *uk = CpuPool2dKernel::get_implementation( + PoolDataTypeISASelectorData{src->data_type(), src->data_layout(), (int)pad_stride_info.stride().first, + pool_size, CPUInfo::get().get_isa()}); ARM_COMPUTE_ERROR_ON(uk == nullptr); // Set instance variables @@ -302,7 +334,7 @@ void CpuPool2dKernel::configure(ITensorInfo *src, ITensorInfo *dst, const Poolin _run_method = uk->ukernel; _name = std::string("CpuPool2dKernel").append("/").append(uk->name); - if(_data_layout == DataLayout::NHWC) + if (_data_layout == DataLayout::NHWC) { // Configure kernel window Window win = calculate_max_window(*dst, Steps()); @@ -311,14 +343,17 @@ void CpuPool2dKernel::configure(ITensorInfo *src, ITensorInfo *dst, const Poolin else { // Configure kernel window - auto win_config = validate_and_configure_window(src, dst, indices, pool_info, _num_elems_processed_per_iteration, - pool_size.x(), pool_size.y()); + auto win_config = validate_and_configure_window( + src, dst, indices, pool_info, _num_elems_processed_per_iteration, pool_size.x(), pool_size.y()); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); ICpuKernel::configure(win_config.second); } } -Status CpuPool2dKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices) +Status CpuPool2dKernel::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const PoolingLayerInfo &pool_info, + const ITensorInfo 
*indices) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); @@ -336,9 +371,10 @@ Status CpuPool2dKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, pool_info, indices, Size2D(pool_size_x, pool_size_y))); ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), - (indices) ? indices->clone().get() : nullptr, pool_info, num_elems_processed_per_iteration, - pool_size_x, pool_size_y) - .first); + (indices) ? indices->clone().get() : nullptr, pool_info, + num_elems_processed_per_iteration, pool_size_x, + pool_size_y) + .first); return Status{}; } @@ -359,19 +395,20 @@ void CpuPool2dKernel::run_op(ITensorPack &tensors, const Window &window, const T const unsigned int pool_size = _pool_info.pool_size.width; Window window_src(window); - if(_data_layout == DataLayout::NCHW) + if (_data_layout == DataLayout::NCHW) { // Set step for src in x and y direction for the src unsigned int window_x_inc = 0; - switch(src->info()->data_type()) + switch (src->info()->data_type()) { case DataType::QASYMM8: case DataType::QASYMM8_SIGNED: { window_x_inc = pool_stride_x; - if((pool_size == 2 || pool_size == 3) && pool_stride_x < 3) + if ((pool_size == 2 || pool_size == 3) && pool_stride_x < 3) { - window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration; + window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 + : _num_elems_processed_per_iteration; } break; } @@ -387,8 +424,10 @@ void CpuPool2dKernel::run_op(ITensorPack &tensors, const Window &window, const T ARM_COMPUTE_ERROR("Not supported"); } } - window_src.set(Window::DimX, Window::Dimension(window.x().start() * pool_stride_x, window.x().end() * pool_stride_x, window_x_inc)); - window_src.set(Window::DimY, Window::Dimension(window.y().start() * pool_stride_y, window.y().end() * pool_stride_y, pool_stride_y)); + window_src.set(Window::DimX, Window::Dimension(window.x().start() * pool_stride_x, + window.x().end() * pool_stride_x, window_x_inc)); + window_src.set(Window::DimY, Window::Dimension(window.y().start() * pool_stride_y, + window.y().end() * pool_stride_y, pool_stride_y)); } else { diff --git a/src/cpu/kernels/CpuPool2dKernel.h b/src/cpu/kernels/CpuPool2dKernel.h index c952ea839d..859de8cc5f 100644 --- a/src/cpu/kernels/CpuPool2dKernel.h +++ b/src/cpu/kernels/CpuPool2dKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_POOL2D_KERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" @@ -38,7 +39,8 @@ namespace kernels class CpuPool2dKernel : public ICpuKernel<CpuPool2dKernel> { private: - using PoolingKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, ITensor *, PoolingLayerInfo &, const Window &, const Window &)>::type; + using PoolingKernelPtr = std::add_pointer<void( + const ITensor *, ITensor *, ITensor *, PoolingLayerInfo &, const Window &, const Window &)>::type; public: CpuPool2dKernel() = default; @@ -52,17 +54,21 @@ public: * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. * @param[out] indices (optional) The indices of the maximal values. Data type supported: U32. 
*/ - void configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr); + void + configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuPool2dKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr); + static Status validate(const ITensorInfo *src, + const ITensorInfo *dst, + const PoolingLayerInfo &pool_info, + const ITensorInfo *indices = nullptr); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; struct PoolingKernel @@ -76,11 +82,11 @@ public: private: PoolingLayerInfo _pool_info{}; - DataLayout _data_layout{ DataLayout::UNKNOWN }; - unsigned int _num_elems_processed_per_iteration{ 0 }; + DataLayout _data_layout{DataLayout::UNKNOWN}; + unsigned int _num_elems_processed_per_iteration{0}; Size2D _pool_size{}; int _pool_stride_x{}; - PoolingKernelPtr _run_method{ nullptr }; + PoolingKernelPtr _run_method{nullptr}; std::string _name{}; }; } // namespace kernels diff --git a/src/cpu/kernels/CpuPool3dKernel.cpp b/src/cpu/kernels/CpuPool3dKernel.cpp index 4504f3f7c9..8b484d4e0b 100644 --- a/src/cpu/kernels/CpuPool3dKernel.cpp +++ b/src/cpu/kernels/CpuPool3dKernel.cpp @@ -25,8 +25,9 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/CPP/Validate.h" + #include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "src/cpu/kernels/pool3d/list.h" @@ -41,39 +42,28 @@ namespace { using namespace misc::shape_calculator; -static const std::vector<CpuPool3dKernel::Pooling3dKernel> available_kernels = -{ - { - "neon_qu8_ndhwc_poolMxNxD", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8); }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_q8_pool3d) - }, - { - "neon_qs8_ndhwc_poolMxNxD", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_q8_signed_pool3d) - }, - { - "neon_fp16_ndhwc_poolMxNxD", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F16 && data.isa.fp16); }, - REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_pool3d) - }, - { - "neon_fp32_ndhwc_poolMxNxD", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F32); }, - REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_pool3d) - } -}; +static const std::vector<CpuPool3dKernel::Pooling3dKernel> available_kernels = { + {"neon_qu8_ndhwc_poolMxNxD", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8); }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_q8_pool3d)}, + {"neon_qs8_ndhwc_poolMxNxD", + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED); }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_q8_signed_pool3d)}, + {"neon_fp16_ndhwc_poolMxNxD", + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16 && data.isa.fp16); }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_pool3d)}, 
+ {"neon_fp32_ndhwc_poolMxNxD", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32); }, + REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_pool3d)}}; Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const Pooling3dLayerInfo &pool_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_layout() != DataLayout::NDHWC, "Only NDHWC layout supported"); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32, DataType::QASYMM8, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32, DataType::QASYMM8, + DataType::QASYMM8_SIGNED); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((!is_data_type_float(src->data_type())) && (!pool_info.exclude_padding - && (pool_info.pool_type == PoolingType::AVG)), + ARM_COMPUTE_RETURN_ERROR_ON_MSG((!is_data_type_float(src->data_type())) && + (!pool_info.exclude_padding && (pool_info.pool_type == PoolingType::AVG)), "Exclude padding is unsupported for non-float types for Avg op"); const auto data_layout = src->data_layout(); @@ -97,21 +87,26 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const int output_height = 0; int output_depth = 0; - ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_pool_3d_region_entirely_outside_input(pool_info), "Pooling region that is entirely outside input tensor is unsupported"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_pool_3d_region_entirely_outside_input(pool_info), + "Pooling region that is entirely outside input tensor is unsupported"); - std::tie(output_width, output_height, output_depth) = scaled_3d_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height], src->tensor_shape()[idx_depth], - pool_size_x, pool_size_y, pool_size_z, pool_info); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1 || output_depth < 1), "Calculated output dimension size is invalid"); + std::tie(output_width, output_height, output_depth) = + scaled_3d_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height], + src->tensor_shape()[idx_depth], pool_size_x, pool_size_y, pool_size_z, pool_info); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1 || output_depth < 1), + "Calculated output dimension size is invalid"); - if(dst->total_size() != 0) + if (dst->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst); - TensorInfo out_info(TensorInfo(compute_pool3d_shape(src->tensor_shape(), pool_info), 1, dst->data_type(), DataLayout::NDHWC)); + TensorInfo out_info( + TensorInfo(compute_pool3d_shape(src->tensor_shape(), pool_info), 1, dst->data_type(), DataLayout::NDHWC)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &out_info); } - const auto *uk = CpuPool3dKernel::get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() }); + const auto *uk = + CpuPool3dKernel::get_implementation(DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()}); ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); return Status{}; @@ -136,12 +131,12 @@ void CpuPool3dKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const // Update pool size in case of global pooling const bool is_global_pooling = pool_info.is_global_pooling; - const Size3D pool_size( - is_global_pooling ? 
src->dimension(idx_width) : pool_info.pool_size.width, - is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height, - is_global_pooling ? src->dimension(idx_depth) : pool_info.pool_size.depth); + const Size3D pool_size(is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width, + is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height, + is_global_pooling ? src->dimension(idx_depth) : pool_info.pool_size.depth); - const auto *uk = CpuPool3dKernel::get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() }); + const auto *uk = + CpuPool3dKernel::get_implementation(DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()}); ARM_COMPUTE_ERROR_ON(uk == nullptr); // Set instance variables @@ -188,4 +183,4 @@ const std::vector<CpuPool3dKernel::Pooling3dKernel> &CpuPool3dKernel::get_availa } // namespace kernels } // namespace cpu -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuPool3dKernel.h b/src/cpu/kernels/CpuPool3dKernel.h index 437f2af7e4..bd1ff61046 100644 --- a/src/cpu/kernels/CpuPool3dKernel.h +++ b/src/cpu/kernels/CpuPool3dKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_POOL3D_KERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" @@ -39,7 +40,8 @@ class CpuPool3dKernel : public ICpuKernel<CpuPool3dKernel> { private: /* Template function for Pooling 3D NDHWC */ - using Pooling3dKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, Pooling3dLayerInfo &, const Window &)>::type; + using Pooling3dKernelPtr = + std::add_pointer<void(const ITensor *, ITensor *, Pooling3dLayerInfo &, const Window &)>::type; public: CpuPool3dKernel() = default; @@ -68,7 +70,7 @@ public: static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const Pooling3dLayerInfo &pool_info); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; struct Pooling3dKernel @@ -82,11 +84,11 @@ public: private: Pooling3dLayerInfo _pool_info{}; - Pooling3dKernelPtr _run_method{ nullptr }; + Pooling3dKernelPtr _run_method{nullptr}; std::string _name{}; }; } // namespace kernels } // namespace cpu } // namespace arm_compute -#endif /*ARM_COMPUTE_CPU_POOL3D_KERNEL_H */
\ No newline at end of file +#endif /*ARM_COMPUTE_CPU_POOL3D_KERNEL_H */ diff --git a/src/cpu/kernels/CpuQuantizeKernel.cpp b/src/cpu/kernels/CpuQuantizeKernel.cpp index 9700c62318..5dde680837 100644 --- a/src/cpu/kernels/CpuQuantizeKernel.cpp +++ b/src/cpu/kernels/CpuQuantizeKernel.cpp @@ -28,13 +28,13 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" + +#include "src/core/CPP/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" #include "src/core/NEON/NEAsymm.h" #include "src/core/NEON/NEMath.h" #include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include "src/core/CPP/Validate.h" #include <arm_neon.h> #include <map> @@ -53,9 +53,11 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QSYMM8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QASYMM16); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QSYMM8, DataType::QASYMM8, + DataType::QASYMM8_SIGNED, DataType::QASYMM16); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); return Status{}; @@ -71,19 +73,15 @@ inline float32x4x4_t load_value(const T *input_ptr) template <> inline float32x4x4_t load_value(const float *input_ptr) { - return { wrapper::vloadq(input_ptr), - wrapper::vloadq(input_ptr + 4), - wrapper::vloadq(input_ptr + 8), - wrapper::vloadq(input_ptr + 12) }; + return {wrapper::vloadq(input_ptr), wrapper::vloadq(input_ptr + 4), wrapper::vloadq(input_ptr + 8), + wrapper::vloadq(input_ptr + 12)}; } #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC template <> inline float32x4x4_t load_value(const float16_t *input_ptr) { - return { vcvt_f32_f16(wrapper::vload(input_ptr)), - vcvt_f32_f16(wrapper::vload(input_ptr + 4)), - vcvt_f32_f16(wrapper::vload(input_ptr + 8)), - vcvt_f32_f16(wrapper::vload(input_ptr + 12)) }; + return {vcvt_f32_f16(wrapper::vload(input_ptr)), vcvt_f32_f16(wrapper::vload(input_ptr + 4)), + vcvt_f32_f16(wrapper::vload(input_ptr + 8)), vcvt_f32_f16(wrapper::vload(input_ptr + 12))}; } #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC @@ -113,26 +111,25 @@ void CpuQuantizeKernel::configure(const ITensorInfo *src, ITensorInfo *dst) ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst)); - static const std::map<std::string, QuantizeFunctionExecutorPtr> quant_map = - { - { "op_QASYMM8_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<uint8_t, uint8_t> }, - { "op_QASYMM8_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<uint8_t, int8_t> }, - { "op_QASYMM8_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<uint8_t> }, + static const std::map<std::string, QuantizeFunctionExecutorPtr> quant_map = { + {"op_QASYMM8_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<uint8_t, uint8_t>}, + {"op_QASYMM8_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<uint8_t, int8_t>}, + {"op_QASYMM8_QASYMM16", 
&CpuQuantizeKernel::run_quantize_qasymm16<uint8_t>}, - { "op_QASYMM8_SIGNED_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<int8_t, uint8_t> }, - { "op_QASYMM8_SIGNED_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<int8_t, int8_t> }, - { "op_QASYMM8_SIGNED_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<int8_t> }, + {"op_QASYMM8_SIGNED_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<int8_t, uint8_t>}, + {"op_QASYMM8_SIGNED_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<int8_t, int8_t>}, + {"op_QASYMM8_SIGNED_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<int8_t>}, - { "op_F32_QSYMM8", &CpuQuantizeKernel::run_quantize_qsymm8<float, int8_t> }, + {"op_F32_QSYMM8", &CpuQuantizeKernel::run_quantize_qsymm8<float, int8_t>}, - { "op_F32_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<float, uint8_t> }, - { "op_F32_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<float, int8_t> }, - { "op_F32_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<float> }, + {"op_F32_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<float, uint8_t>}, + {"op_F32_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<float, int8_t>}, + {"op_F32_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<float>}, #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - { "op_F16_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<float16_t, uint8_t> }, - { "op_F16_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<float16_t, int8_t> }, - { "op_F16_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<float16_t> }, + {"op_F16_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<float16_t, uint8_t>}, + {"op_F16_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<float16_t, int8_t>}, + {"op_F16_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<float16_t>}, #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC*/ }; @@ -142,7 +139,7 @@ void CpuQuantizeKernel::configure(const ITensorInfo *src, ITensorInfo *dst) auto it = quant_map.find(function_to_call); - if(it == quant_map.end()) + if (it == quant_map.end()) { ARM_COMPUTE_ERROR("Unsupported combination of input and output data types"); } @@ -167,7 +164,7 @@ void CpuQuantizeKernel::run_quantize_qsymm8(const ITensor *src, ITensor *dst, co const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform(); UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform(); - if(is_data_type_quantized_asymmetric(src->info()->data_type())) + if (is_data_type_quantized_asymmetric(src->info()->data_type())) { uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo); } @@ -177,22 +174,24 @@ void CpuQuantizeKernel::run_quantize_qsymm8(const ITensor *src, ITensor *dst, co Iterator input(src, win_collapsed); Iterator output(dst, win_collapsed); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - auto input_ptr = reinterpret_cast<const TIn *>(input.ptr()); - auto output_ptr = reinterpret_cast<TOut *>(output.ptr()); - int x = window_start_x; - for(; x <= (window_end_x - window_step); x += window_step) - { - wrapper::vstore(&output_ptr[x], vquantize_qasymm8<TOut>(load_value(&input_ptr[x]), uqinfo)); - } - // Compute left-over elements - for(; x < window_end_x; ++x) + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { - output_ptr[x] = quantize_qsymm8(input_ptr[x], dst->info()->quantization_info()); - } - }, - input, output); + auto input_ptr = reinterpret_cast<const TIn *>(input.ptr()); + auto output_ptr = reinterpret_cast<TOut *>(output.ptr()); + int x = window_start_x; + for (; x 
<= (window_end_x - window_step); x += window_step) + { + wrapper::vstore(&output_ptr[x], vquantize_qasymm8<TOut>(load_value(&input_ptr[x]), uqinfo)); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + output_ptr[x] = quantize_qsymm8(input_ptr[x], dst->info()->quantization_info()); + } + }, + input, output); } template <typename TIn, typename TOut> @@ -203,7 +202,7 @@ void CpuQuantizeKernel::run_quantize_qasymm8(const ITensor *src, ITensor *dst, c const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform(); UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform(); - if(is_data_type_quantized_asymmetric(src->info()->data_type())) + if (is_data_type_quantized_asymmetric(src->info()->data_type())) { uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo); } @@ -219,23 +218,25 @@ void CpuQuantizeKernel::run_quantize_qasymm8(const ITensor *src, ITensor *dst, c Iterator input(src, win_collapsed); Iterator output(dst, win_collapsed); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - auto input_ptr = reinterpret_cast<const TIn *>(input.ptr()); - auto output_ptr = reinterpret_cast<TOut *>(output.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step); x += window_step) + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { - wrapper::vstore(&output_ptr[x], vquantize_qasymm8<TOut>(load_value(&input_ptr[x]), uqinfo)); - } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - output_ptr[x] = Qasymm8QuantizationHelper<TOut>::quantize(input_ptr[x], uqinfo, rounding_policy); - } - }, - input, output); + auto input_ptr = reinterpret_cast<const TIn *>(input.ptr()); + auto output_ptr = reinterpret_cast<TOut *>(output.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step); x += window_step) + { + wrapper::vstore(&output_ptr[x], vquantize_qasymm8<TOut>(load_value(&input_ptr[x]), uqinfo)); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + output_ptr[x] = Qasymm8QuantizationHelper<TOut>::quantize(input_ptr[x], uqinfo, rounding_policy); + } + }, + input, output); } template <typename T> @@ -246,7 +247,7 @@ void CpuQuantizeKernel::run_quantize_qasymm16(const ITensor *src, ITensor *dst, const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform(); UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform(); - if(is_data_type_quantized_asymmetric(src->info()->data_type())) + if (is_data_type_quantized_asymmetric(src->info()->data_type())) { uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo); } @@ -262,25 +263,27 @@ void CpuQuantizeKernel::run_quantize_qasymm16(const ITensor *src, ITensor *dst, Iterator input(src, win_collapsed); Iterator output(dst, win_collapsed); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - auto input_ptr = reinterpret_cast<const T *>(input.ptr()); - auto output_ptr = reinterpret_cast<uint16_t *>(output.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step); x += window_step) - { - uint16x8x2_t tmp = vquantize_qasymm16(load_value(&input_ptr[x]), uqinfo); - vst1q_u16(&output_ptr[x], tmp.val[0]); - vst1q_u16(&output_ptr[x + 8], tmp.val[1]); - } - // Compute left-over elements - for(; x < window_end_x; ++x) + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { - output_ptr[x] = quantize_qasymm16(input_ptr[x], uqinfo, rounding_policy); - } - }, - input, output); + auto input_ptr = 
reinterpret_cast<const T *>(input.ptr()); + auto output_ptr = reinterpret_cast<uint16_t *>(output.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step); x += window_step) + { + uint16x8x2_t tmp = vquantize_qasymm16(load_value(&input_ptr[x]), uqinfo); + vst1q_u16(&output_ptr[x], tmp.val[0]); + vst1q_u16(&output_ptr[x + 8], tmp.val[1]); + } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + output_ptr[x] = quantize_qasymm16(input_ptr[x], uqinfo, rounding_policy); + } + }, + input, output); } void CpuQuantizeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) diff --git a/src/cpu/kernels/CpuQuantizeKernel.h b/src/cpu/kernels/CpuQuantizeKernel.h index 2bc8105a11..d6714136da 100644 --- a/src/cpu/kernels/CpuQuantizeKernel.h +++ b/src/cpu/kernels/CpuQuantizeKernel.h @@ -59,7 +59,7 @@ public: static Status validate(const ITensorInfo *src, const ITensorInfo *dst); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; private: @@ -67,7 +67,9 @@ private: * * @param[in] window Region on which to execute the kernel. */ - using QuantizeFunctionExecutorPtr = void (CpuQuantizeKernel::*)(const ITensor *src, ITensor *dst, const Window &window); + using QuantizeFunctionExecutorPtr = void (CpuQuantizeKernel::*)(const ITensor *src, + ITensor *dst, + const Window &window); /** Function to apply QASYMM8 or QASYMM8_SIGNED quantization on a tensor. * * @param[in] window Region on which to execute the kernel. @@ -84,7 +86,7 @@ private: template <typename TIn, typename TOut> void run_quantize_qsymm8(const ITensor *src, ITensor *dst, const Window &window); - QuantizeFunctionExecutorPtr _func{ nullptr }; + QuantizeFunctionExecutorPtr _func{nullptr}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuReshapeKernel.cpp b/src/cpu/kernels/CpuReshapeKernel.cpp index a9672a8c5e..241e58fbce 100644 --- a/src/cpu/kernels/CpuReshapeKernel.cpp +++ b/src/cpu/kernels/CpuReshapeKernel.cpp @@ -29,9 +29,11 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" -#include "src/core/NEON/INEKernel.h" + #include "src/core/helpers/Utils.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/INEKernel.h" + #include <cstdint> /** [NEReshapeLayerKernel Kernel] **/ @@ -49,7 +51,7 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions. 
ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - if(dst->tensor_shape().total_size() != 0) + if (dst->tensor_shape().total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); @@ -59,29 +61,30 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) return Status{}; } - template <typename T> void reshape_tensor_per_element(const Window &window, const ITensor *src, ITensor *dst) { const TensorShape &src_shape = src->info()->tensor_shape(); const TensorShape &dst_shape = dst->info()->tensor_shape(); - Iterator dst_it(dst, window); + Iterator dst_it(dst, window); - execute_window_loop(window, [&](const Coordinates & dst_coord) - { - Coordinates src_coord = index2coords(src_shape, coords2index(dst_shape, dst_coord)); - const auto output_ptr = dst->ptr_to_element(dst_coord); - const auto input_ptr = src->ptr_to_element(src_coord); + execute_window_loop( + window, + [&](const Coordinates &dst_coord) + { + Coordinates src_coord = index2coords(src_shape, coords2index(dst_shape, dst_coord)); + const auto output_ptr = dst->ptr_to_element(dst_coord); + const auto input_ptr = src->ptr_to_element(src_coord); - *reinterpret_cast<T *>(output_ptr) = *reinterpret_cast<T *>(input_ptr); - }, - dst_it); + *reinterpret_cast<T *>(output_ptr) = *reinterpret_cast<T *>(input_ptr); + }, + dst_it); } -void reshape_tensor_per_element_selector(const Window &window, const ITensor *src, ITensor *dst ) +void reshape_tensor_per_element_selector(const Window &window, const ITensor *src, ITensor *dst) { - switch(src->info()->data_type()) + switch (src->info()->data_type()) { case DataType::U8: case DataType::S8: @@ -131,22 +134,24 @@ void reshape_tensor_per_row(const Window &window, const ITensor *src, ITensor *d win.set(Window::DimX, Window::Dimension(0, 1, 1)); Iterator dst_it(dst, win); - execute_window_loop(win, [&]( Coordinates & id) - { - dst_coord = id; - - for(int x = window_start_x; x < window_end_x; x += src_row_size) + execute_window_loop( + win, + [&](Coordinates &id) { - src_coord = index2coords(src_shape, coords2index(dst_shape, dst_coord)); - output_ptr = dst->ptr_to_element(dst_coord); - input_ptr = src->ptr_to_element(src_coord); + dst_coord = id; - std::memcpy(output_ptr, input_ptr, row_size_in_bytes); + for (int x = window_start_x; x < window_end_x; x += src_row_size) + { + src_coord = index2coords(src_shape, coords2index(dst_shape, dst_coord)); + output_ptr = dst->ptr_to_element(dst_coord); + input_ptr = src->ptr_to_element(src_coord); - dst_coord.increment(Window::DimX, src_row_size); - } - }, - dst_it); + std::memcpy(output_ptr, input_ptr, row_size_in_bytes); + + dst_coord.increment(Window::DimX, src_row_size); + } + }, + dst_it); } void reshape_tensor_per_window(const Window &window, const ITensor *src, ITensor *dst) @@ -213,8 +218,8 @@ void CpuReshapeKernel::prepare(ITensorPack &tensors) const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); auto dst = tensors.get_tensor(TensorType::ACL_DST); - const ITensorInfo* src_info = src->info(); - const ITensorInfo* dst_info = dst->info(); + const ITensorInfo *src_info = src->info(); + const ITensorInfo *dst_info = dst->info(); // Calculate kernel window based on the padding info Window win; @@ -226,7 +231,7 @@ void CpuReshapeKernel::prepare(ITensorPack &tensors) const auto src_row_size = static_cast<int>(src_info->tensor_shape()[0]); const auto dst_row_size = static_cast<int>(dst_info->tensor_shape()[0]); - 
if(!src_has_holes && !dst_has_holes) + if (!src_has_holes && !dst_has_holes) { std::tie(win, _split_dimension) = calculate_squashed_or_max_window(*dst_info); /* diff --git a/src/cpu/kernels/CpuReshapeKernel.h b/src/cpu/kernels/CpuReshapeKernel.h index eddbbf7135..ce566fd9e2 100644 --- a/src/cpu/kernels/CpuReshapeKernel.h +++ b/src/cpu/kernels/CpuReshapeKernel.h @@ -55,7 +55,7 @@ public: static Status validate(const ITensorInfo *src, const ITensorInfo *dst); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; /** Prepare the reshape kernel for execution (Only executed once) by calculating max or squashed window and selecting the _reshape_tensor_fn based on the presence of holes @@ -84,10 +84,9 @@ public: } private: - size_t _split_dimension{ Window::DimY }; - - std::function<void(const Window &window, const ITensor *src, ITensor *dst )> _reshape_tensor_fn{}; + size_t _split_dimension{Window::DimY}; + std::function<void(const Window &window, const ITensor *src, ITensor *dst)> _reshape_tensor_fn{}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuScaleKernel.cpp b/src/cpu/kernels/CpuScaleKernel.cpp index 332304599f..702e0a8134 100644 --- a/src/cpu/kernels/CpuScaleKernel.cpp +++ b/src/cpu/kernels/CpuScaleKernel.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/utils/InterpolationPolicyUtils.h" #include "arm_compute/core/Window.h" + #include "src/core/common/Registrars.h" #include "src/core/helpers/ScaleHelpers.h" #include "src/core/helpers/WindowHelpers.h" @@ -44,104 +45,74 @@ namespace kernels { namespace { -static const std::vector<CpuScaleKernel::ScaleKernel> available_kernels = -{ - { - "sve_fp16_scale", - [](const ScaleKernelDataTypeISASelectorData & data) - { - return data.dt == DataType::F16 && data.isa.sve && data.isa.fp16 && data.interpolation_policy != InterpolationPolicy::BILINEAR; - }, - REGISTER_FP16_SVE(arm_compute::cpu::fp16_sve_scale) - }, - { - "sve_fp32_scale", - [](const ScaleKernelDataTypeISASelectorData & data) - { - return data.dt == DataType::F32 && data.isa.sve && data.interpolation_policy != InterpolationPolicy::BILINEAR; - }, - REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_scale) - }, - { - "sve_qu8_scale", - [](const ScaleKernelDataTypeISASelectorData & data) - { - return data.dt == DataType::QASYMM8 && data.isa.sve && data.interpolation_policy != InterpolationPolicy::BILINEAR; - }, - REGISTER_QASYMM8_SVE(arm_compute::cpu::qasymm8_sve_scale) - }, - { - "sve_qs8_scale", - [](const ScaleKernelDataTypeISASelectorData & data) - { - return data.dt == DataType::QASYMM8_SIGNED && data.isa.sve && data.interpolation_policy != InterpolationPolicy::BILINEAR; - }, - REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::qasymm8_signed_sve_scale) - }, - { - "sve_u8_scale", - [](const ScaleKernelDataTypeISASelectorData & data) - { - return data.dt == DataType::U8 && data.isa.sve && data.interpolation_policy != InterpolationPolicy::BILINEAR; - }, - REGISTER_INTEGER_SVE(arm_compute::cpu::u8_sve_scale) - }, - { - "sve_s16_scale", - [](const ScaleKernelDataTypeISASelectorData & data) - { - return data.dt == DataType::S16 && data.isa.sve && data.interpolation_policy != InterpolationPolicy::BILINEAR; - }, - REGISTER_INTEGER_SVE(arm_compute::cpu::s16_sve_scale) - }, - { - "neon_fp16_scale", - [](const ScaleKernelDataTypeISASelectorData & 
data) { return data.dt == DataType::F16 && data.isa.fp16; }, - REGISTER_FP16_NEON(arm_compute::cpu::common_neon_scale<float16_t>) - }, - { - "neon_fp32_scale", - [](const ScaleKernelDataTypeISASelectorData & data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON(arm_compute::cpu::common_neon_scale<float>) - }, - { - "neon_qu8_scale", - [](const ScaleKernelDataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8; }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::qasymm8_neon_scale) - }, - { - "neon_qs8_scale", - [](const ScaleKernelDataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::qasymm8_signed_neon_scale) - }, - { - "neon_u8_scale", - [](const ScaleKernelDataTypeISASelectorData & data) { return data.dt == DataType::U8; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::u8_neon_scale) - }, - { - "neon_s8_scale", - [](const ScaleKernelDataTypeISASelectorData & data) { return data.dt == DataType::S8; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::s8_neon_scale) - }, - { - "neon_s16_scale", - [](const ScaleKernelDataTypeISASelectorData & data) { return data.dt == DataType::S16; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::s16_neon_scale) - }, +static const std::vector<CpuScaleKernel::ScaleKernel> available_kernels = { + {"sve_fp16_scale", + [](const ScaleKernelDataTypeISASelectorData &data) + { + return data.dt == DataType::F16 && data.isa.sve && data.isa.fp16 && + data.interpolation_policy != InterpolationPolicy::BILINEAR; + }, + REGISTER_FP16_SVE(arm_compute::cpu::fp16_sve_scale)}, + {"sve_fp32_scale", + [](const ScaleKernelDataTypeISASelectorData &data) + { return data.dt == DataType::F32 && data.isa.sve && data.interpolation_policy != InterpolationPolicy::BILINEAR; }, + REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_scale)}, + {"sve_qu8_scale", + [](const ScaleKernelDataTypeISASelectorData &data) { + return data.dt == DataType::QASYMM8 && data.isa.sve && + data.interpolation_policy != InterpolationPolicy::BILINEAR; + }, + REGISTER_QASYMM8_SVE(arm_compute::cpu::qasymm8_sve_scale)}, + {"sve_qs8_scale", + [](const ScaleKernelDataTypeISASelectorData &data) + { + return data.dt == DataType::QASYMM8_SIGNED && data.isa.sve && + data.interpolation_policy != InterpolationPolicy::BILINEAR; + }, + REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::qasymm8_signed_sve_scale)}, + {"sve_u8_scale", + [](const ScaleKernelDataTypeISASelectorData &data) + { return data.dt == DataType::U8 && data.isa.sve && data.interpolation_policy != InterpolationPolicy::BILINEAR; }, + REGISTER_INTEGER_SVE(arm_compute::cpu::u8_sve_scale)}, + {"sve_s16_scale", + [](const ScaleKernelDataTypeISASelectorData &data) + { return data.dt == DataType::S16 && data.isa.sve && data.interpolation_policy != InterpolationPolicy::BILINEAR; }, + REGISTER_INTEGER_SVE(arm_compute::cpu::s16_sve_scale)}, + {"neon_fp16_scale", + [](const ScaleKernelDataTypeISASelectorData &data) { return data.dt == DataType::F16 && data.isa.fp16; }, + REGISTER_FP16_NEON(arm_compute::cpu::common_neon_scale<float16_t>)}, + {"neon_fp32_scale", [](const ScaleKernelDataTypeISASelectorData &data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(arm_compute::cpu::common_neon_scale<float>)}, + {"neon_qu8_scale", [](const ScaleKernelDataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8; }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::qasymm8_neon_scale)}, + {"neon_qs8_scale", + [](const ScaleKernelDataTypeISASelectorData &data) { return data.dt == 
DataType::QASYMM8_SIGNED; }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::qasymm8_signed_neon_scale)}, + {"neon_u8_scale", [](const ScaleKernelDataTypeISASelectorData &data) { return data.dt == DataType::U8; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::u8_neon_scale)}, + {"neon_s8_scale", [](const ScaleKernelDataTypeISASelectorData &data) { return data.dt == DataType::S8; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::s8_neon_scale)}, + {"neon_s16_scale", [](const ScaleKernelDataTypeISASelectorData &data) { return data.dt == DataType::S16; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::s16_neon_scale)}, }; -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dx, const ITensorInfo *dy, - const ITensorInfo *offsets, ITensorInfo *dst, const ScaleKernelInfo &info) +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *dx, + const ITensorInfo *dy, + const ITensorInfo *offsets, + ITensorInfo *dst, + const ScaleKernelInfo &info) { - const auto *uk = CpuScaleKernel::get_implementation(ScaleKernelDataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa(), info.interpolation_policy }); + const auto *uk = CpuScaleKernel::get_implementation( + ScaleKernelDataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa(), info.interpolation_policy}); ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON(dst == src); - ARM_COMPUTE_RETURN_ERROR_ON(src->num_channels()!=1); - ARM_COMPUTE_RETURN_ERROR_ON(info.sampling_policy != SamplingPolicy::CENTER && info.sampling_policy != SamplingPolicy::TOP_LEFT); + ARM_COMPUTE_RETURN_ERROR_ON(src->num_channels() != 1); + ARM_COMPUTE_RETURN_ERROR_ON(info.sampling_policy != SamplingPolicy::CENTER && + info.sampling_policy != SamplingPolicy::TOP_LEFT); ARM_COMPUTE_UNUSED(info.constant_border_value); ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.use_padding, "Padding is not supported"); @@ -153,27 +124,30 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dx, const I ARM_COMPUTE_RETURN_ERROR_ON(output_width == 0); ARM_COMPUTE_RETURN_ERROR_ON(output_height == 0); - ARM_COMPUTE_RETURN_ERROR_ON((src->data_type() == DataType::S8) && (data_layout != DataLayout::NHWC || info.interpolation_policy != InterpolationPolicy::BILINEAR - || info.border_mode != BorderMode::REPLICATE)); + ARM_COMPUTE_RETURN_ERROR_ON((src->data_type() == DataType::S8) && + (data_layout != DataLayout::NHWC || + info.interpolation_policy != InterpolationPolicy::BILINEAR || + info.border_mode != BorderMode::REPLICATE)); - if(info.interpolation_policy == InterpolationPolicy::NEAREST_NEIGHBOR && offsets != nullptr) + if (info.interpolation_policy == InterpolationPolicy::NEAREST_NEIGHBOR && offsets != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(offsets, 1, DataType::S32); } - if(info.interpolation_policy == InterpolationPolicy::BILINEAR && offsets != nullptr) + if (info.interpolation_policy == InterpolationPolicy::BILINEAR && offsets != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(offsets, 1, DataType::S32); - if(dx != nullptr && dy != nullptr) + if (dx != nullptr && dy != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dx, 1, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dy, 1, DataType::F32); } } - ARM_COMPUTE_RETURN_ERROR_ON(info.align_corners && !scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy)); + 
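// A minimal, self-contained sketch of the micro-kernel selection pattern that the
// available_kernels table and the get_implementation() lookup above rely on: each
// entry pairs a kernel name with a predicate over the tensor data type / CPU ISA
// and a function pointer, and the first entry whose predicate matches is used.
// SelectorData, UKernelEntry and pick_ukernel are illustrative stand-ins, not the
// library's actual types.
#include <functional>
#include <string>
#include <vector>

struct SelectorData
{
    bool is_f32;      // stand-in for the data-type check
    bool has_sve;     // stand-in for the ISA check
    bool is_bilinear; // stand-in for the interpolation-policy check
};

using UKernelFn = void (*)(const void *src, void *dst); // deliberately simplified signature

struct UKernelEntry
{
    std::string                               name;
    std::function<bool(const SelectorData &)> is_selected;
    UKernelFn                                 ukernel;
};

// Returns the first matching entry, or nullptr when no kernel is available.
inline const UKernelEntry *pick_ukernel(const std::vector<UKernelEntry> &table, const SelectorData &data)
{
    for (const auto &entry : table)
    {
        if (entry.is_selected(data))
        {
            return &entry;
        }
    }
    return nullptr;
}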
ARM_COMPUTE_RETURN_ERROR_ON(info.align_corners && + !scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy)); - if(info.interpolation_policy == InterpolationPolicy::AREA) + if (info.interpolation_policy == InterpolationPolicy::AREA) { ARM_COMPUTE_RETURN_ERROR_ON(data_layout != DataLayout::NCHW); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::U8); @@ -183,24 +157,28 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dx, const I } } // namespace -void CpuScaleKernel::configure(const ITensorInfo *src, const ITensorInfo *dx, const ITensorInfo *dy, const ITensorInfo *offsets, - ITensorInfo *dst, const ScaleKernelInfo &info) +void CpuScaleKernel::configure(const ITensorInfo *src, + const ITensorInfo *dx, + const ITensorInfo *dy, + const ITensorInfo *offsets, + ITensorInfo *dst, + const ScaleKernelInfo &info) { ARM_COMPUTE_UNUSED(dx, dy, offsets); ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, - dx, - dy, - offsets, - dst, - info)); - - const auto *uk = CpuScaleKernel::get_implementation(ScaleKernelDataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa(), info.interpolation_policy }); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dx, dy, offsets, dst, info)); + + const auto *uk = CpuScaleKernel::get_implementation( + ScaleKernelDataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa(), info.interpolation_policy}); ARM_COMPUTE_ERROR_ON_NULLPTR(uk); _run_method = uk->ukernel; - _name = std::string("CpuScaleKernel").append("/").append(uk->name).append("_").append(string_from_interpolation_policy(info.interpolation_policy)); + _name = std::string("CpuScaleKernel") + .append("/") + .append(uk->name) + .append("_") + .append(string_from_interpolation_policy(info.interpolation_policy)); // Get data layout and width/height indices _data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout; @@ -212,19 +190,22 @@ void CpuScaleKernel::configure(const ITensorInfo *src, const ITensorInfo *dx, co _constant_border_value = info.constant_border_value; _align_corners = info.align_corners; - if(info.sampling_policy == SamplingPolicy::CENTER) + if (info.sampling_policy == SamplingPolicy::CENTER) { _sampling_offset = 0.5f; } // Compute the ratio between source width/height and destination width/height - const auto wr = scale_utils::calculate_resize_ratio(src->dimension(idx_width), dst->dimension(idx_width), _align_corners); - const auto hr = scale_utils::calculate_resize_ratio(src->dimension(idx_height), dst->dimension(idx_height), _align_corners); + const auto wr = + scale_utils::calculate_resize_ratio(src->dimension(idx_width), dst->dimension(idx_width), _align_corners); + const auto hr = + scale_utils::calculate_resize_ratio(src->dimension(idx_height), dst->dimension(idx_height), _align_corners); // Area interpolation behaves as Nearest Neighbour in case of up-sampling - _policy = (_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) ? InterpolationPolicy::NEAREST_NEIGHBOR : _policy; + _policy = (_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) ? 
InterpolationPolicy::NEAREST_NEIGHBOR + : _policy; - if(_border_mode == BorderMode::UNDEFINED) + if (_border_mode == BorderMode::UNDEFINED) { _border_mode = BorderMode::CONSTANT; _constant_border_value = PixelValue(); @@ -232,39 +213,38 @@ void CpuScaleKernel::configure(const ITensorInfo *src, const ITensorInfo *dx, co #ifdef ENABLE_NCHW_KERNELS // Configure scale function to run - if(_data_layout == DataLayout::NCHW) + if (_data_layout == DataLayout::NCHW) { std::string function_to_call("scale_"); function_to_call += string_from_data_type(src->data_type()) + "_"; function_to_call += string_from_data_layout(_data_layout) + "_"; function_to_call += string_from_interpolation_policy(_policy); - static std::map<std::string, ScaleFunctionPtr> map_function = - { - { "scale_U8_NCHW_AREA_CONSTANT", &CpuScaleKernel::scale_area_nchw_u8 }, + static std::map<std::string, ScaleFunctionPtr> map_function = { + {"scale_U8_NCHW_AREA_CONSTANT", &CpuScaleKernel::scale_area_nchw_u8}, - { "scale_U8_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_nchw<uint8_t> }, - { "scale_U8_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<uint8_t> }, + {"scale_U8_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_nchw<uint8_t>}, + {"scale_U8_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<uint8_t>}, - { "scale_QASYMM8_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_qasymm<uint8_t> }, - { "scale_QASYMM8_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<uint8_t> }, + {"scale_QASYMM8_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_qasymm<uint8_t>}, + {"scale_QASYMM8_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<uint8_t>}, - { "scale_QASYMM8_SIGNED_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_qasymm<int8_t> }, - { "scale_QASYMM8_SIGNED_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<int8_t> }, + {"scale_QASYMM8_SIGNED_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_qasymm<int8_t>}, + {"scale_QASYMM8_SIGNED_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<int8_t>}, - { "scale_S16_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_nchw<int16_t> }, - { "scale_S16_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<int16_t> }, + {"scale_S16_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_nchw<int16_t>}, + {"scale_S16_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<int16_t>}, #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - { "scale_F16_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_nchw<float16_t> }, - { "scale_F16_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<float16_t> }, + {"scale_F16_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_nchw<float16_t>}, + {"scale_F16_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<float16_t>}, #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - { "scale_F32_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_nchw<float> }, - { "scale_F32_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<float> }, + {"scale_F32_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_nchw<float>}, + {"scale_F32_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<float>}, }; auto it = map_function.find(function_to_call); - if(it != map_function.end()) + if (it != map_function.end()) { _func = it->second; } @@ -278,13 +258,19 @@ void CpuScaleKernel::configure(const ITensorInfo *src, const ITensorInfo *dx, co #ifdef ENABLE_NCHW_KERNELS template <typename T> -void CpuScaleKernel::scale_nearest_nchw(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, 
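// The wr / hr ratios computed above map destination coordinates back onto the
// source image. A hedged sketch of what a helper such as
// scale_utils::calculate_resize_ratio presumably returns: with align_corners the
// first and last samples of input and output coincide, otherwise the plain size
// quotient is used. This is an assumption for illustration, not the library's
// exact implementation.
#include <cstddef>

inline float resize_ratio(std::size_t in_size, std::size_t out_size, bool align_corners)
{
    if (align_corners && out_size > 1)
    {
        return static_cast<float>(in_size - 1) / static_cast<float>(out_size - 1);
    }
    return static_cast<float>(in_size) / static_cast<float>(out_size);
}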
const Window &window) +void CpuScaleKernel::scale_nearest_nchw(const ITensor *src, + ITensor *dst, + const ITensor *dx, + const ITensor *dy, + const ITensor *offsets, + const Window &window) { ARM_COMPUTE_UNUSED(dx, dy); const size_t in_stride_x = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), _align_corners); + const auto hr = + scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), _align_corners); // Don't increment in X and Y direction for the input tensor // A pointer to the start of this plane is needed as base for the precomputed offsets @@ -296,7 +282,7 @@ void CpuScaleKernel::scale_nearest_nchw(const ITensor *src, ITensor *dst, const Window win_off; win_off.set(Window::DimX, window[Window::DimX]); win_off.set(Window::DimY, window[Window::DimY]); - for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d) + for (size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d) { win_off.set(d, Window::Dimension(0, 0, 0)); } @@ -305,24 +291,33 @@ void CpuScaleKernel::scale_nearest_nchw(const ITensor *src, ITensor *dst, const Iterator src_i(src, win_in); Iterator dst_i(dst, window); Iterator offsets_i(offsets, win_off); - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets_i.ptr()); - const auto in_yi = static_cast<int32_t>(_align_corners ? utils::rounding::round_half_away_from_zero((id.y() + _sampling_offset) * hr) : std::floor(( - id.y() + _sampling_offset) - * hr)); - const int32_t offset_row = in_yi * in_stride_x; - *reinterpret_cast<T *>(dst_i.ptr()) = *(reinterpret_cast<const T *>(src_i.ptr()) + offsets_ptr[0] + offset_row); - }, - src_i, offsets_i, dst_i); + execute_window_loop( + window, + [&](const Coordinates &id) + { + const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets_i.ptr()); + const auto in_yi = static_cast<int32_t>( + _align_corners ? 
utils::rounding::round_half_away_from_zero((id.y() + _sampling_offset) * hr) + : std::floor((id.y() + _sampling_offset) * hr)); + const int32_t offset_row = in_yi * in_stride_x; + *reinterpret_cast<T *>(dst_i.ptr()) = + *(reinterpret_cast<const T *>(src_i.ptr()) + offsets_ptr[0] + offset_row); + }, + src_i, offsets_i, dst_i); } template <typename T> -void CpuScaleKernel::scale_bilinear_nchw(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window) +void CpuScaleKernel::scale_bilinear_nchw(const ITensor *src, + ITensor *dst, + const ITensor *dx, + const ITensor *dy, + const ITensor *offsets, + const Window &window) { // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), _align_corners); - Window win_off; + const auto hr = + scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), _align_corners); + Window win_off; win_off.set(Window::DimX, window.x()); win_off.set(Window::DimY, window.y()); @@ -332,7 +327,7 @@ void CpuScaleKernel::scale_bilinear_nchw(const ITensor *src, ITensor *dst, const win_in.set(Window::DimX, Window::Dimension(0, 0, 0)); win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); - for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d) + for (size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d) { win_off.set(d, Window::Dimension(0, 0, 0)); } @@ -347,7 +342,7 @@ void CpuScaleKernel::scale_bilinear_nchw(const ITensor *src, ITensor *dst, const const int32_t in_dim_h = src->info()->dimension(1); const int32_t in_stride_w = in_dim_w + src->info()->padding().left + src->info()->padding().right; - if(_border_mode == BorderMode::CONSTANT) + if (_border_mode == BorderMode::CONSTANT) { #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC using ConstType = typename std::conditional<std::is_same<T, float16_t>::value, half, T>::type; @@ -355,52 +350,60 @@ void CpuScaleKernel::scale_bilinear_nchw(const ITensor *src, ITensor *dst, const using ConstType = T; #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ const T const_border_value = static_cast<T>(_constant_border_value.get<ConstType>()); - execute_window_loop(window, [&](const Coordinates & id) - { - const int32_t index_h = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset); - const auto index_w = *(reinterpret_cast<const int32_t *>(offsets_i.ptr())); - const auto dx_val = *(reinterpret_cast<const float *>(dx_i.ptr())); - const auto dy_val = *(reinterpret_cast<const float *>(dy_i.ptr())); - const auto pixel_row_ptr = reinterpret_cast<const T *>(src_i.ptr()); - - const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h) ? (*(pixel_row_ptr + index_w + index_h * in_stride_w)) : const_border_value; - const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h) ? (*(pixel_row_ptr + index_w + 1 + index_h * in_stride_w)) : const_border_value; - const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h - && index_h < in_dim_h - 1) ? - (*(pixel_row_ptr + index_w + index_h * in_stride_w + in_stride_w)) : - const_border_value; - const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h - && index_h < in_dim_h - 1) ? 
- (*(pixel_row_ptr + index_w + 1 + index_h * in_stride_w + in_stride_w)) : - const_border_value; - - *reinterpret_cast<T *>(dst_i.ptr()) = static_cast<T>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); - }, - src_i, offsets_i, dx_i, dy_i, dst_i); + execute_window_loop( + window, + [&](const Coordinates &id) + { + const int32_t index_h = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset); + const auto index_w = *(reinterpret_cast<const int32_t *>(offsets_i.ptr())); + const auto dx_val = *(reinterpret_cast<const float *>(dx_i.ptr())); + const auto dy_val = *(reinterpret_cast<const float *>(dy_i.ptr())); + const auto pixel_row_ptr = reinterpret_cast<const T *>(src_i.ptr()); + + const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h) + ? (*(pixel_row_ptr + index_w + index_h * in_stride_w)) + : const_border_value; + const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h) + ? (*(pixel_row_ptr + index_w + 1 + index_h * in_stride_w)) + : const_border_value; + const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h && index_h < in_dim_h - 1) + ? (*(pixel_row_ptr + index_w + index_h * in_stride_w + in_stride_w)) + : const_border_value; + const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h && index_h < in_dim_h - 1) + ? (*(pixel_row_ptr + index_w + 1 + index_h * in_stride_w + in_stride_w)) + : const_border_value; + + *reinterpret_cast<T *>(dst_i.ptr()) = + static_cast<T>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); + }, + src_i, offsets_i, dx_i, dy_i, dst_i); } - else if(_border_mode == BorderMode::REPLICATE) - { - execute_window_loop(window, [&](const Coordinates & id) - { - const int index_h = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset); - const auto index_w = *(reinterpret_cast<const int32_t *>(offsets_i.ptr())); - const auto dx_val = *(reinterpret_cast<const float *>(dx_i.ptr())); - const auto dy_val = *(reinterpret_cast<const float *>(dy_i.ptr())); - const auto pixel_row_ptr = reinterpret_cast<const T *>(src_i.ptr()); - - auto clamped_x = utility::clamp<int>(index_w, 0, in_dim_w - 1); - auto clamped_x1 = utility::clamp<int>(index_w + 1, 0, in_dim_w - 1); - auto clamped_y = utility::clamp<int>(index_h, 0, in_dim_h - 1); - auto clamped_y1 = utility::clamp<int>(index_h + 1, 0, in_dim_h - 1); - - const auto a00 = *(pixel_row_ptr + clamped_x + clamped_y * in_stride_w); - const auto a01 = *(pixel_row_ptr + clamped_x1 + clamped_y * in_stride_w); - const auto a10 = *(pixel_row_ptr + clamped_x + clamped_y1 * in_stride_w); - const auto a11 = *(pixel_row_ptr + clamped_x1 + clamped_y1 * in_stride_w); - - *reinterpret_cast<T *>(dst_i.ptr()) = static_cast<T>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); - }, - src_i, offsets_i, dx_i, dy_i, dst_i); + else if (_border_mode == BorderMode::REPLICATE) + { + execute_window_loop( + window, + [&](const Coordinates &id) + { + const int index_h = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset); + const auto index_w = *(reinterpret_cast<const int32_t *>(offsets_i.ptr())); + const auto dx_val = *(reinterpret_cast<const float *>(dx_i.ptr())); + const auto dy_val = *(reinterpret_cast<const float *>(dy_i.ptr())); + const auto pixel_row_ptr = reinterpret_cast<const T *>(src_i.ptr()); + + auto clamped_x = utility::clamp<int>(index_w, 0, in_dim_w - 1); + auto clamped_x1 = utility::clamp<int>(index_w + 1, 0, in_dim_w - 1); + auto clamped_y = 
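// Hedged restatement of the arithmetic the bilinear branches above depend on:
// scale_helpers::delta_bilinear presumably blends the four neighbouring samples
// a00 = (x, y), a01 = (x + 1, y), a10 = (x, y + 1), a11 = (x + 1, y + 1) with the
// fractional offsets dx_val / dy_val. bilinear_blend below is an illustrative
// scalar version of that standard formula, not the library's implementation.
inline float bilinear_blend(float a00, float a01, float a10, float a11, float dx, float dy)
{
    const float dx1 = 1.0f - dx;
    const float dy1 = 1.0f - dy;
    // Weight each corner by the area of the opposite sub-rectangle.
    return a00 * dx1 * dy1 + a01 * dx * dy1 + a10 * dx1 * dy + a11 * dx * dy;
}
// With BorderMode::CONSTANT the out-of-range corners fall back to
// const_border_value (see the range checks above); with BorderMode::REPLICATE the
// coordinates are clamped to [0, in_dim - 1] first, so edge pixels are reused.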
utility::clamp<int>(index_h, 0, in_dim_h - 1); + auto clamped_y1 = utility::clamp<int>(index_h + 1, 0, in_dim_h - 1); + + const auto a00 = *(pixel_row_ptr + clamped_x + clamped_y * in_stride_w); + const auto a01 = *(pixel_row_ptr + clamped_x1 + clamped_y * in_stride_w); + const auto a10 = *(pixel_row_ptr + clamped_x + clamped_y1 * in_stride_w); + const auto a11 = *(pixel_row_ptr + clamped_x1 + clamped_y1 * in_stride_w); + + *reinterpret_cast<T *>(dst_i.ptr()) = + static_cast<T>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); + }, + src_i, offsets_i, dx_i, dy_i, dst_i); } else { @@ -408,7 +411,12 @@ void CpuScaleKernel::scale_bilinear_nchw(const ITensor *src, ITensor *dst, const } } -void CpuScaleKernel::scale_area_nchw_u8(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window) +void CpuScaleKernel::scale_area_nchw_u8(const ITensor *src, + ITensor *dst, + const ITensor *dx, + const ITensor *dy, + const ITensor *offsets, + const Window &window) { ARM_COMPUTE_UNUSED(dx, dy, offsets); using namespace scale_helpers; @@ -425,50 +433,60 @@ void CpuScaleKernel::scale_area_nchw_u8(const ITensor *src, ITensor *dst, const Iterator src_i(src, win_in); Iterator dst_i(dst, window); - const auto wr = scale_utils::calculate_resize_ratio(src->info()->dimension(0), dst->info()->dimension(0), _align_corners); - const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), _align_corners); + const auto wr = + scale_utils::calculate_resize_ratio(src->info()->dimension(0), dst->info()->dimension(0), _align_corners); + const auto hr = + scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), _align_corners); const auto w = src->info()->dimension(0); const auto h = src->info()->dimension(1); const size_t in_stride = src->info()->strides_in_bytes()[1]; - execute_window_loop(window, [&](const Coordinates & id) - { - const auto in_ptr = reinterpret_cast<const uint8_t *>(src_i.ptr()); - - uint8x8_t tmp0 = vdup_n_u8(0); - tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x(), id.y()), tmp0, 0); - tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 1, id.y()), tmp0, 1); - tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 2, id.y()), tmp0, 2); - tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 3, id.y()), tmp0, 3); - tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 4, id.y()), tmp0, 4); - tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 5, id.y()), tmp0, 5); - tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 6, id.y()), tmp0, 6); - tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 7, id.y()), tmp0, 7); - - uint8x8_t tmp1 = vdup_n_u8(0); - tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 8, id.y()), tmp1, 0); - tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 9, id.y()), tmp1, 1); - tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 10, id.y()), tmp1, 2); - tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 11, id.y()), tmp1, 3); - tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 12, id.y()), tmp1, 4); - tmp1 = 
vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 13, id.y()), tmp1, 5); - tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 14, id.y()), tmp1, 6); - tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 15, id.y()), tmp1, 7); - - vst1q_u8(dst_i.ptr(), vcombine_u8(tmp0, tmp1)); - }, - src_i, dst_i); + execute_window_loop( + window, + [&](const Coordinates &id) + { + const auto in_ptr = reinterpret_cast<const uint8_t *>(src_i.ptr()); + + uint8x8_t tmp0 = vdup_n_u8(0); + tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x(), id.y()), tmp0, 0); + tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 1, id.y()), tmp0, 1); + tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 2, id.y()), tmp0, 2); + tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 3, id.y()), tmp0, 3); + tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 4, id.y()), tmp0, 4); + tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 5, id.y()), tmp0, 5); + tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 6, id.y()), tmp0, 6); + tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 7, id.y()), tmp0, 7); + + uint8x8_t tmp1 = vdup_n_u8(0); + tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 8, id.y()), tmp1, 0); + tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 9, id.y()), tmp1, 1); + tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 10, id.y()), tmp1, 2); + tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 11, id.y()), tmp1, 3); + tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 12, id.y()), tmp1, 4); + tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 13, id.y()), tmp1, 5); + tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 14, id.y()), tmp1, 6); + tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 15, id.y()), tmp1, 7); + + vst1q_u8(dst_i.ptr(), vcombine_u8(tmp0, tmp1)); + }, + src_i, dst_i); } template <typename T> -void CpuScaleKernel::scale_bilinear_qasymm(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window) +void CpuScaleKernel::scale_bilinear_qasymm(const ITensor *src, + ITensor *dst, + const ITensor *dx, + const ITensor *dy, + const ITensor *offsets, + const Window &window) { // Get data layout and width/height indices const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(idx_height), dst->info()->dimension(idx_height), _align_corners); + const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(idx_height), + dst->info()->dimension(idx_height), _align_corners); Window win_off; win_off.set(Window::DimX, Window::Dimension(0, 0, 0)); win_off.set(Window::DimY, Window::Dimension(0, 0, 0)); @@ -479,7 +497,7 @@ void CpuScaleKernel::scale_bilinear_qasymm(const 
ITensor *src, ITensor *dst, con win_in.set(idx_width, Window::Dimension(0, 0, 0)); win_in.set(idx_height, Window::Dimension(0, 0, 0)); - for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d) + for (size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d) { win_off.set(d, Window::Dimension(0, 0, 0)); } @@ -495,7 +513,7 @@ void CpuScaleKernel::scale_bilinear_qasymm(const ITensor *src, ITensor *dst, con const UniformQuantizationInfo iq_info = src->info()->quantization_info().uniform(); const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); - if(_border_mode == BorderMode::CONSTANT) + if (_border_mode == BorderMode::CONSTANT) { #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC using ConstType = typename std::conditional<std::is_same<T, float16_t>::value, half, T>::type; @@ -503,62 +521,74 @@ void CpuScaleKernel::scale_bilinear_qasymm(const ITensor *src, ITensor *dst, con using ConstType = T; #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ const T const_border_value = static_cast<T>(_constant_border_value.get<ConstType>()); - execute_window_loop(window, [&](const Coordinates & id) - { - const int32_t index_h = std::floor((id[idx_height] + _sampling_offset) * hr - _sampling_offset); - const int32_t index_w = *(reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); - const auto dx_val = *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); - const auto dy_val = *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); - const auto pixel_row_ptr = reinterpret_cast<const T *>(src_i.ptr()); - - const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h) ? - (*(pixel_row_ptr + index_w * stride_w + index_h * stride_h)) : - const_border_value; - const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h) ? - (*(pixel_row_ptr + (index_w + 1) * stride_w + index_h * stride_h)) : - const_border_value; - const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h && index_h < in_dim_h - 1) ? - (*(pixel_row_ptr + index_w * stride_w + (index_h + 1) * stride_h)) : - const_border_value; - const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h && index_h < in_dim_h - 1) ? 
- (*(pixel_row_ptr + (index_w + 1) * stride_w + (index_h + 1) * stride_h)) : - const_border_value; - - const float inp00 = Qasymm8QuantizationHelper<T>::dequantize(a00, iq_info); - const float inp01 = Qasymm8QuantizationHelper<T>::dequantize(a01, iq_info); - const float inp10 = Qasymm8QuantizationHelper<T>::dequantize(a10, iq_info); - const float inp11 = Qasymm8QuantizationHelper<T>::dequantize(a11, iq_info); - *reinterpret_cast<T *>(dst_i.ptr()) = Qasymm8QuantizationHelper<T>::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info); - }, - src_i, dst_i); + execute_window_loop( + window, + [&](const Coordinates &id) + { + const int32_t index_h = std::floor((id[idx_height] + _sampling_offset) * hr - _sampling_offset); + const int32_t index_w = *(reinterpret_cast<const int32_t *>( + offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); + const auto dx_val = + *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); + const auto dy_val = + *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); + const auto pixel_row_ptr = reinterpret_cast<const T *>(src_i.ptr()); + + const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h) + ? (*(pixel_row_ptr + index_w * stride_w + index_h * stride_h)) + : const_border_value; + const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h) + ? (*(pixel_row_ptr + (index_w + 1) * stride_w + index_h * stride_h)) + : const_border_value; + const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h && index_h < in_dim_h - 1) + ? (*(pixel_row_ptr + index_w * stride_w + (index_h + 1) * stride_h)) + : const_border_value; + const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h && index_h < in_dim_h - 1) + ? 
(*(pixel_row_ptr + (index_w + 1) * stride_w + (index_h + 1) * stride_h)) + : const_border_value; + + const float inp00 = Qasymm8QuantizationHelper<T>::dequantize(a00, iq_info); + const float inp01 = Qasymm8QuantizationHelper<T>::dequantize(a01, iq_info); + const float inp10 = Qasymm8QuantizationHelper<T>::dequantize(a10, iq_info); + const float inp11 = Qasymm8QuantizationHelper<T>::dequantize(a11, iq_info); + *reinterpret_cast<T *>(dst_i.ptr()) = Qasymm8QuantizationHelper<T>::quantize( + scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info); + }, + src_i, dst_i); } - else if(_border_mode == BorderMode::REPLICATE) - { - execute_window_loop(window, [&](const Coordinates & id) - { - const int index_h = std::floor((id[idx_height] + _sampling_offset) * hr - _sampling_offset); - const int32_t index_w = *(reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); - const auto dx_val = *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); - const auto dy_val = *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); - const auto pixel_row_ptr = reinterpret_cast<const T *>(src_i.ptr()); - - auto clamped_w = utility::clamp<int>(index_w, 0, in_dim_w - 1); - auto clamped_w1 = utility::clamp<int>(index_w + 1, 0, in_dim_w - 1); - auto clamped_h = utility::clamp<int>(index_h, 0, in_dim_h - 1); - auto clamped_h1 = utility::clamp<int>(index_h + 1, 0, in_dim_h - 1); - - const auto a00 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h * stride_h); - const auto a01 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h * stride_h); - const auto a10 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h1 * stride_h); - const auto a11 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h1 * stride_h); - - const float inp00 = Qasymm8QuantizationHelper<T>::dequantize(a00, iq_info); - const float inp01 = Qasymm8QuantizationHelper<T>::dequantize(a01, iq_info); - const float inp10 = Qasymm8QuantizationHelper<T>::dequantize(a10, iq_info); - const float inp11 = Qasymm8QuantizationHelper<T>::dequantize(a11, iq_info); - *reinterpret_cast<T *>(dst_i.ptr()) = Qasymm8QuantizationHelper<T>::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info); - }, - src_i, dst_i); + else if (_border_mode == BorderMode::REPLICATE) + { + execute_window_loop( + window, + [&](const Coordinates &id) + { + const int index_h = std::floor((id[idx_height] + _sampling_offset) * hr - _sampling_offset); + const int32_t index_w = *(reinterpret_cast<const int32_t *>( + offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); + const auto dx_val = + *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); + const auto dy_val = + *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); + const auto pixel_row_ptr = reinterpret_cast<const T *>(src_i.ptr()); + + auto clamped_w = utility::clamp<int>(index_w, 0, in_dim_w - 1); + auto clamped_w1 = utility::clamp<int>(index_w + 1, 0, in_dim_w - 1); + auto clamped_h = utility::clamp<int>(index_h, 0, in_dim_h - 1); + auto clamped_h1 = utility::clamp<int>(index_h + 1, 0, in_dim_h - 1); + + const auto a00 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h * stride_h); + const auto a01 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h * stride_h); + const auto a10 = *(pixel_row_ptr + clamped_w * stride_w + 
clamped_h1 * stride_h); + const auto a11 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h1 * stride_h); + + const float inp00 = Qasymm8QuantizationHelper<T>::dequantize(a00, iq_info); + const float inp01 = Qasymm8QuantizationHelper<T>::dequantize(a01, iq_info); + const float inp10 = Qasymm8QuantizationHelper<T>::dequantize(a10, iq_info); + const float inp11 = Qasymm8QuantizationHelper<T>::dequantize(a11, iq_info); + *reinterpret_cast<T *>(dst_i.ptr()) = Qasymm8QuantizationHelper<T>::quantize( + scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info); + }, + src_i, dst_i); } else { @@ -567,8 +597,12 @@ void CpuScaleKernel::scale_bilinear_qasymm(const ITensor *src, ITensor *dst, con } #endif // ENABLE_NCHW_KERNELS -Status CpuScaleKernel::validate(const ITensorInfo *input, const ITensorInfo *dx, const ITensorInfo *dy, - const ITensorInfo *offsets, ITensorInfo *output, const ScaleKernelInfo &info) +Status CpuScaleKernel::validate(const ITensorInfo *input, + const ITensorInfo *dx, + const ITensorInfo *dy, + const ITensorInfo *offsets, + ITensorInfo *output, + const ScaleKernelInfo &info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, dx, dy, offsets, output, info)); return Status{}; @@ -588,13 +622,14 @@ void CpuScaleKernel::run_op(ITensorPack &tensors, const Window &window, const Th const auto dy = tensors.get_const_tensor(TensorType::ACL_INT_1); const auto offsets = tensors.get_const_tensor(TensorType::ACL_INT_2); - if(_data_layout == DataLayout::NCHW) + if (_data_layout == DataLayout::NCHW) { (this->*_func)(src, dst, dx, dy, offsets, window); } else { - _run_method(src, dst, offsets, dx, dy, _policy, _border_mode, _constant_border_value, _sampling_offset, _align_corners, window); + _run_method(src, dst, offsets, dx, dy, _policy, _border_mode, _constant_border_value, _sampling_offset, + _align_corners, window); } } diff --git a/src/cpu/kernels/CpuScaleKernel.h b/src/cpu/kernels/CpuScaleKernel.h index 8102142fc3..38142df021 100644 --- a/src/cpu/kernels/CpuScaleKernel.h +++ b/src/cpu/kernels/CpuScaleKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_SCALEKERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" @@ -39,9 +40,19 @@ class CpuScaleKernel : public ICpuKernel<CpuScaleKernel> { private: /** Scale function to use for the particular function to use */ - using ScaleFunctionPtr = void (CpuScaleKernel::*)(const ITensor *, ITensor *, const ITensor *, const ITensor *, const ITensor *, const Window &window); - using ScaleKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, const ITensor *, const ITensor *, const ITensor *, - InterpolationPolicy, BorderMode, PixelValue, float, bool, const Window &)>::type; + using ScaleFunctionPtr = void (CpuScaleKernel::*)( + const ITensor *, ITensor *, const ITensor *, const ITensor *, const ITensor *, const Window &window); + using ScaleKernelPtr = std::add_pointer<void(const ITensor *, + ITensor *, + const ITensor *, + const ITensor *, + const ITensor *, + InterpolationPolicy, + BorderMode, + PixelValue, + float, + bool, + const Window &)>::type; public: CpuScaleKernel() = default; @@ -59,7 +70,11 @@ public: * @param[out] dst Destination tensor info. Data types supported: Same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane. 
* @param[in] info @ref ScaleKernelInfo to use for configuration */ - void configure(const ITensorInfo *src, const ITensorInfo *dx, const ITensorInfo *dy, const ITensorInfo *offsets, ITensorInfo *dst, + void configure(const ITensorInfo *src, + const ITensorInfo *dx, + const ITensorInfo *dy, + const ITensorInfo *offsets, + ITensorInfo *dst, const ScaleKernelInfo &info); /** Static function to check if given info will lead to a valid configuration * @@ -67,11 +82,15 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dx, const ITensorInfo *dy, const ITensorInfo *offsets, ITensorInfo *dst, + static Status validate(const ITensorInfo *src, + const ITensorInfo *dx, + const ITensorInfo *dy, + const ITensorInfo *offsets, + ITensorInfo *dst, const ScaleKernelInfo &info); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; struct ScaleKernel @@ -89,28 +108,48 @@ private: * * @note Used only in case down-sampling. */ - void scale_area_nchw_u8(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window); + void scale_area_nchw_u8(const ITensor *src, + ITensor *dst, + const ITensor *dx, + const ITensor *dy, + const ITensor *offsets, + const Window &window); /** function to perform scale using bilinear interpolation on the given window */ template <typename T> - void scale_bilinear_nchw(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window); + void scale_bilinear_nchw(const ITensor *src, + ITensor *dst, + const ITensor *dx, + const ITensor *dy, + const ITensor *offsets, + const Window &window); /** function to perform scale using bilinear interpolation on the given window */ template <typename T> - void scale_bilinear_qasymm(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window); + void scale_bilinear_qasymm(const ITensor *src, + ITensor *dst, + const ITensor *dx, + const ITensor *dy, + const ITensor *offsets, + const Window &window); /** function to perform scale using nearest neighbour on the given window */ template <typename T> - void scale_nearest_nchw(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window); + void scale_nearest_nchw(const ITensor *src, + ITensor *dst, + const ITensor *dx, + const ITensor *dy, + const ITensor *offsets, + const Window &window); #endif // ENABLE_NCHW_KERNELS - ScaleFunctionPtr _func{ nullptr }; + ScaleFunctionPtr _func{nullptr}; InterpolationPolicy _policy{}; BorderMode _border_mode{}; PixelValue _constant_border_value{}; - float _sampling_offset{ 0 }; - bool _align_corners{ false }; - DataLayout _data_layout{ DataLayout::UNKNOWN }; - ScaleKernelPtr _run_method{ nullptr }; + float _sampling_offset{0}; + bool _align_corners{false}; + DataLayout _data_layout{DataLayout::UNKNOWN}; + ScaleKernelPtr _run_method{nullptr}; std::string _name{}; }; } // namespace kernels diff --git a/src/cpu/kernels/CpuSoftmaxKernel.cpp b/src/cpu/kernels/CpuSoftmaxKernel.cpp index e06ab9917c..ce144351f8 100644 --- a/src/cpu/kernels/CpuSoftmaxKernel.cpp +++ b/src/cpu/kernels/CpuSoftmaxKernel.cpp @@ -30,11 +30,11 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include 
"arm_compute/core/Window.h" + +#include "src/core/common/Registrars.h" #include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - -#include "src/core/common/Registrars.h" #include "src/cpu/kernels/softmax/list.h" namespace arm_compute @@ -46,61 +46,44 @@ namespace kernels namespace { /* Softmax Logits 1D Max - identifying the max value of 1D Logits */ -static const std::vector<CpuLogits1DMaxKernel::SoftmaxLogits1DMaxKernel> available_kernels_max_logits = -{ - { - "sve_fp32_logits_1d_max", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F32) && data.isa.sve; }, - REGISTER_FP32_SVE(sve_fp32_logits) - }, - { - "sve_fp16_logits_1d_max", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F16) && data.isa.sve && data.isa.fp16; }, - REGISTER_FP16_SVE(sve_fp16_logits) - }, - { - "sve_qu8_logits_1d_max", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8) && data.isa.sve; }, - REGISTER_QASYMM8_SVE(sve_qasymm8_logits) - }, - { - "sve_qs8_logits_1d_max", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve; }, - REGISTER_QASYMM8_SIGNED_SVE(sve_qasymm8_signed_logits) - }, - { - "neon_fp32_logits_1d_max", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F32); }, - REGISTER_FP32_NEON(neon_fp32_logits) - }, - { - "neon_fp16_logits_1d_max", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F16) && data.isa.fp16; }, - REGISTER_FP16_NEON(neon_fp16_logits) - }, - { - "neon_qu8_logits_1d_max", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8); }, - REGISTER_QASYMM8_NEON(neon_qasymm8_logits) - }, - { - "neon_qs8_logits_1d_max", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); }, - REGISTER_QASYMM8_SIGNED_NEON(neon_qasymm8_singed_logits) - }, +static const std::vector<CpuLogits1DMaxKernel::SoftmaxLogits1DMaxKernel> available_kernels_max_logits = { + {"sve_fp32_logits_1d_max", + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32) && data.isa.sve; }, + REGISTER_FP32_SVE(sve_fp32_logits)}, + {"sve_fp16_logits_1d_max", + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.sve && data.isa.fp16; }, + REGISTER_FP16_SVE(sve_fp16_logits)}, + {"sve_qu8_logits_1d_max", + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8) && data.isa.sve; }, + REGISTER_QASYMM8_SVE(sve_qasymm8_logits)}, + {"sve_qs8_logits_1d_max", + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve; }, + REGISTER_QASYMM8_SIGNED_SVE(sve_qasymm8_signed_logits)}, + {"neon_fp32_logits_1d_max", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32); }, + REGISTER_FP32_NEON(neon_fp32_logits)}, + {"neon_fp16_logits_1d_max", + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.fp16; }, + REGISTER_FP16_NEON(neon_fp16_logits)}, + {"neon_qu8_logits_1d_max", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8); }, + REGISTER_QASYMM8_NEON(neon_qasymm8_logits)}, + {"neon_qs8_logits_1d_max", + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED); }, + REGISTER_QASYMM8_SIGNED_NEON(neon_qasymm8_singed_logits)}, }; Status validate_arguments_logits_1d_max(const ITensorInfo &input, const ITensorInfo 
&output) { ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); // Validate in case of configured output - if(output.total_size() != 0) + if (output.total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&input, &output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output.tensor_shape(), TensorShape(input.tensor_shape()).set(0, 1)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output.tensor_shape(), + TensorShape(input.tensor_shape()).set(0, 1)); } return Status{}; @@ -121,7 +104,7 @@ void CpuLogits1DMaxKernel::configure(const ITensorInfo *src, ITensorInfo *dst) // Output auto initialization if not yet initialized auto_init_if_empty(*dst, output_shape, 1, src->data_type(), src->quantization_info()); - const auto *uk = get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() }); + const auto *uk = get_implementation(DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()}); ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); _run_method = uk->ukernel; @@ -158,60 +141,46 @@ const char *CpuLogits1DMaxKernel::name() const } /* Softmax Logits 1D - computation for QASYMM8 with pre-computed max. */ -template <bool IS_LOG> -static const std::vector<typename CpuLogits1DSoftmaxKernel<IS_LOG>::SoftmaxLogits1DKernel> available_kernels_logits = -{ - { - "sve2_qu8_softmax_logits_1d", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8) && data.isa.sve2; }, - REGISTER_QASYMM8_SVE2(sve2_qasymm8_softmax) - }, - { - "sve2_qs8_softmax_logits_1d", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve2; }, - REGISTER_QASYMM8_SIGNED_SVE2(sve2_qasymm8_signed_softmax) - }, - { - "sve_fp32_softmax_logits_1d", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F32) && data.isa.sve; }, - REGISTER_FP32_SVE(sve_fp32_softmax) - }, - { - "sve_fp16_softmax_logits_1d", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F16) && data.isa.sve && data.isa.fp16; }, - REGISTER_FP16_SVE(sve_fp16_softmax) - }, - - { - "neon_fp32_softmax_logits_1d", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F32); }, - REGISTER_FP32_NEON(neon_fp32_softmax) - }, - { - "neon_fp16_softmax_logits_1d", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F16) && data.isa.fp16; }, - REGISTER_FP16_NEON(neon_fp16_softmax) - }, - { - "neon_qu8_softmax_logits_1d", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8); }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_softmax) - }, - { - "neon_qs8_softmax_logits_1d", - [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_softmax) - }, +template <bool IS_LOG> +static const std::vector<typename CpuLogits1DSoftmaxKernel<IS_LOG>::SoftmaxLogits1DKernel> available_kernels_logits = { + {"sve2_qu8_softmax_logits_1d", + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8) && data.isa.sve2; }, + 
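// The two kernel families above split softmax into a "logits 1D max" pass and a
// softmax pass that consumes the pre-computed row maximum for numerical
// stability. A hedged scalar restatement of that textbook computation (the
// selected NEON/SVE ukernels are vectorised and quantisation-aware; this is not
// their implementation):
#include <algorithm>
#include <cmath>
#include <cstddef>

inline void softmax_1d(const float *in, float *out, std::size_t len, float beta, bool is_log)
{
    // Assumes len > 0.
    float max_val = in[0];
    for (std::size_t i = 1; i < len; ++i)
    {
        max_val = std::max(max_val, in[i]); // pass 1: row maximum
    }
    float sum = 0.f;
    for (std::size_t i = 0; i < len; ++i)
    {
        out[i] = (in[i] - max_val) * beta; // shift by the maximum before exponentiating
        sum += std::exp(out[i]);
    }
    for (std::size_t i = 0; i < len; ++i)
    {
        out[i] = is_log ? out[i] - std::log(sum) : std::exp(out[i]) / sum;
    }
}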
REGISTER_QASYMM8_SVE2(sve2_qasymm8_softmax)}, + {"sve2_qs8_softmax_logits_1d", + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve2; }, + REGISTER_QASYMM8_SIGNED_SVE2(sve2_qasymm8_signed_softmax)}, + {"sve_fp32_softmax_logits_1d", + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32) && data.isa.sve; }, + REGISTER_FP32_SVE(sve_fp32_softmax)}, + {"sve_fp16_softmax_logits_1d", + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.sve && data.isa.fp16; }, + REGISTER_FP16_SVE(sve_fp16_softmax)}, + + {"neon_fp32_softmax_logits_1d", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32); }, + REGISTER_FP32_NEON(neon_fp32_softmax)}, + {"neon_fp16_softmax_logits_1d", + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.fp16; }, + REGISTER_FP16_NEON(neon_fp16_softmax)}, + {"neon_qu8_softmax_logits_1d", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8); }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_softmax)}, + {"neon_qs8_softmax_logits_1d", + [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED); }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_softmax)}, }; namespace { -Status validate_arguments_logits_softmax(const ITensorInfo &src, const ITensorInfo &max, - const ITensorInfo &dst, const float beta, const ITensorInfo &tmp, bool is_log) +Status validate_arguments_logits_softmax(const ITensorInfo &src, + const ITensorInfo &max, + const ITensorInfo &dst, + const float beta, + const ITensorInfo &tmp, + bool is_log) { ARM_COMPUTE_UNUSED(beta); // Check input ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src.data_type()); @@ -221,16 +190,18 @@ Status validate_arguments_logits_softmax(const ITensorInfo &src, const ITensorIn ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&src, &max); // Check output if configured - if(dst.total_size() != 0) + if (dst.total_size() != 0) { - const QuantizationInfo output_quantization = is_quantized_asymmetric ? arm_compute::get_softmax_output_quantization_info(src.data_type(), is_log) : dst.quantization_info(); + const QuantizationInfo output_quantization = + is_quantized_asymmetric ? arm_compute::get_softmax_output_quantization_info(src.data_type(), is_log) + : dst.quantization_info(); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &dst); ARM_COMPUTE_RETURN_ERROR_ON(dst.quantization_info() != output_quantization); } // Check tmp if configured - if(tmp.total_size() != 0) + if (tmp.total_size() != 0) { const DataType tmp_data_type = is_quantized_asymmetric ? 
DataType::F32 : src.data_type(); ARM_COMPUTE_RETURN_ERROR_ON(tmp.data_type() != tmp_data_type); @@ -243,14 +214,16 @@ Status validate_arguments_logits_softmax(const ITensorInfo &src, const ITensorIn } } // namespace -template <bool IS_LOG> -const std::vector<typename CpuLogits1DSoftmaxKernel<IS_LOG>::SoftmaxLogits1DKernel> &CpuLogits1DSoftmaxKernel<IS_LOG>::get_available_kernels() +template <bool IS_LOG> +const std::vector<typename CpuLogits1DSoftmaxKernel<IS_LOG>::SoftmaxLogits1DKernel> & +CpuLogits1DSoftmaxKernel<IS_LOG>::get_available_kernels() { return available_kernels_logits<IS_LOG>; } template <bool IS_LOG> -void CpuLogits1DSoftmaxKernel<IS_LOG>::configure(const ITensorInfo *src, const ITensorInfo *max, ITensorInfo *dst, const float beta, ITensorInfo *tmp) +void CpuLogits1DSoftmaxKernel<IS_LOG>::configure( + const ITensorInfo *src, const ITensorInfo *max, ITensorInfo *dst, const float beta, ITensorInfo *tmp) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, max, dst, tmp); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_softmax(*src, *max, *dst, beta, *tmp, IS_LOG)); @@ -259,17 +232,21 @@ void CpuLogits1DSoftmaxKernel<IS_LOG>::configure(const ITensorInfo *src, const I const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src->data_type()); // Output auto initialization if not yet initialized - const QuantizationInfo output_quantization = is_quantized_asymmetric ? arm_compute::get_softmax_output_quantization_info(src->data_type(), IS_LOG) : dst->quantization_info(); + const QuantizationInfo output_quantization = + is_quantized_asymmetric ? arm_compute::get_softmax_output_quantization_info(src->data_type(), IS_LOG) + : dst->quantization_info(); auto_init_if_empty(*dst, TensorInfo(*src).set_quantization_info(output_quantization).reset_padding()); // Tmp auto initialization if not yet initialized const DataType tmp_data_type = is_quantized_asymmetric ? DataType::F32 : src->data_type(); auto_init_if_empty(*tmp, TensorInfo(*src).set_data_type(tmp_data_type).reset_padding()); - const auto *uk = CpuLogits1DSoftmaxKernel<IS_LOG>::get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() }); + const auto *uk = CpuLogits1DSoftmaxKernel<IS_LOG>::get_implementation( + DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()}); ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); - std::string kernel_name = IS_LOG ? std::string("CpuLogits1DLogSoftmaxKernel") : std::string("CpuLogits1DSoftmaxKernel"); + std::string kernel_name = + IS_LOG ? 
std::string("CpuLogits1DLogSoftmaxKernel") : std::string("CpuLogits1DSoftmaxKernel"); _beta = beta; _run_method = uk->ukernel; @@ -282,8 +259,8 @@ void CpuLogits1DSoftmaxKernel<IS_LOG>::configure(const ITensorInfo *src, const I } template <bool IS_LOG> -Status CpuLogits1DSoftmaxKernel<IS_LOG>::validate(const ITensorInfo *src, const ITensorInfo *max, - const ITensorInfo *dst, const float beta, const ITensorInfo *tmp) +Status CpuLogits1DSoftmaxKernel<IS_LOG>::validate( + const ITensorInfo *src, const ITensorInfo *max, const ITensorInfo *dst, const float beta, const ITensorInfo *tmp) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, max, dst, tmp); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_softmax(*src, *max, *dst, beta, *tmp, IS_LOG)); @@ -305,7 +282,7 @@ void CpuLogits1DSoftmaxKernel<IS_LOG>::run_op(ITensorPack &tensors, const Window auto tmp = tensors.get_tensor(TensorType::ACL_DST_1); const unsigned int num_elems_processed_per_iteration = src->info()->valid_region().shape.x(); - const unsigned int tmp_size_for_thread = tmp->info()->element_size() * num_elems_processed_per_iteration; + const unsigned int tmp_size_for_thread = tmp->info()->element_size() * num_elems_processed_per_iteration; ARM_COMPUTE_ERROR_ON(tmp->info()->total_size() < (info.num_threads * tmp_size_for_thread)); @@ -314,7 +291,7 @@ void CpuLogits1DSoftmaxKernel<IS_LOG>::run_op(ITensorPack &tensors, const Window } template <bool IS_LOG> -const char *CpuLogits1DSoftmaxKernel<IS_LOG>::name() const +const char *CpuLogits1DSoftmaxKernel<IS_LOG>::name() const { return _name.c_str(); } diff --git a/src/cpu/kernels/CpuSoftmaxKernel.h b/src/cpu/kernels/CpuSoftmaxKernel.h index 59f43bd1d2..5d288179fd 100644 --- a/src/cpu/kernels/CpuSoftmaxKernel.h +++ b/src/cpu/kernels/CpuSoftmaxKernel.h @@ -57,7 +57,7 @@ public: static Status validate(const ITensorInfo *src, const ITensorInfo *dst); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; struct SoftmaxLogits1DMaxKernel @@ -70,7 +70,7 @@ public: static const std::vector<SoftmaxLogits1DMaxKernel> &get_available_kernels(); private: - SoftmaxLogits1DMaxKernelPtr _run_method{ nullptr }; + SoftmaxLogits1DMaxKernelPtr _run_method{nullptr}; std::string _name{}; }; @@ -79,7 +79,8 @@ template <bool IS_LOG = false> class CpuLogits1DSoftmaxKernel : public ICpuKernel<CpuLogits1DSoftmaxKernel<IS_LOG>> { private: - using SoftmaxLogits1DKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, void *const, ITensor *, float, bool, const Window &)>::type; + using SoftmaxLogits1DKernelPtr = std::add_pointer<void( + const ITensor *, const ITensor *, void *const, ITensor *, float, bool, const Window &)>::type; public: CpuLogits1DSoftmaxKernel() = default; @@ -95,18 +96,22 @@ public: * * @param tmp Auxiliary tensor info. Must be type F32 and same shape as the input. 
*/ - void configure(const ITensorInfo *src, const ITensorInfo *max, ITensorInfo *dst, const float beta, ITensorInfo *tmp); + void + configure(const ITensorInfo *src, const ITensorInfo *max, ITensorInfo *dst, const float beta, ITensorInfo *tmp); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuLogits1DSoftmaxKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *max, - const ITensorInfo *dst, const float beta, const ITensorInfo *tmp); + static Status validate(const ITensorInfo *src, + const ITensorInfo *max, + const ITensorInfo *dst, + const float beta, + const ITensorInfo *tmp); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; struct SoftmaxLogits1DKernel @@ -119,8 +124,8 @@ public: static const std::vector<SoftmaxLogits1DKernel> &get_available_kernels(); private: - float _beta{ 1.0f }; - SoftmaxLogits1DKernelPtr _run_method{ nullptr }; + float _beta{1.0f}; + SoftmaxLogits1DKernelPtr _run_method{nullptr}; std::string _name{}; }; } // namespace kernels diff --git a/src/cpu/kernels/CpuSubKernel.cpp b/src/cpu/kernels/CpuSubKernel.cpp index 875d613dca..2b2c6f2e92 100644 --- a/src/cpu/kernels/CpuSubKernel.cpp +++ b/src/cpu/kernels/CpuSubKernel.cpp @@ -25,8 +25,9 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" -#include "src/core/CPP/Validate.h" + #include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "src/cpu/kernels/add/generic/neon/impl.h" @@ -51,70 +52,48 @@ namespace using CpuSubKernelDataTypeISASelectorData = CpuAddKernelDataTypeISASelectorData; using CpuSubKernelDataTypeISASelectorDataPtr = CpuAddKernelDataTypeISASelectorDataPtr; -static const std::vector<CpuSubKernel::SubKernel> available_kernels = -{ - { - "neon_fp32_sub", - [](const CpuSubKernelDataTypeISASelectorData & data) { return (data.dt == DataType::F32); }, - REGISTER_FP32_NEON(arm_compute::cpu::sub_same_neon<float>) - }, - { - "neon_fp16_sub", - [](const CpuSubKernelDataTypeISASelectorData & data) { return (data.dt == DataType::F16) && data.isa.fp16; }, - REGISTER_FP16_NEON(arm_compute::cpu::sub_same_neon<float16_t>) - }, - { - "neon_u8_sub", - [](const CpuSubKernelDataTypeISASelectorData & data) { return (data.dt == DataType::U8); }, - REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon<uint8_t>) - }, - { - "neon_s16_sub", - [](const CpuSubKernelDataTypeISASelectorData & data) { return (data.dt == DataType::S16); }, - REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon<int16_t>) - }, - { - "neon_s32_sub", - [](const CpuSubKernelDataTypeISASelectorData & data) { return (data.dt == DataType::S32); }, - REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon<int32_t>) - }, - { - "neon_qu8_sub_fixedpoint", - [](const CpuSubKernelDataTypeISASelectorData & data) { return ((data.dt == DataType::QASYMM8) && data.can_use_fixedpoint); }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::sub_qasymm8_neon_fixedpoint) - }, - { - "neon_qs8_sub_fixedpoint", - [](const CpuSubKernelDataTypeISASelectorData & data) { return ((data.dt == DataType::QASYMM8_SIGNED) && data.can_use_fixedpoint); }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::sub_qasymm8_signed_neon_fixedpoint) - }, - { - 
"neon_qu8_sub", - [](const CpuSubKernelDataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8); }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::sub_qasymm8_neon) - }, - { - "neon_qs8_sub", - [](const CpuSubKernelDataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::sub_qasymm8_signed_neon) - }, - { - "neon_qs16_sub", - [](const CpuSubKernelDataTypeISASelectorData & data) { return (data.dt == DataType::QSYMM16); }, - REGISTER_QSYMM16_NEON(arm_compute::cpu::sub_qsymm16_neon) - }, +static const std::vector<CpuSubKernel::SubKernel> available_kernels = { + {"neon_fp32_sub", [](const CpuSubKernelDataTypeISASelectorData &data) { return (data.dt == DataType::F32); }, + REGISTER_FP32_NEON(arm_compute::cpu::sub_same_neon<float>)}, + {"neon_fp16_sub", + [](const CpuSubKernelDataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.fp16; }, + REGISTER_FP16_NEON(arm_compute::cpu::sub_same_neon<float16_t>)}, + {"neon_u8_sub", [](const CpuSubKernelDataTypeISASelectorData &data) { return (data.dt == DataType::U8); }, + REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon<uint8_t>)}, + {"neon_s16_sub", [](const CpuSubKernelDataTypeISASelectorData &data) { return (data.dt == DataType::S16); }, + REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon<int16_t>)}, + {"neon_s32_sub", [](const CpuSubKernelDataTypeISASelectorData &data) { return (data.dt == DataType::S32); }, + REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon<int32_t>)}, + {"neon_qu8_sub_fixedpoint", + [](const CpuSubKernelDataTypeISASelectorData &data) + { return ((data.dt == DataType::QASYMM8) && data.can_use_fixedpoint); }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::sub_qasymm8_neon_fixedpoint)}, + {"neon_qs8_sub_fixedpoint", + [](const CpuSubKernelDataTypeISASelectorData &data) + { return ((data.dt == DataType::QASYMM8_SIGNED) && data.can_use_fixedpoint); }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::sub_qasymm8_signed_neon_fixedpoint)}, + {"neon_qu8_sub", [](const CpuSubKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8); }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::sub_qasymm8_neon)}, + {"neon_qs8_sub", + [](const CpuSubKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED); }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::sub_qasymm8_signed_neon)}, + {"neon_qs16_sub", [](const CpuSubKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QSYMM16); }, + REGISTER_QSYMM16_NEON(arm_compute::cpu::sub_qsymm16_neon)}, }; -inline Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst, ConvertPolicy policy) +inline Status +validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst, ConvertPolicy policy) { ARM_COMPUTE_UNUSED(policy); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src0); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::S16, DataType::S32, DataType::F16, - DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::U8, DataType::QASYMM8, + DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::S16, + DataType::S32, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &src1); const auto can_use_fixedpoint = sub_q8_neon_fixedpoint_possible(&src0, &src1, &dst); - const auto uk = 
CpuSubKernel::get_implementation<CpuSubKernelDataTypeISASelectorData>(CpuSubKernelDataTypeISASelectorData{ src0.data_type(), CPUInfo::get().get_isa(), can_use_fixedpoint }); + const auto uk = CpuSubKernel::get_implementation<CpuSubKernelDataTypeISASelectorData>( + CpuSubKernelDataTypeISASelectorData{src0.data_type(), CPUInfo::get().get_isa(), can_use_fixedpoint}); ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); @@ -125,7 +104,7 @@ inline Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src "Convert policy cannot be WRAP if datatype is quantized"); // Validate in case of configured dst - if(dst.total_size() > 0) + if (dst.total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &dst); ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0), @@ -147,7 +126,8 @@ void CpuSubKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, I set_data_type_if_unknown(*dst, src0->data_type()); const auto can_use_fixedpoint = sub_q8_neon_fixedpoint_possible(src0, src1, dst); - const auto uk = CpuSubKernel::get_implementation<CpuSubKernelDataTypeISASelectorData>(CpuSubKernelDataTypeISASelectorData{ src0->data_type(), CPUInfo::get().get_isa(), can_use_fixedpoint }); + const auto uk = CpuSubKernel::get_implementation<CpuSubKernelDataTypeISASelectorData>( + CpuSubKernelDataTypeISASelectorData{src0->data_type(), CPUInfo::get().get_isa(), can_use_fixedpoint}); ARM_COMPUTE_ERROR_ON_NULLPTR(uk); @@ -167,14 +147,14 @@ size_t CpuSubKernel::get_mws(const CPUInfo &platform, size_t thread_count) const ARM_COMPUTE_UNUSED(thread_count); #if defined(ENABLE_FP32_KERNELS) - if(this->_run_method == &sub_same_neon<float>) + if (this->_run_method == &sub_same_neon<float>) { size_t mws = ICPPKernel::default_mws; - if(platform.get_cpu_model() == CPUModel::N1) + if (platform.get_cpu_model() == CPUModel::N1) { mws = default_mws_N1_fp32_neon; } - else if(platform.get_cpu_model() == CPUModel::V1) + else if (platform.get_cpu_model() == CPUModel::V1) { mws = default_mws_V1_fp32_neon; } @@ -184,7 +164,7 @@ size_t CpuSubKernel::get_mws(const CPUInfo &platform, size_t thread_count) const } // tensor is 1D or was re-interpreted as 1D - if(this->window().shape().num_dimensions() == 1) + if (this->window().shape().num_dimensions() == 1) { return mws; } @@ -203,7 +183,8 @@ size_t CpuSubKernel::get_mws(const CPUInfo &platform, size_t thread_count) const return ICPPKernel::default_mws; } -Status CpuSubKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy) +Status +CpuSubKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*src0, *src1, *dst, policy)); diff --git a/src/cpu/kernels/CpuSubKernel.h b/src/cpu/kernels/CpuSubKernel.h index cd209d1837..5fa0dc411a 100644 --- a/src/cpu/kernels/CpuSubKernel.h +++ b/src/cpu/kernels/CpuSubKernel.h @@ -37,7 +37,8 @@ namespace kernels class CpuSubKernel : public ICpuKernel<CpuSubKernel> { private: - using SubKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, ITensor *, const ConvertPolicy &, const Window &)>::type; + using SubKernelPtr = std::add_pointer<void( + const ITensor *, const ITensor *, ITensor *, const ConvertPolicy &, const Window &)>::type; using CpuSubKernelDataTypeISASelectorDataPtr = CpuAddKernelDataTypeISASelectorDataPtr; public: @@ -68,7 
+69,8 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy); + static Status + validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; @@ -99,9 +101,9 @@ public: private: ConvertPolicy _policy{}; - SubKernelPtr _run_method{ nullptr }; + SubKernelPtr _run_method{nullptr}; std::string _name{}; - size_t _split_dimension{ Window::DimY }; + size_t _split_dimension{Window::DimY}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuTransposeKernel.cpp b/src/cpu/kernels/CpuTransposeKernel.cpp index b2cebc4230..615bc6ce1e 100644 --- a/src/cpu/kernels/CpuTransposeKernel.cpp +++ b/src/cpu/kernels/CpuTransposeKernel.cpp @@ -28,8 +28,9 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -45,7 +46,7 @@ namespace { unsigned int num_elems_processed(size_t element_size) { - switch(element_size) + switch (element_size) { case 1: return 8; @@ -81,10 +82,10 @@ void transpose_8bit_elements(const ITensor *in, ITensor *out, const Window &wind Window window_in(window); window_in.set(Window::DimX, Window::Dimension(0, 1, 1)); - if(left_over_loop_y) + if (left_over_loop_y) { // Check if window_end_y_multiple_of is greater than window_start_y - if(window_end_y_multiple_of > window_start_y) + if (window_end_y_multiple_of > window_start_y) { window_in.set(Window::DimY, Window::Dimension(window_start_y, window_end_y_multiple_of, window_step_y)); } @@ -101,87 +102,121 @@ void transpose_8bit_elements(const ITensor *in, ITensor *out, const Window &wind Iterator output(out, window_out); // Run the SIMD path if and only if the input is not a row-vector - if(in->info()->dimension(1) != 1) + if (in->info()->dimension(1) != 1) { Iterator input(in, window_in); - execute_window_loop(window_in, [&](const Coordinates & id) - { - // Compute 8x8 elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + window_in, + [&](const Coordinates &id) { - const uint8x8_t row0 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 0 * input_stride_in_bytes)); - const uint8x8_t row1 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 1 * input_stride_in_bytes)); - const uint8x8_t row2 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 2 * input_stride_in_bytes)); - const uint8x8_t row3 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 3 * input_stride_in_bytes)); - const uint8x8_t row4 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 4 * input_stride_in_bytes)); - const uint8x8_t row5 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 5 * input_stride_in_bytes)); - const uint8x8_t row6 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 6 * input_stride_in_bytes)); - const uint8x8_t row7 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 7 * input_stride_in_bytes)); - - // Transpose 2x2 - const uint8x8x2_t k0_u8 = vtrn_u8(row0, row1); - const uint8x8x2_t k1_u8 = vtrn_u8(row2, row3); - const 
uint8x8x2_t k2_u8 = vtrn_u8(row4, row5); - const uint8x8x2_t k3_u8 = vtrn_u8(row6, row7); - - // Transpose 4x4 - const uint16x4x2_t k0_u16 = vtrn_u16(vreinterpret_u16_u8(k0_u8.val[0]), vreinterpret_u16_u8(k1_u8.val[0])); - const uint16x4x2_t k1_u16 = vtrn_u16(vreinterpret_u16_u8(k0_u8.val[1]), vreinterpret_u16_u8(k1_u8.val[1])); - const uint16x4x2_t k2_u16 = vtrn_u16(vreinterpret_u16_u8(k2_u8.val[0]), vreinterpret_u16_u8(k3_u8.val[0])); - const uint16x4x2_t k3_u16 = vtrn_u16(vreinterpret_u16_u8(k2_u8.val[1]), vreinterpret_u16_u8(k3_u8.val[1])); - - // Transpose 8x8 - const uint32x2x2_t k0_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[0]), vreinterpret_u32_u16(k2_u16.val[0])); - const uint32x2x2_t k1_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[1]), vreinterpret_u32_u16(k2_u16.val[1])); - const uint32x2x2_t k2_u32 = vtrn_u32(vreinterpret_u32_u16(k1_u16.val[0]), vreinterpret_u32_u16(k3_u16.val[0])); - const uint32x2x2_t k3_u32 = vtrn_u32(vreinterpret_u32_u16(k1_u16.val[1]), vreinterpret_u32_u16(k3_u16.val[1])); - - // Compute destination address - const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + x * output_stride_in_bytes; - - vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k0_u32.val[0]))); - vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k2_u32.val[0]))); - vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k1_u32.val[0]))); - vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k3_u32.val[0]))); - vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 4 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k0_u32.val[1]))); - vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 5 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k2_u32.val[1]))); - vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 6 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k1_u32.val[1]))); - vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 7 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k3_u32.val[1]))); - } - - // Compute left-over elements along the x dimension (1x8) - for(; x < window_end_x; ++x) - { - const uint8_t val0 = *(input.ptr() + x + 0 * input_stride_in_bytes); - const uint8_t val1 = *(input.ptr() + x + 1 * input_stride_in_bytes); - const uint8_t val2 = *(input.ptr() + x + 2 * input_stride_in_bytes); - const uint8_t val3 = *(input.ptr() + x + 3 * input_stride_in_bytes); - const uint8_t val4 = *(input.ptr() + x + 4 * input_stride_in_bytes); - const uint8_t val5 = *(input.ptr() + x + 5 * input_stride_in_bytes); - const uint8_t val6 = *(input.ptr() + x + 6 * input_stride_in_bytes); - const uint8_t val7 = *(input.ptr() + x + 7 * input_stride_in_bytes); - - uint8x8_t result = vdup_n_u8(0); - result = vset_lane_u8(val0, result, 0); - result = vset_lane_u8(val1, result, 1); - result = vset_lane_u8(val2, result, 2); - result = vset_lane_u8(val3, result, 3); - result = vset_lane_u8(val4, result, 4); - result = vset_lane_u8(val5, result, 5); - result = vset_lane_u8(val6, result, 6); - result = vset_lane_u8(val7, result, 7); - - // Compute destination address - const size_t 
dst_offset_in_bytes = id.y() * sizeof(uint8_t) + x * output_stride_in_bytes; - - vst1_u8(output.ptr() + dst_offset_in_bytes, result); - } - }, - input, output); + // Compute 8x8 elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x8_t row0 = + vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 0 * input_stride_in_bytes)); + const uint8x8_t row1 = + vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 1 * input_stride_in_bytes)); + const uint8x8_t row2 = + vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 2 * input_stride_in_bytes)); + const uint8x8_t row3 = + vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 3 * input_stride_in_bytes)); + const uint8x8_t row4 = + vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 4 * input_stride_in_bytes)); + const uint8x8_t row5 = + vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 5 * input_stride_in_bytes)); + const uint8x8_t row6 = + vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 6 * input_stride_in_bytes)); + const uint8x8_t row7 = + vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 7 * input_stride_in_bytes)); + + // Transpose 2x2 + const uint8x8x2_t k0_u8 = vtrn_u8(row0, row1); + const uint8x8x2_t k1_u8 = vtrn_u8(row2, row3); + const uint8x8x2_t k2_u8 = vtrn_u8(row4, row5); + const uint8x8x2_t k3_u8 = vtrn_u8(row6, row7); + + // Transpose 4x4 + const uint16x4x2_t k0_u16 = + vtrn_u16(vreinterpret_u16_u8(k0_u8.val[0]), vreinterpret_u16_u8(k1_u8.val[0])); + const uint16x4x2_t k1_u16 = + vtrn_u16(vreinterpret_u16_u8(k0_u8.val[1]), vreinterpret_u16_u8(k1_u8.val[1])); + const uint16x4x2_t k2_u16 = + vtrn_u16(vreinterpret_u16_u8(k2_u8.val[0]), vreinterpret_u16_u8(k3_u8.val[0])); + const uint16x4x2_t k3_u16 = + vtrn_u16(vreinterpret_u16_u8(k2_u8.val[1]), vreinterpret_u16_u8(k3_u8.val[1])); + + // Transpose 8x8 + const uint32x2x2_t k0_u32 = + vtrn_u32(vreinterpret_u32_u16(k0_u16.val[0]), vreinterpret_u32_u16(k2_u16.val[0])); + const uint32x2x2_t k1_u32 = + vtrn_u32(vreinterpret_u32_u16(k0_u16.val[1]), vreinterpret_u32_u16(k2_u16.val[1])); + const uint32x2x2_t k2_u32 = + vtrn_u32(vreinterpret_u32_u16(k1_u16.val[0]), vreinterpret_u32_u16(k3_u16.val[0])); + const uint32x2x2_t k3_u32 = + vtrn_u32(vreinterpret_u32_u16(k1_u16.val[1]), vreinterpret_u32_u16(k3_u16.val[1])); + + // Compute destination address + const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + x * output_stride_in_bytes; + + vst1_u8( + reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), + vreinterpret_u8_u16(vreinterpret_u16_u32(k0_u32.val[0]))); + vst1_u8( + reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), + vreinterpret_u8_u16(vreinterpret_u16_u32(k2_u32.val[0]))); + vst1_u8( + reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), + vreinterpret_u8_u16(vreinterpret_u16_u32(k1_u32.val[0]))); + vst1_u8( + reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), + vreinterpret_u8_u16(vreinterpret_u16_u32(k3_u32.val[0]))); + vst1_u8( + reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 4 * output_stride_in_bytes), + vreinterpret_u8_u16(vreinterpret_u16_u32(k0_u32.val[1]))); + vst1_u8( + reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 5 * output_stride_in_bytes), + vreinterpret_u8_u16(vreinterpret_u16_u32(k2_u32.val[1]))); + vst1_u8( + 
reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 6 * output_stride_in_bytes), + vreinterpret_u8_u16(vreinterpret_u16_u32(k1_u32.val[1]))); + vst1_u8( + reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 7 * output_stride_in_bytes), + vreinterpret_u8_u16(vreinterpret_u16_u32(k3_u32.val[1]))); + } + + // Compute left-over elements along the x dimension (1x8) + for (; x < window_end_x; ++x) + { + const uint8_t val0 = *(input.ptr() + x + 0 * input_stride_in_bytes); + const uint8_t val1 = *(input.ptr() + x + 1 * input_stride_in_bytes); + const uint8_t val2 = *(input.ptr() + x + 2 * input_stride_in_bytes); + const uint8_t val3 = *(input.ptr() + x + 3 * input_stride_in_bytes); + const uint8_t val4 = *(input.ptr() + x + 4 * input_stride_in_bytes); + const uint8_t val5 = *(input.ptr() + x + 5 * input_stride_in_bytes); + const uint8_t val6 = *(input.ptr() + x + 6 * input_stride_in_bytes); + const uint8_t val7 = *(input.ptr() + x + 7 * input_stride_in_bytes); + + uint8x8_t result = vdup_n_u8(0); + result = vset_lane_u8(val0, result, 0); + result = vset_lane_u8(val1, result, 1); + result = vset_lane_u8(val2, result, 2); + result = vset_lane_u8(val3, result, 3); + result = vset_lane_u8(val4, result, 4); + result = vset_lane_u8(val5, result, 5); + result = vset_lane_u8(val6, result, 6); + result = vset_lane_u8(val7, result, 7); + + // Compute destination address + const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + x * output_stride_in_bytes; + + vst1_u8(output.ptr() + dst_offset_in_bytes, result); + } + }, + input, output); } - if(left_over_loop_y) + if (left_over_loop_y) { window_in.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), 1)); window_in.set(Window::DimY, Window::Dimension(window_end_y_multiple_of, window_end_y, 1)); @@ -190,16 +225,18 @@ void transpose_8bit_elements(const ITensor *in, ITensor *out, const Window &wind Iterator output(out, window_out); // Compute left-over elements along the y dimension (1x1) - execute_window_loop(window_in, [&](const Coordinates & id) - { - const uint8_t val0 = *input.ptr(); + execute_window_loop( + window_in, + [&](const Coordinates &id) + { + const uint8_t val0 = *input.ptr(); - // Compute destination address - const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + id.x() * output_stride_in_bytes; + // Compute destination address + const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + id.x() * output_stride_in_bytes; - *(output.ptr() + dst_offset_in_bytes) = val0; - }, - input, output); + *(output.ptr() + dst_offset_in_bytes) = val0; + }, + input, output); } } @@ -220,10 +257,10 @@ void transpose_16bit_elements(const ITensor *in, ITensor *out, const Window &win Window window_in(window); window_in.set(Window::DimX, Window::Dimension(0, 1, 1)); - if(left_over_loop_y) + if (left_over_loop_y) { // Check if window_end_y_multiple_of is greater than window_start_y - if(window_end_y_multiple_of > window_start_y) + if (window_end_y_multiple_of > window_start_y) { window_in.set(Window::DimY, Window::Dimension(window_start_y, window_end_y_multiple_of, window_step_y)); } @@ -240,61 +277,77 @@ void transpose_16bit_elements(const ITensor *in, ITensor *out, const Window &win Iterator output(out, window_out); // Run the SIMD path if and only if the input is not a row-vector - if(in->info()->dimension(1) != 1) + if (in->info()->dimension(1) != 1) { Iterator input(in, window_in); - execute_window_loop(window_in, [&](const Coordinates & id) - { - // Compute 4x4 elements per iteration - int x = 
window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + window_in, + [&](const Coordinates &id) { - const uint16x4_t row0 = vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 0 * input_stride_in_bytes) + x); - const uint16x4_t row1 = vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 1 * input_stride_in_bytes) + x); - const uint16x4_t row2 = vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 2 * input_stride_in_bytes) + x); - const uint16x4_t row3 = vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 3 * input_stride_in_bytes) + x); - - // Transpose 2x2 - const uint16x4x2_t k0_u16 = vtrn_u16(row0, row1); - const uint16x4x2_t k1_u16 = vtrn_u16(row2, row3); - - // Transpose 4x4 - const uint32x2x2_t k0_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[0]), vreinterpret_u32_u16(k1_u16.val[0])); - const uint32x2x2_t k1_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[1]), vreinterpret_u32_u16(k1_u16.val[1])); - - // Compute destination address - const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + x * output_stride_in_bytes; - - vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), vreinterpret_u16_u32(k0_u32.val[0])); - vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), vreinterpret_u16_u32(k1_u32.val[0])); - vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), vreinterpret_u16_u32(k0_u32.val[1])); - vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), vreinterpret_u16_u32(k1_u32.val[1])); - } - - // Compute left-over elements (1x4) - for(; x < window_end_x; ++x) - { - const uint16_t val0 = *(reinterpret_cast<uint16_t *>(input.ptr() + 0 * input_stride_in_bytes) + x); - const uint16_t val1 = *(reinterpret_cast<uint16_t *>(input.ptr() + 1 * input_stride_in_bytes) + x); - const uint16_t val2 = *(reinterpret_cast<uint16_t *>(input.ptr() + 2 * input_stride_in_bytes) + x); - const uint16_t val3 = *(reinterpret_cast<uint16_t *>(input.ptr() + 3 * input_stride_in_bytes) + x); - - uint16x4_t result = vdup_n_u16(0); - result = vset_lane_u16(val0, result, 0); - result = vset_lane_u16(val1, result, 1); - result = vset_lane_u16(val2, result, 2); - result = vset_lane_u16(val3, result, 3); - - // Compute destination address - const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + x * output_stride_in_bytes; - - vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes), result); - } - }, - input, output); + // Compute 4x4 elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint16x4_t row0 = + vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 0 * input_stride_in_bytes) + x); + const uint16x4_t row1 = + vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 1 * input_stride_in_bytes) + x); + const uint16x4_t row2 = + vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 2 * input_stride_in_bytes) + x); + const uint16x4_t row3 = + vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 3 * input_stride_in_bytes) + x); + + // Transpose 2x2 + const uint16x4x2_t k0_u16 = vtrn_u16(row0, row1); + const uint16x4x2_t k1_u16 = vtrn_u16(row2, row3); + + // Transpose 4x4 + const uint32x2x2_t k0_u32 = + vtrn_u32(vreinterpret_u32_u16(k0_u16.val[0]), vreinterpret_u32_u16(k1_u16.val[0])); + const uint32x2x2_t k1_u32 = + 
vtrn_u32(vreinterpret_u32_u16(k0_u16.val[1]), vreinterpret_u32_u16(k1_u16.val[1])); + + // Compute destination address + const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + x * output_stride_in_bytes; + + vst1_u16( + reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), + vreinterpret_u16_u32(k0_u32.val[0])); + vst1_u16( + reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), + vreinterpret_u16_u32(k1_u32.val[0])); + vst1_u16( + reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), + vreinterpret_u16_u32(k0_u32.val[1])); + vst1_u16( + reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), + vreinterpret_u16_u32(k1_u32.val[1])); + } + + // Compute left-over elements (1x4) + for (; x < window_end_x; ++x) + { + const uint16_t val0 = *(reinterpret_cast<uint16_t *>(input.ptr() + 0 * input_stride_in_bytes) + x); + const uint16_t val1 = *(reinterpret_cast<uint16_t *>(input.ptr() + 1 * input_stride_in_bytes) + x); + const uint16_t val2 = *(reinterpret_cast<uint16_t *>(input.ptr() + 2 * input_stride_in_bytes) + x); + const uint16_t val3 = *(reinterpret_cast<uint16_t *>(input.ptr() + 3 * input_stride_in_bytes) + x); + + uint16x4_t result = vdup_n_u16(0); + result = vset_lane_u16(val0, result, 0); + result = vset_lane_u16(val1, result, 1); + result = vset_lane_u16(val2, result, 2); + result = vset_lane_u16(val3, result, 3); + + // Compute destination address + const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + x * output_stride_in_bytes; + + vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes), result); + } + }, + input, output); } - if(left_over_loop_y) + if (left_over_loop_y) { window_in.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), 1)); window_in.set(Window::DimY, Window::Dimension(window_end_y_multiple_of, window_end_y, 1)); @@ -303,16 +356,18 @@ void transpose_16bit_elements(const ITensor *in, ITensor *out, const Window &win Iterator output(out, window_out); // Compute left-over elements along the y dimension (1x1) - execute_window_loop(window_in, [&](const Coordinates & id) - { - const uint16_t val0 = *(reinterpret_cast<uint16_t *>(input.ptr())); + execute_window_loop( + window_in, + [&](const Coordinates &id) + { + const uint16_t val0 = *(reinterpret_cast<uint16_t *>(input.ptr())); - // Compute destination address - const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + id.x() * output_stride_in_bytes; + // Compute destination address + const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + id.x() * output_stride_in_bytes; - *(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes)) = val0; - }, - input, output); + *(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes)) = val0; + }, + input, output); } } @@ -347,10 +402,10 @@ void transpose_32bit_elements(const ITensor *in, ITensor *out, const Window &win Window window_in(window); window_in.set(Window::DimX, Window::Dimension(0, 1, 1)); - if(left_over_loop_y) + if (left_over_loop_y) { // Check if window_end_y_multiple_of is greater than window_start_y - if(window_end_y_multiple_of > window_start_y) + if (window_end_y_multiple_of > window_start_y) { window_in.set(Window::DimY, Window::Dimension(window_start_y, window_end_y_multiple_of, window_step_y)); } @@ -367,102 +422,160 @@ void transpose_32bit_elements(const ITensor *in, ITensor *out, const Window &win Iterator output(out, 
window_out); // Run the SIMD path if and only if the input is not a row-vector - if(in->info()->dimension(1) != 1) + if (in->info()->dimension(1) != 1) { Iterator input(in, window_in); - execute_window_loop(window_in, [&](const Coordinates & id) - { - // Compute 8x8 elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + window_in, + [&](const Coordinates &id) { - // Load - const uint32x4x2_t row0 = vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 0 * input_stride_in_bytes) + x); - const uint32x4x2_t row1 = vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 1 * input_stride_in_bytes) + x); - const uint32x4x2_t row2 = vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 2 * input_stride_in_bytes) + x); - const uint32x4x2_t row3 = vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 3 * input_stride_in_bytes) + x); - const uint32x4x2_t row4 = vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 4 * input_stride_in_bytes) + x); - const uint32x4x2_t row5 = vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 5 * input_stride_in_bytes) + x); - const uint32x4x2_t row6 = vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 6 * input_stride_in_bytes) + x); - const uint32x4x2_t row7 = vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 7 * input_stride_in_bytes) + x); - - // Transpose 2x4 - const uint32x4x2_t k0_u32 = {vtrn1q_u32(row0.val[0], row1.val[0]), vtrn2q_u32(row0.val[0], row1.val[0])}; - const uint32x4x2_t k1_u32 = {vtrn1q_u32(row0.val[1], row1.val[1]), vtrn2q_u32(row0.val[1], row1.val[1])}; - const uint32x4x2_t k2_u32 = {vtrn1q_u32(row2.val[0], row3.val[0]), vtrn2q_u32(row2.val[0], row3.val[0])}; - const uint32x4x2_t k3_u32 = {vtrn1q_u32(row2.val[1], row3.val[1]), vtrn2q_u32(row2.val[1], row3.val[1])}; - const uint32x4x2_t k4_u32 = {vtrn1q_u32(row4.val[0], row5.val[0]), vtrn2q_u32(row4.val[0], row5.val[0])}; - const uint32x4x2_t k5_u32 = {vtrn1q_u32(row4.val[1], row5.val[1]), vtrn2q_u32(row4.val[1], row5.val[1])}; - const uint32x4x2_t k6_u32 = {vtrn1q_u32(row6.val[0], row7.val[0]), vtrn2q_u32(row6.val[0], row7.val[0])}; - const uint32x4x2_t k7_u32 = {vtrn1q_u32(row6.val[1], row7.val[1]), vtrn2q_u32(row6.val[1], row7.val[1])}; - - // Transpose 2x2 - const uint64x2x2_t k0_u64 = {vtrn1q_u64(vreinterpretq_u64_u32(k0_u32.val[0]), vreinterpretq_u64_u32(k2_u32.val[0])), vtrn2q_u64(vreinterpretq_u64_u32(k0_u32.val[0]), vreinterpretq_u64_u32(k2_u32.val[0]))}; - const uint64x2x2_t k1_u64 = {vtrn1q_u64(vreinterpretq_u64_u32(k0_u32.val[1]), vreinterpretq_u64_u32(k2_u32.val[1])), vtrn2q_u64(vreinterpretq_u64_u32(k0_u32.val[1]), vreinterpretq_u64_u32(k2_u32.val[1]))}; - const uint64x2x2_t k2_u64 = {vtrn1q_u64(vreinterpretq_u64_u32(k1_u32.val[0]), vreinterpretq_u64_u32(k3_u32.val[0])), vtrn2q_u64(vreinterpretq_u64_u32(k1_u32.val[0]), vreinterpretq_u64_u32(k3_u32.val[0]))}; - const uint64x2x2_t k3_u64 = {vtrn1q_u64(vreinterpretq_u64_u32(k1_u32.val[1]), vreinterpretq_u64_u32(k3_u32.val[1])), vtrn2q_u64(vreinterpretq_u64_u32(k1_u32.val[1]), vreinterpretq_u64_u32(k3_u32.val[1]))}; - const uint64x2x2_t k4_u64 = {vtrn1q_u64(vreinterpretq_u64_u32(k4_u32.val[0]), vreinterpretq_u64_u32(k6_u32.val[0])), vtrn2q_u64(vreinterpretq_u64_u32(k4_u32.val[0]), vreinterpretq_u64_u32(k6_u32.val[0]))}; - const uint64x2x2_t k5_u64 = {vtrn1q_u64(vreinterpretq_u64_u32(k4_u32.val[1]), vreinterpretq_u64_u32(k6_u32.val[1])), 
vtrn2q_u64(vreinterpretq_u64_u32(k4_u32.val[1]), vreinterpretq_u64_u32(k6_u32.val[1]))}; - const uint64x2x2_t k6_u64 = {vtrn1q_u64(vreinterpretq_u64_u32(k5_u32.val[0]), vreinterpretq_u64_u32(k7_u32.val[0])), vtrn2q_u64(vreinterpretq_u64_u32(k5_u32.val[0]), vreinterpretq_u64_u32(k7_u32.val[0]))}; - const uint64x2x2_t k7_u64 = {vtrn1q_u64(vreinterpretq_u64_u32(k5_u32.val[1]), vreinterpretq_u64_u32(k7_u32.val[1])), vtrn2q_u64(vreinterpretq_u64_u32(k5_u32.val[1]), vreinterpretq_u64_u32(k7_u32.val[1]))}; - - // Swap blocks - const uint32x4x2_t col0 = {vreinterpretq_u32_u64(k0_u64.val[0]), vreinterpretq_u32_u64(k4_u64.val[0])}; - const uint32x4x2_t col1 = {vreinterpretq_u32_u64(k1_u64.val[0]), vreinterpretq_u32_u64(k5_u64.val[0])}; - const uint32x4x2_t col2 = {vreinterpretq_u32_u64(k0_u64.val[1]), vreinterpretq_u32_u64(k4_u64.val[1])}; - const uint32x4x2_t col3 = {vreinterpretq_u32_u64(k1_u64.val[1]), vreinterpretq_u32_u64(k5_u64.val[1])}; - const uint32x4x2_t col4 = {vreinterpretq_u32_u64(k2_u64.val[0]), vreinterpretq_u32_u64(k6_u64.val[0])}; - const uint32x4x2_t col5 = {vreinterpretq_u32_u64(k3_u64.val[0]), vreinterpretq_u32_u64(k7_u64.val[0])}; - const uint32x4x2_t col6 = {vreinterpretq_u32_u64(k2_u64.val[1]), vreinterpretq_u32_u64(k6_u64.val[1])}; - const uint32x4x2_t col7 = {vreinterpretq_u32_u64(k3_u64.val[1]), vreinterpretq_u32_u64(k7_u64.val[1])}; - - // Compute destination address - const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes; - - // Store - vst1q_u32_x2_(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), col0); - vst1q_u32_x2_(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), col1); - vst1q_u32_x2_(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), col2); - vst1q_u32_x2_(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), col3); - vst1q_u32_x2_(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 4 * output_stride_in_bytes), col4); - vst1q_u32_x2_(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 5 * output_stride_in_bytes), col5); - vst1q_u32_x2_(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 6 * output_stride_in_bytes), col6); - vst1q_u32_x2_(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 7 * output_stride_in_bytes), col7); - } - - // Compute left-over elements (8x1) - for(; x < window_end_x; ++x) - { - const uint32_t val0 = *(reinterpret_cast<uint32_t *>(input.ptr() + 0 * input_stride_in_bytes) + x); - const uint32_t val1 = *(reinterpret_cast<uint32_t *>(input.ptr() + 1 * input_stride_in_bytes) + x); - const uint32_t val2 = *(reinterpret_cast<uint32_t *>(input.ptr() + 2 * input_stride_in_bytes) + x); - const uint32_t val3 = *(reinterpret_cast<uint32_t *>(input.ptr() + 3 * input_stride_in_bytes) + x); - const uint32_t val4 = *(reinterpret_cast<uint32_t *>(input.ptr() + 4 * input_stride_in_bytes) + x); - const uint32_t val5 = *(reinterpret_cast<uint32_t *>(input.ptr() + 5 * input_stride_in_bytes) + x); - const uint32_t val6 = *(reinterpret_cast<uint32_t *>(input.ptr() + 6 * input_stride_in_bytes) + x); - const uint32_t val7 = *(reinterpret_cast<uint32_t *>(input.ptr() + 7 * input_stride_in_bytes) + x); - - uint32x4_t result0 = vdupq_n_u32(0); - uint32x4_t result1 = vdupq_n_u32(0); - result0 = vsetq_lane_u32(val0, result0, 0); - result0 = vsetq_lane_u32(val1, result0, 1); - result0 = 
vsetq_lane_u32(val2, result0, 2); - result0 = vsetq_lane_u32(val3, result0, 3); - result1 = vsetq_lane_u32(val4, result1, 0); - result1 = vsetq_lane_u32(val5, result1, 1); - result1 = vsetq_lane_u32(val6, result1, 2); - result1 = vsetq_lane_u32(val7, result1, 3); - - // Compute destination address - const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes; - - vst1q_u32_x2_(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes), {result0, result1}); - } - }, - input, output); + // Compute 8x8 elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + // Load + const uint32x4x2_t row0 = + vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 0 * input_stride_in_bytes) + x); + const uint32x4x2_t row1 = + vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 1 * input_stride_in_bytes) + x); + const uint32x4x2_t row2 = + vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 2 * input_stride_in_bytes) + x); + const uint32x4x2_t row3 = + vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 3 * input_stride_in_bytes) + x); + const uint32x4x2_t row4 = + vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 4 * input_stride_in_bytes) + x); + const uint32x4x2_t row5 = + vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 5 * input_stride_in_bytes) + x); + const uint32x4x2_t row6 = + vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 6 * input_stride_in_bytes) + x); + const uint32x4x2_t row7 = + vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 7 * input_stride_in_bytes) + x); + + // Transpose 2x4 + const uint32x4x2_t k0_u32 = {vtrn1q_u32(row0.val[0], row1.val[0]), + vtrn2q_u32(row0.val[0], row1.val[0])}; + const uint32x4x2_t k1_u32 = {vtrn1q_u32(row0.val[1], row1.val[1]), + vtrn2q_u32(row0.val[1], row1.val[1])}; + const uint32x4x2_t k2_u32 = {vtrn1q_u32(row2.val[0], row3.val[0]), + vtrn2q_u32(row2.val[0], row3.val[0])}; + const uint32x4x2_t k3_u32 = {vtrn1q_u32(row2.val[1], row3.val[1]), + vtrn2q_u32(row2.val[1], row3.val[1])}; + const uint32x4x2_t k4_u32 = {vtrn1q_u32(row4.val[0], row5.val[0]), + vtrn2q_u32(row4.val[0], row5.val[0])}; + const uint32x4x2_t k5_u32 = {vtrn1q_u32(row4.val[1], row5.val[1]), + vtrn2q_u32(row4.val[1], row5.val[1])}; + const uint32x4x2_t k6_u32 = {vtrn1q_u32(row6.val[0], row7.val[0]), + vtrn2q_u32(row6.val[0], row7.val[0])}; + const uint32x4x2_t k7_u32 = {vtrn1q_u32(row6.val[1], row7.val[1]), + vtrn2q_u32(row6.val[1], row7.val[1])}; + + // Transpose 2x2 + const uint64x2x2_t k0_u64 = { + vtrn1q_u64(vreinterpretq_u64_u32(k0_u32.val[0]), vreinterpretq_u64_u32(k2_u32.val[0])), + vtrn2q_u64(vreinterpretq_u64_u32(k0_u32.val[0]), vreinterpretq_u64_u32(k2_u32.val[0]))}; + const uint64x2x2_t k1_u64 = { + vtrn1q_u64(vreinterpretq_u64_u32(k0_u32.val[1]), vreinterpretq_u64_u32(k2_u32.val[1])), + vtrn2q_u64(vreinterpretq_u64_u32(k0_u32.val[1]), vreinterpretq_u64_u32(k2_u32.val[1]))}; + const uint64x2x2_t k2_u64 = { + vtrn1q_u64(vreinterpretq_u64_u32(k1_u32.val[0]), vreinterpretq_u64_u32(k3_u32.val[0])), + vtrn2q_u64(vreinterpretq_u64_u32(k1_u32.val[0]), vreinterpretq_u64_u32(k3_u32.val[0]))}; + const uint64x2x2_t k3_u64 = { + vtrn1q_u64(vreinterpretq_u64_u32(k1_u32.val[1]), vreinterpretq_u64_u32(k3_u32.val[1])), + vtrn2q_u64(vreinterpretq_u64_u32(k1_u32.val[1]), vreinterpretq_u64_u32(k3_u32.val[1]))}; + const uint64x2x2_t k4_u64 = { + vtrn1q_u64(vreinterpretq_u64_u32(k4_u32.val[0]), 
vreinterpretq_u64_u32(k6_u32.val[0])), + vtrn2q_u64(vreinterpretq_u64_u32(k4_u32.val[0]), vreinterpretq_u64_u32(k6_u32.val[0]))}; + const uint64x2x2_t k5_u64 = { + vtrn1q_u64(vreinterpretq_u64_u32(k4_u32.val[1]), vreinterpretq_u64_u32(k6_u32.val[1])), + vtrn2q_u64(vreinterpretq_u64_u32(k4_u32.val[1]), vreinterpretq_u64_u32(k6_u32.val[1]))}; + const uint64x2x2_t k6_u64 = { + vtrn1q_u64(vreinterpretq_u64_u32(k5_u32.val[0]), vreinterpretq_u64_u32(k7_u32.val[0])), + vtrn2q_u64(vreinterpretq_u64_u32(k5_u32.val[0]), vreinterpretq_u64_u32(k7_u32.val[0]))}; + const uint64x2x2_t k7_u64 = { + vtrn1q_u64(vreinterpretq_u64_u32(k5_u32.val[1]), vreinterpretq_u64_u32(k7_u32.val[1])), + vtrn2q_u64(vreinterpretq_u64_u32(k5_u32.val[1]), vreinterpretq_u64_u32(k7_u32.val[1]))}; + + // Swap blocks + const uint32x4x2_t col0 = {vreinterpretq_u32_u64(k0_u64.val[0]), + vreinterpretq_u32_u64(k4_u64.val[0])}; + const uint32x4x2_t col1 = {vreinterpretq_u32_u64(k1_u64.val[0]), + vreinterpretq_u32_u64(k5_u64.val[0])}; + const uint32x4x2_t col2 = {vreinterpretq_u32_u64(k0_u64.val[1]), + vreinterpretq_u32_u64(k4_u64.val[1])}; + const uint32x4x2_t col3 = {vreinterpretq_u32_u64(k1_u64.val[1]), + vreinterpretq_u32_u64(k5_u64.val[1])}; + const uint32x4x2_t col4 = {vreinterpretq_u32_u64(k2_u64.val[0]), + vreinterpretq_u32_u64(k6_u64.val[0])}; + const uint32x4x2_t col5 = {vreinterpretq_u32_u64(k3_u64.val[0]), + vreinterpretq_u32_u64(k7_u64.val[0])}; + const uint32x4x2_t col6 = {vreinterpretq_u32_u64(k2_u64.val[1]), + vreinterpretq_u32_u64(k6_u64.val[1])}; + const uint32x4x2_t col7 = {vreinterpretq_u32_u64(k3_u64.val[1]), + vreinterpretq_u32_u64(k7_u64.val[1])}; + + // Compute destination address + const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes; + + // Store + vst1q_u32_x2_( + reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), + col0); + vst1q_u32_x2_( + reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), + col1); + vst1q_u32_x2_( + reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), + col2); + vst1q_u32_x2_( + reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), + col3); + vst1q_u32_x2_( + reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 4 * output_stride_in_bytes), + col4); + vst1q_u32_x2_( + reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 5 * output_stride_in_bytes), + col5); + vst1q_u32_x2_( + reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 6 * output_stride_in_bytes), + col6); + vst1q_u32_x2_( + reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 7 * output_stride_in_bytes), + col7); + } + + // Compute left-over elements (8x1) + for (; x < window_end_x; ++x) + { + const uint32_t val0 = *(reinterpret_cast<uint32_t *>(input.ptr() + 0 * input_stride_in_bytes) + x); + const uint32_t val1 = *(reinterpret_cast<uint32_t *>(input.ptr() + 1 * input_stride_in_bytes) + x); + const uint32_t val2 = *(reinterpret_cast<uint32_t *>(input.ptr() + 2 * input_stride_in_bytes) + x); + const uint32_t val3 = *(reinterpret_cast<uint32_t *>(input.ptr() + 3 * input_stride_in_bytes) + x); + const uint32_t val4 = *(reinterpret_cast<uint32_t *>(input.ptr() + 4 * input_stride_in_bytes) + x); + const uint32_t val5 = *(reinterpret_cast<uint32_t *>(input.ptr() + 5 * input_stride_in_bytes) + x); + const uint32_t val6 = *(reinterpret_cast<uint32_t *>(input.ptr() + 6 * 
input_stride_in_bytes) + x); + const uint32_t val7 = *(reinterpret_cast<uint32_t *>(input.ptr() + 7 * input_stride_in_bytes) + x); + + uint32x4_t result0 = vdupq_n_u32(0); + uint32x4_t result1 = vdupq_n_u32(0); + result0 = vsetq_lane_u32(val0, result0, 0); + result0 = vsetq_lane_u32(val1, result0, 1); + result0 = vsetq_lane_u32(val2, result0, 2); + result0 = vsetq_lane_u32(val3, result0, 3); + result1 = vsetq_lane_u32(val4, result1, 0); + result1 = vsetq_lane_u32(val5, result1, 1); + result1 = vsetq_lane_u32(val6, result1, 2); + result1 = vsetq_lane_u32(val7, result1, 3); + + // Compute destination address + const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes; + + vst1q_u32_x2_(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes), {result0, result1}); + } + }, + input, output); } - if(left_over_loop_y) + if (left_over_loop_y) { window_in.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), 1)); window_in.set(Window::DimY, Window::Dimension(window_end_y_multiple_of, window_end_y, 1)); @@ -471,40 +584,42 @@ void transpose_32bit_elements(const ITensor *in, ITensor *out, const Window &win Iterator output(out, window_out); // Compute left-over elements along the y dimension (1x1) - execute_window_loop(window_in, [&](const Coordinates & id) - { - const uint32_t val0 = *(reinterpret_cast<uint32_t *>(input.ptr())); + execute_window_loop( + window_in, + [&](const Coordinates &id) + { + const uint32_t val0 = *(reinterpret_cast<uint32_t *>(input.ptr())); - // Compute destination address - const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + id.x() * output_stride_in_bytes; + // Compute destination address + const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + id.x() * output_stride_in_bytes; - *(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes)) = val0; - }, - input, output); + *(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes)) = val0; + }, + input, output); } } #else // __aarch64__ void transpose_32bit_elements(const ITensor *in, ITensor *out, const Window &window) { - const int window_step_x = 4; - const int window_step_y = 4; - const int window_start_x = window.x().start(); - const int window_end_x = window.x().end(); - const int window_start_y = window.y().start(); - const int window_end_y = std::min(window.y().end(), static_cast<int>(in->info()->dimension(1))); - const int window_end_y_multiple_of = ((window_end_y - window_start_y) / window_step_y) * window_step_y; - const size_t input_stride_in_bytes = in->info()->strides_in_bytes()[1]; - const size_t output_stride_in_bytes = out->info()->strides_in_bytes()[1]; + const int window_step_x = 4; + const int window_step_y = 4; + const int window_start_x = window.x().start(); + const int window_end_x = window.x().end(); + const int window_start_y = window.y().start(); + const int window_end_y = std::min(window.y().end(), static_cast<int>(in->info()->dimension(1))); + const int window_end_y_multiple_of = ((window_end_y - window_start_y) / window_step_y) * window_step_y; + const size_t input_stride_in_bytes = in->info()->strides_in_bytes()[1]; + const size_t output_stride_in_bytes = out->info()->strides_in_bytes()[1]; // Check if we need a left-over loop for the y dimension bool left_over_loop_y = (((window_end_y - window_start_y) % window_step_y) != 0); Window window_in(window); window_in.set(Window::DimX, Window::Dimension(0, 1, 1)); - if(left_over_loop_y) + if (left_over_loop_y) { // Check if window_end_y_multiple_of is greater 
than window_start_y - if(window_end_y_multiple_of > window_start_y) + if (window_end_y_multiple_of > window_start_y) { window_in.set(Window::DimY, Window::Dimension(window_start_y, window_end_y_multiple_of, window_step_y)); } @@ -521,60 +636,74 @@ void transpose_32bit_elements(const ITensor *in, ITensor *out, const Window &win Iterator output(out, window_out); // Run the SIMD path if and only if the input is not a row-vector - if(in->info()->dimension(1) != 1) + if (in->info()->dimension(1) != 1) { Iterator input(in, window_in); - execute_window_loop(window_in, [&](const Coordinates & id) - { - // Compute 4x4 elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint32x4_t row0 = vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 0 * input_stride_in_bytes) + x); - const uint32x4_t row1 = vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 1 * input_stride_in_bytes) + x); - const uint32x4_t row2 = vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 2 * input_stride_in_bytes) + x); - const uint32x4_t row3 = vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 3 * input_stride_in_bytes) + x); - - // Transpose 2x2 - const uint32x2x2_t k0_u32 = vtrn_u32(vget_low_u32(row0), vget_low_u32(row1)); - const uint32x2x2_t k1_u32 = vtrn_u32(vget_high_u32(row2), vget_high_u32(row3)); - const uint32x2x2_t k2_u32 = vtrn_u32(vget_high_u32(row0), vget_high_u32(row1)); - const uint32x2x2_t k3_u32 = vtrn_u32(vget_low_u32(row2), vget_low_u32(row3)); - - // Compute destination address - const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes; - - // Swap block 01 with block 10 and store - vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), vcombine_u32(k0_u32.val[0], k3_u32.val[0])); - vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), vcombine_u32(k0_u32.val[1], k3_u32.val[1])); - vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), vcombine_u32(k2_u32.val[0], k1_u32.val[0])); - vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), vcombine_u32(k2_u32.val[1], k1_u32.val[1])); - } - - // Compute left-over elements (1x4) - for(; x < window_end_x; ++x) + execute_window_loop( + window_in, + [&](const Coordinates &id) { - const uint32_t val0 = *(reinterpret_cast<uint32_t *>(input.ptr() + 0 * input_stride_in_bytes) + x); - const uint32_t val1 = *(reinterpret_cast<uint32_t *>(input.ptr() + 1 * input_stride_in_bytes) + x); - const uint32_t val2 = *(reinterpret_cast<uint32_t *>(input.ptr() + 2 * input_stride_in_bytes) + x); - const uint32_t val3 = *(reinterpret_cast<uint32_t *>(input.ptr() + 3 * input_stride_in_bytes) + x); - - uint32x4_t result = vdupq_n_u32(0); - result = vsetq_lane_u32(val0, result, 0); - result = vsetq_lane_u32(val1, result, 1); - result = vsetq_lane_u32(val2, result, 2); - result = vsetq_lane_u32(val3, result, 3); - - // Compute destination address - const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes; - - vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes), result); - } - }, - input, output); + // Compute 4x4 elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint32x4_t row0 = + vld1q_u32(reinterpret_cast<const uint32_t 
*>(input.ptr() + 0 * input_stride_in_bytes) + x); + const uint32x4_t row1 = + vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 1 * input_stride_in_bytes) + x); + const uint32x4_t row2 = + vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 2 * input_stride_in_bytes) + x); + const uint32x4_t row3 = + vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 3 * input_stride_in_bytes) + x); + + // Transpose 2x2 + const uint32x2x2_t k0_u32 = vtrn_u32(vget_low_u32(row0), vget_low_u32(row1)); + const uint32x2x2_t k1_u32 = vtrn_u32(vget_high_u32(row2), vget_high_u32(row3)); + const uint32x2x2_t k2_u32 = vtrn_u32(vget_high_u32(row0), vget_high_u32(row1)); + const uint32x2x2_t k3_u32 = vtrn_u32(vget_low_u32(row2), vget_low_u32(row3)); + + // Compute destination address + const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes; + + // Swap block 01 with block 10 and store + vst1q_u32( + reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), + vcombine_u32(k0_u32.val[0], k3_u32.val[0])); + vst1q_u32( + reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), + vcombine_u32(k0_u32.val[1], k3_u32.val[1])); + vst1q_u32( + reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), + vcombine_u32(k2_u32.val[0], k1_u32.val[0])); + vst1q_u32( + reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), + vcombine_u32(k2_u32.val[1], k1_u32.val[1])); + } + + // Compute left-over elements (1x4) + for (; x < window_end_x; ++x) + { + const uint32_t val0 = *(reinterpret_cast<uint32_t *>(input.ptr() + 0 * input_stride_in_bytes) + x); + const uint32_t val1 = *(reinterpret_cast<uint32_t *>(input.ptr() + 1 * input_stride_in_bytes) + x); + const uint32_t val2 = *(reinterpret_cast<uint32_t *>(input.ptr() + 2 * input_stride_in_bytes) + x); + const uint32_t val3 = *(reinterpret_cast<uint32_t *>(input.ptr() + 3 * input_stride_in_bytes) + x); + + uint32x4_t result = vdupq_n_u32(0); + result = vsetq_lane_u32(val0, result, 0); + result = vsetq_lane_u32(val1, result, 1); + result = vsetq_lane_u32(val2, result, 2); + result = vsetq_lane_u32(val3, result, 3); + + // Compute destination address + const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes; + + vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes), result); + } + }, + input, output); } - if(left_over_loop_y) + if (left_over_loop_y) { window_in.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), 1)); window_in.set(Window::DimY, Window::Dimension(window_end_y_multiple_of, window_end_y, 1)); @@ -583,16 +712,18 @@ void transpose_32bit_elements(const ITensor *in, ITensor *out, const Window &win Iterator output(out, window_out); // Compute left-over elements along the y dimension (1x1) - execute_window_loop(window_in, [&](const Coordinates & id) - { - const uint32_t val0 = *(reinterpret_cast<uint32_t *>(input.ptr())); + execute_window_loop( + window_in, + [&](const Coordinates &id) + { + const uint32_t val0 = *(reinterpret_cast<uint32_t *>(input.ptr())); - // Compute destination address - const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + id.x() * output_stride_in_bytes; + // Compute destination address + const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + id.x() * output_stride_in_bytes; - *(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes)) = val0; - }, - 
input, output); + *(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes)) = val0; + }, + input, output); } } #endif // __aarch64__ @@ -616,7 +747,8 @@ void CpuTransposeKernel::configure(const ITensorInfo *src, ITensorInfo *dst) const unsigned int num_elems_processed_per_iteration_y = num_elems_processed(src->element_size()); // Configure kernel window - Window win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + Window win = + calculate_max_window(*src, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); // The CpuTranspose doesn't need padding so update_window_and_padding() can be skipped Coordinates coord; @@ -637,7 +769,7 @@ Status CpuTransposeKernel::validate(const ITensorInfo *src, const ITensorInfo *d "Element size not supported"); // Validate configured destination - if(dst->total_size() != 0) + if (dst->total_size() != 0) { const TensorShape dst_shape = misc::shape_calculator::compute_transposed_shape(*src); @@ -658,7 +790,7 @@ void CpuTransposeKernel::run_op(ITensorPack &tensors, const Window &window, cons const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); auto dst = tensors.get_tensor(TensorType::ACL_DST); - switch(src->info()->element_size()) + switch (src->info()->element_size()) { case 1: transpose_8bit_elements(src, dst, window); diff --git a/src/cpu/kernels/CpuTransposeKernel.h b/src/cpu/kernels/CpuTransposeKernel.h index cb85daeb40..e79a405677 100644 --- a/src/cpu/kernels/CpuTransposeKernel.h +++ b/src/cpu/kernels/CpuTransposeKernel.h @@ -54,7 +54,7 @@ public: static Status validate(const ITensorInfo *src, const ITensorInfo *dst); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; }; } // namespace kernels diff --git a/src/cpu/kernels/CpuWeightsReshapeKernel.cpp b/src/cpu/kernels/CpuWeightsReshapeKernel.cpp index 2ccc977995..297ba63826 100644 --- a/src/cpu/kernels/CpuWeightsReshapeKernel.cpp +++ b/src/cpu/kernels/CpuWeightsReshapeKernel.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -38,7 +39,7 @@ namespace { TensorShape get_output_shape(const ITensorInfo *src, bool has_bias) { - TensorShape output_shape{ src->tensor_shape() }; + TensorShape output_shape{src->tensor_shape()}; output_shape.collapse(3); const size_t tmp_dim = output_shape[0]; @@ -54,20 +55,22 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *biases, con //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions. 
ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - if(biases != nullptr) + if (biases != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(src->data_type())); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases); ARM_COMPUTE_RETURN_ERROR_ON((src->num_dimensions() == 4) && (biases->num_dimensions() != 1)); ARM_COMPUTE_RETURN_ERROR_ON((src->num_dimensions() == 5) && (biases->num_dimensions() != 2)); ARM_COMPUTE_RETURN_ERROR_ON((src->num_dimensions() == 4) && (biases->dimension(0) != src->tensor_shape()[3])); - ARM_COMPUTE_RETURN_ERROR_ON((src->num_dimensions() == 5) && (biases->dimension(0) != src->tensor_shape()[3] || biases->dimension(1) != src->tensor_shape()[4])); + ARM_COMPUTE_RETURN_ERROR_ON((src->num_dimensions() == 5) && (biases->dimension(0) != src->tensor_shape()[3] || + biases->dimension(1) != src->tensor_shape()[4])); } // Checks performed when output is configured - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), get_output_shape(src, biases != nullptr)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), + get_output_shape(src, biases != nullptr)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); } @@ -84,9 +87,7 @@ void CpuWeightsReshapeKernel::configure(const ITensorInfo *src, const ITensorInf auto_init_if_empty(*dst, src->clone()->set_tensor_shape(get_output_shape(src, (biases != nullptr)))); // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, - biases, - dst)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, biases, dst)); // Configure kernel Window window = calculate_max_window(*src, Steps()); @@ -122,44 +123,47 @@ void CpuWeightsReshapeKernel::run_op(ITensorPack &tensors, const Window &window, // Create iterators Iterator in(src, window); - execute_window_loop(window, [&](const Coordinates & id) - { - // Get column index - const int kernel_idx = id[3]; - const int kernel_idz = id[4]; - - // Setup pointers - const uint8_t *tmp_input_ptr = in.ptr(); - uint8_t *tmp_output_ptr = dst->ptr_to_element(Coordinates(kernel_idx, 0, kernel_idz)); - const uint8_t *curr_input_row_ptr = tmp_input_ptr; - const uint8_t *curr_input_depth_ptr = tmp_input_ptr; - - // Linearize volume - for(unsigned int d = 0; d < kernel_depth; ++d) + execute_window_loop( + window, + [&](const Coordinates &id) { - for(unsigned int j = 0; j < kernel_size_y; ++j) + // Get column index + const int kernel_idx = id[3]; + const int kernel_idz = id[4]; + + // Setup pointers + const uint8_t *tmp_input_ptr = in.ptr(); + uint8_t *tmp_output_ptr = dst->ptr_to_element(Coordinates(kernel_idx, 0, kernel_idz)); + const uint8_t *curr_input_row_ptr = tmp_input_ptr; + const uint8_t *curr_input_depth_ptr = tmp_input_ptr; + + // Linearize volume + for (unsigned int d = 0; d < kernel_depth; ++d) { - for(unsigned int i = 0; i < kernel_size_x; ++i) + for (unsigned int j = 0; j < kernel_size_y; ++j) { - std::memcpy(tmp_output_ptr, tmp_input_ptr, src->info()->element_size()); - tmp_input_ptr += input_stride_x; - tmp_output_ptr += output_stride_y; + for (unsigned int i = 0; i < kernel_size_x; ++i) + { + std::memcpy(tmp_output_ptr, tmp_input_ptr, src->info()->element_size()); + tmp_input_ptr += input_stride_x; + tmp_output_ptr += output_stride_y; + } + curr_input_row_ptr += input_stride_y; + tmp_input_ptr = curr_input_row_ptr; } - curr_input_row_ptr += 
input_stride_y; - tmp_input_ptr = curr_input_row_ptr; + curr_input_depth_ptr += input_stride_z; + curr_input_row_ptr = curr_input_depth_ptr; + tmp_input_ptr = curr_input_depth_ptr; } - curr_input_depth_ptr += input_stride_z; - curr_input_row_ptr = curr_input_depth_ptr; - tmp_input_ptr = curr_input_depth_ptr; - } - // Add bias - if(biases != nullptr) - { - std::memcpy(tmp_output_ptr, biases->ptr_to_element(Coordinates(kernel_idx, kernel_idz)), src->info()->element_size()); - } - }, - in); + // Add bias + if (biases != nullptr) + { + std::memcpy(tmp_output_ptr, biases->ptr_to_element(Coordinates(kernel_idx, kernel_idz)), + src->info()->element_size()); + } + }, + in); } const char *CpuWeightsReshapeKernel::name() const { @@ -167,4 +171,4 @@ const char *CpuWeightsReshapeKernel::name() const } } // namespace kernels } // namespace cpu -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuWeightsReshapeKernel.h b/src/cpu/kernels/CpuWeightsReshapeKernel.h index 1a260edc96..9310b3c784 100644 --- a/src/cpu/kernels/CpuWeightsReshapeKernel.h +++ b/src/cpu/kernels/CpuWeightsReshapeKernel.h @@ -82,7 +82,7 @@ public: static Status validate(const ITensorInfo *src, const ITensorInfo *biases, const ITensorInfo *dst); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; }; } // namespace kernels diff --git a/src/cpu/kernels/CpuWinogradConv2dKernel.cpp b/src/cpu/kernels/CpuWinogradConv2dKernel.cpp index 818d878119..52e3f2549c 100644 --- a/src/cpu/kernels/CpuWinogradConv2dKernel.cpp +++ b/src/cpu/kernels/CpuWinogradConv2dKernel.cpp @@ -28,8 +28,10 @@ namespace arm_compute { namespace cpu { -CpuWinogradConv2dTransformInputKernel::CpuWinogradConv2dTransformInputKernel(arm_conv::winograd::WinogradImpl &w_impl, arm_conv::ConvolutionArgs &_c_args, uint32_t nthreads) - : _winograd_impl{ w_impl }, _conv_args{ _c_args }, _nthreads{ nthreads } +CpuWinogradConv2dTransformInputKernel::CpuWinogradConv2dTransformInputKernel(arm_conv::winograd::WinogradImpl &w_impl, + arm_conv::ConvolutionArgs &_c_args, + uint32_t nthreads) + : _winograd_impl{w_impl}, _conv_args{_c_args}, _nthreads{nthreads} { } @@ -49,24 +51,20 @@ void CpuWinogradConv2dTransformInputKernel::run_op(ITensorPack &tensors, const W const size_t input_row_stride = src_strides[height_idx] / element_size_in_bytes; const size_t input_col_stride = src_strides[width_idx] / element_size_in_bytes; const size_t input_batch_stride = src_strides[batch_idx] / element_size_in_bytes; - const auto input_nhwc_ptr = reinterpret_cast<const void *>(input_nhwc->buffer() + input_nhwc->info()->offset_first_element_in_bytes()); - auto win_transf_ptr = reinterpret_cast<void *>(winograd_input_transform->buffer() + winograd_input_transform->info()->offset_first_element_in_bytes()); + const auto input_nhwc_ptr = + reinterpret_cast<const void *>(input_nhwc->buffer() + input_nhwc->info()->offset_first_element_in_bytes()); + auto win_transf_ptr = reinterpret_cast<void *>(winograd_input_transform->buffer() + + winograd_input_transform->info()->offset_first_element_in_bytes()); - _winograd_impl.input_transform->execute( - _conv_args, - input_nhwc_ptr, - input_batch_stride, - input_row_stride, - input_col_stride, - win_transf_ptr, - _winograd_impl.winograd_spec, - workspace->buffer(), - info.thread_id, - _nthreads); + _winograd_impl.input_transform->execute(_conv_args, input_nhwc_ptr, input_batch_stride, input_row_stride, + input_col_stride, win_transf_ptr, _winograd_impl.winograd_spec, + workspace->buffer(), info.thread_id, _nthreads); } -CpuWinogradConv2dTransformOutputKernel::CpuWinogradConv2dTransformOutputKernel(arm_conv::winograd::WinogradImpl &w_impl, arm_conv::ConvolutionArgs &_c_args, uint32_t nthreads) - : _winograd_impl{ w_impl }, _conv_args{ _c_args }, _nthreads{ nthreads } +CpuWinogradConv2dTransformOutputKernel::CpuWinogradConv2dTransformOutputKernel(arm_conv::winograd::WinogradImpl &w_impl, + arm_conv::ConvolutionArgs &_c_args, + uint32_t nthreads) + : _winograd_impl{w_impl}, _conv_args{_c_args}, _nthreads{nthreads} { } @@ -88,28 +86,21 @@ void CpuWinogradConv2dTransformOutputKernel::run_op(ITensorPack &tensors, const const size_t out_row_stride = dst_strides[height_idx] / 
element_size_in_bytes; const size_t out_col_stride = dst_strides[width_idx] / element_size_in_bytes; const size_t out_batch_stride = dst_strides[batch_idx] / element_size_in_bytes; - const auto wout_transf_ptr = reinterpret_cast<const void *>(winograd_output_transform->buffer() + winograd_output_transform->info()->offset_first_element_in_bytes()); - auto dst_nhwc_ptr = reinterpret_cast<void *>(dst_nhwc->buffer() + dst_nhwc->info()->offset_first_element_in_bytes()); - void *biases_data_ptr = nullptr; - if(biases != nullptr) + const auto wout_transf_ptr = reinterpret_cast<const void *>( + winograd_output_transform->buffer() + winograd_output_transform->info()->offset_first_element_in_bytes()); + auto dst_nhwc_ptr = + reinterpret_cast<void *>(dst_nhwc->buffer() + dst_nhwc->info()->offset_first_element_in_bytes()); + void *biases_data_ptr = nullptr; + if (biases != nullptr) { biases_data_ptr = reinterpret_cast<void *>(biases->buffer() + biases->info()->offset_first_element_in_bytes()); } // Output transform - _winograd_impl.output_transform->execute( - _conv_args, - wout_transf_ptr, - _winograd_impl.winograd_spec, - biases_data_ptr, - dst_nhwc_ptr, - out_batch_stride, - out_row_stride, - out_col_stride, - workspace->buffer(), - info.thread_id, - _nthreads); + _winograd_impl.output_transform->execute(_conv_args, wout_transf_ptr, _winograd_impl.winograd_spec, biases_data_ptr, + dst_nhwc_ptr, out_batch_stride, out_row_stride, out_col_stride, + workspace->buffer(), info.thread_id, _nthreads); } } // namespace cpu -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuWinogradConv2dKernel.h b/src/cpu/kernels/CpuWinogradConv2dKernel.h index 0170dcae22..8a3b745e85 100644 --- a/src/cpu/kernels/CpuWinogradConv2dKernel.h +++ b/src/cpu/kernels/CpuWinogradConv2dKernel.h @@ -30,6 +30,7 @@ #include "arm_compute/core/Steps.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/runtime/Tensor.h" + #include "src/core/NEON/kernels/assembly/winograd.hpp" #include "src/core/NEON/kernels/convolution/common/tensor.hpp" #include "src/cpu/ICpuKernel.h" @@ -53,7 +54,9 @@ public: /** Prevent instances of this class from being moved it contains references.*/ CpuWinogradConv2dTransformInputKernel &operator=(CpuWinogradConv2dTransformInputKernel &&) = delete; - CpuWinogradConv2dTransformInputKernel(arm_conv::winograd::WinogradImpl &w_impl, arm_conv::ConvolutionArgs &_c_args, uint32_t nthreads); + CpuWinogradConv2dTransformInputKernel(arm_conv::winograd::WinogradImpl &w_impl, + arm_conv::ConvolutionArgs &_c_args, + uint32_t nthreads); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; @@ -83,7 +86,9 @@ public: /** Prevent instances of this class from being moved it contains references.*/ CpuWinogradConv2dTransformOutputKernel &operator=(CpuWinogradConv2dTransformOutputKernel &&) = delete; - CpuWinogradConv2dTransformOutputKernel(arm_conv::winograd::WinogradImpl &w_impl, arm_conv::ConvolutionArgs &_c_args, uint32_t nthreads); + CpuWinogradConv2dTransformOutputKernel(arm_conv::winograd::WinogradImpl &w_impl, + arm_conv::ConvolutionArgs &_c_args, + uint32_t nthreads); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; @@ -95,7 +100,7 @@ public: private: arm_conv::winograd::WinogradImpl &_winograd_impl; - const arm_conv::ConvolutionArgs &_conv_args; + const arm_conv::ConvolutionArgs &_conv_args; uint32_t _nthreads; }; diff --git a/src/cpu/kernels/activation/generic/neon/fp16.cpp b/src/cpu/kernels/activation/generic/neon/fp16.cpp index e51b5b3423..ddc6dc24cd 100644 --- a/src/cpu/kernels/activation/generic/neon/fp16.cpp +++ b/src/cpu/kernels/activation/generic/neon/fp16.cpp @@ -31,7 +31,7 @@ namespace cpu { namespace { -constexpr ActFpImplParams Fp16Params = { static_cast<float16_t>(1e-7), 8 }; +constexpr ActFpImplParams Fp16Params = {static_cast<float16_t>(1e-7), 8}; } // namespace void neon_fp16_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) @@ -40,4 +40,4 @@ void neon_fp16_activation(const ITensor *src, ITensor *dst, const ActivationLaye } } // namespace cpu } // namespace arm_compute -#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
\ No newline at end of file +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/activation/generic/neon/fp32.cpp b/src/cpu/kernels/activation/generic/neon/fp32.cpp index 2a3b8a0bfd..e558f8c73e 100644 --- a/src/cpu/kernels/activation/generic/neon/fp32.cpp +++ b/src/cpu/kernels/activation/generic/neon/fp32.cpp @@ -29,7 +29,7 @@ namespace cpu { namespace { -constexpr ActFpImplParams Fp32Params = { static_cast<float>(1e-24), 4 }; +constexpr ActFpImplParams Fp32Params = {static_cast<float>(1e-24), 4}; } // namespace void neon_fp32_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) { diff --git a/src/cpu/kernels/activation/generic/neon/impl.h b/src/cpu/kernels/activation/generic/neon/impl.h index 05885d8476..afeb6f7f3d 100644 --- a/src/cpu/kernels/activation/generic/neon/impl.h +++ b/src/cpu/kernels/activation/generic/neon/impl.h @@ -24,6 +24,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Window.h" #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute { @@ -56,10 +57,14 @@ inline float16x8_t mask_float_vector(const float16x8_t &in, const uint16x8_t &ma #endif /* __aarch64__ */ template <typename T, const ActFpImplParams &P> -void fp_neon_activation_impl(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) +void fp_neon_activation_impl(const ITensor *src, + ITensor *dst, + const ActivationLayerInfo &act_info, + const Window &window) { /** SIMD vector tag type. */ - using ExactTagType = typename arm_compute::wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>; + using ExactTagType = + typename arm_compute::wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>; constexpr int window_step_x = P.step_x; const auto window_start_x = static_cast<int>(window.x().start()); const auto window_end_x = static_cast<int>(window.x().end()); @@ -72,12 +77,12 @@ void fp_neon_activation_impl(const ITensor *src, ITensor *dst, const ActivationL // to prevent NAN values caused by zeros in inputs to SQRT. // In case of aarh64, we call vsqrt directly, so we don't use delta. 
#ifndef __aarch64__ - const auto delta = wrapper::vdup_n(static_cast<T>(P.delta), ExactTagType {}); + const auto delta = wrapper::vdup_n(static_cast<T>(P.delta), ExactTagType{}); #else /* #ifndef __aarch64__ */ - const auto const_inv_2 = wrapper::vdup_n(static_cast<T>(0.5f), ExactTagType {}); + const auto const_inv_2 = wrapper::vdup_n(static_cast<T>(0.5f), ExactTagType{}); const auto const_inv_sqrt_2 = wrapper::vdup_n(static_cast<T>(0.70710678118f), ExactTagType{}); #endif /* __aarch64__ */ - const auto const_1 = wrapper::vdup_n(static_cast<T>(1.f), ExactTagType {}); + const auto const_1 = wrapper::vdup_n(static_cast<T>(1.f), ExactTagType{}); const auto const_0 = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{}); const auto const_6 = wrapper::vdup_n(static_cast<T>(6.f), ExactTagType{}); const auto const_3 = wrapper::vdup_n(static_cast<T>(3.f), ExactTagType{}); @@ -88,143 +93,154 @@ void fp_neon_activation_impl(const ITensor *src, ITensor *dst, const ActivationL const auto vb = wrapper::vdup_n(static_cast<T>(act_info.b()), ExactTagType{}); const auto a = static_cast<T>(act_info.a()); const auto b = static_cast<T>(act_info.b()); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast<const T *>(input.ptr()); - const auto output_ptr = reinterpret_cast<T *>(output.ptr()); - wrapper::traits::neon_bitvector_t<T, wrapper::traits::BitWidth::W128> tmp; - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { - const auto vin = wrapper::vloadq(input_ptr + x); - switch(act) + const auto input_ptr = reinterpret_cast<const T *>(input.ptr()); + const auto output_ptr = reinterpret_cast<T *>(output.ptr()); + wrapper::traits::neon_bitvector_t<T, wrapper::traits::BitWidth::W128> tmp; + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - case ActivationLayerInfo::ActivationFunction::ABS: - tmp = wrapper::vabs(vin); - break; - case ActivationLayerInfo::ActivationFunction::LINEAR: - tmp = wrapper::vmla(vb, va, vin); - break; - case ActivationLayerInfo::ActivationFunction::LOGISTIC: - tmp = wrapper::vinv(wrapper::vadd(const_1, wrapper::vexpq(wrapper::vneg(vin)))); - break; - case ActivationLayerInfo::ActivationFunction::RELU: - tmp = wrapper::vmax(const_0, vin); - break; - case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: - tmp = wrapper::vmin(va, wrapper::vmax(const_0, vin)); - break; - case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: - tmp = wrapper::vmin(va, wrapper::vmax(vb, vin)); - break; - case ActivationLayerInfo::ActivationFunction::LEAKY_RELU: - tmp = wrapper::vbsl(wrapper::vcgt(vin, const_0), vin, wrapper::vmul(va, vin)); - break; - case ActivationLayerInfo::ActivationFunction::SOFT_RELU: - tmp = wrapper::vbsl(wrapper::vcgt(vin, vsoft_relu_thresh), vin, wrapper::vlog(wrapper::vadd(const_1, wrapper::vexpq(vin)))); - break; - case ActivationLayerInfo::ActivationFunction::ELU: - tmp = wrapper::vbsl(wrapper::vcge(vin, const_0), vin, wrapper::vmul(va, wrapper::vsub(wrapper::vexpq(vin), const_1))); - break; - case ActivationLayerInfo::ActivationFunction::SQRT: + const auto vin = wrapper::vloadq(input_ptr + x); + switch (act) + { + case ActivationLayerInfo::ActivationFunction::ABS: + tmp = wrapper::vabs(vin); + break; + case ActivationLayerInfo::ActivationFunction::LINEAR: + tmp = wrapper::vmla(vb, va, vin); + 
break; + case ActivationLayerInfo::ActivationFunction::LOGISTIC: + tmp = wrapper::vinv(wrapper::vadd(const_1, wrapper::vexpq(wrapper::vneg(vin)))); + break; + case ActivationLayerInfo::ActivationFunction::RELU: + tmp = wrapper::vmax(const_0, vin); + break; + case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: + tmp = wrapper::vmin(va, wrapper::vmax(const_0, vin)); + break; + case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: + tmp = wrapper::vmin(va, wrapper::vmax(vb, vin)); + break; + case ActivationLayerInfo::ActivationFunction::LEAKY_RELU: + tmp = wrapper::vbsl(wrapper::vcgt(vin, const_0), vin, wrapper::vmul(va, vin)); + break; + case ActivationLayerInfo::ActivationFunction::SOFT_RELU: + tmp = wrapper::vbsl(wrapper::vcgt(vin, vsoft_relu_thresh), vin, + wrapper::vlog(wrapper::vadd(const_1, wrapper::vexpq(vin)))); + break; + case ActivationLayerInfo::ActivationFunction::ELU: + tmp = wrapper::vbsl(wrapper::vcge(vin, const_0), vin, + wrapper::vmul(va, wrapper::vsub(wrapper::vexpq(vin), const_1))); + break; + case ActivationLayerInfo::ActivationFunction::SQRT: #ifdef __aarch64__ - tmp = wrapper::vsqrt(vin); + tmp = wrapper::vsqrt(vin); #else /* __aarch64__ */ { const auto bitmask = wrapper::vceq(vin, wrapper::vdup_n(0.f, ExactTagType{})); - tmp = wrapper::vinv(wrapper::vinvsqrt(wrapper::vadd(vin, mask_float_vector(delta, bitmask)))); - tmp = mask_float_vector(tmp, wrapper::vnot(bitmask)); + tmp = wrapper::vinv(wrapper::vinvsqrt(wrapper::vadd(vin, mask_float_vector(delta, bitmask)))); + tmp = mask_float_vector(tmp, wrapper::vnot(bitmask)); } #endif /* __aarch64__ */ - break; - case ActivationLayerInfo::ActivationFunction::SQUARE: - tmp = wrapper::vmul(vin, vin); - break; - case ActivationLayerInfo::ActivationFunction::TANH: - tmp = wrapper::vmul(va, wrapper::vtanh(wrapper::vmul(vb, vin))); - break; - case ActivationLayerInfo::ActivationFunction::IDENTITY: - tmp = vin; - break; - case ActivationLayerInfo::ActivationFunction::HARD_SWISH: - tmp = wrapper::vmul(vin, wrapper::vmul(const_inv_6, wrapper::vmin(const_6, wrapper::vmax(const_0, wrapper::vadd(vin, const_3))))); - break; - case ActivationLayerInfo::ActivationFunction::SWISH: - tmp = wrapper::vmul(vin, wrapper::vinv(wrapper::vadd(const_1, wrapper::vexpq(wrapper::vneg(wrapper::vmul(va, vin)))))); - break; + break; + case ActivationLayerInfo::ActivationFunction::SQUARE: + tmp = wrapper::vmul(vin, vin); + break; + case ActivationLayerInfo::ActivationFunction::TANH: + tmp = wrapper::vmul(va, wrapper::vtanh(wrapper::vmul(vb, vin))); + break; + case ActivationLayerInfo::ActivationFunction::IDENTITY: + tmp = vin; + break; + case ActivationLayerInfo::ActivationFunction::HARD_SWISH: + tmp = wrapper::vmul( + vin, + wrapper::vmul(const_inv_6, + wrapper::vmin(const_6, wrapper::vmax(const_0, wrapper::vadd(vin, const_3))))); + break; + case ActivationLayerInfo::ActivationFunction::SWISH: + tmp = wrapper::vmul(vin, wrapper::vinv(wrapper::vadd( + const_1, wrapper::vexpq(wrapper::vneg(wrapper::vmul(va, vin)))))); + break; #ifdef __aarch64__ - case ActivationLayerInfo::ActivationFunction::GELU: - tmp = wrapper::vmul(vin, wrapper::vmul(const_inv_2, wrapper::vadd(const_1, wrapper::verf(wrapper::vmul(vin, const_inv_sqrt_2))))); - break; + case ActivationLayerInfo::ActivationFunction::GELU: + tmp = wrapper::vmul( + vin, + wrapper::vmul(const_inv_2, + wrapper::vadd(const_1, wrapper::verf(wrapper::vmul(vin, const_inv_sqrt_2))))); + break; #endif /* __aarch64__ */ - default: - ARM_COMPUTE_ERROR("Unsupported activation function"); + default: + 
ARM_COMPUTE_ERROR("Unsupported activation function"); + } + wrapper::vstore(output_ptr + x, tmp); } - wrapper::vstore(output_ptr + x, tmp); - } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const T in = *(reinterpret_cast<const T *>(input_ptr + x)); - T tmp; - switch(act) + // Compute left-over elements + for (; x < window_end_x; ++x) { - case ActivationLayerInfo::ActivationFunction::ABS: - tmp = std::abs(in); - break; - case ActivationLayerInfo::ActivationFunction::LINEAR: - tmp = a * in + b; - break; - case ActivationLayerInfo::ActivationFunction::LOGISTIC: - tmp = static_cast<T>(1) / (static_cast<T>(1) + std::exp(-in)); - break; - case ActivationLayerInfo::ActivationFunction::RELU: - tmp = std::max<T>(static_cast<T>(0), in); - break; - case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: - tmp = std::min<T>(a, std::max(static_cast<T>(0), in)); - break; - case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: - tmp = std::min<T>(a, std::max<T>(b, in)); - break; - case ActivationLayerInfo::ActivationFunction::LEAKY_RELU: - tmp = (in > 0) ? in : a * in; - break; - case ActivationLayerInfo::ActivationFunction::SOFT_RELU: - tmp = (in > soft_relu_thresh) ? in : std::log(static_cast<T>(1) + std::exp(in)); - break; - case ActivationLayerInfo::ActivationFunction::ELU: - tmp = (in >= 0) ? in : a * (std::exp(in) - 1); - break; - case ActivationLayerInfo::ActivationFunction::SQRT: - tmp = std::sqrt(in); - break; - case ActivationLayerInfo::ActivationFunction::SQUARE: - tmp = in * in; - break; - case ActivationLayerInfo::ActivationFunction::TANH: - tmp = a * std::tanh(b * in); - break; - case ActivationLayerInfo::ActivationFunction::IDENTITY: - tmp = in; - break; - case ActivationLayerInfo::ActivationFunction::HARD_SWISH: - tmp = in * ((std::min(std::max((in + 3), 0.0f), 6.0f)) * 0.166666667f); - break; - case ActivationLayerInfo::ActivationFunction::SWISH: - tmp = in / (static_cast<T>(1) + std::exp(-a * in)); - break; - case ActivationLayerInfo::ActivationFunction::GELU: - tmp = in * static_cast<T>(0.5f * (1.0f + erff(static_cast<float>(in) / 1.41421356237f))); - break; - default: - ARM_COMPUTE_ERROR("Unsupported activation function"); + const T in = *(reinterpret_cast<const T *>(input_ptr + x)); + T tmp; + switch (act) + { + case ActivationLayerInfo::ActivationFunction::ABS: + tmp = std::abs(in); + break; + case ActivationLayerInfo::ActivationFunction::LINEAR: + tmp = a * in + b; + break; + case ActivationLayerInfo::ActivationFunction::LOGISTIC: + tmp = static_cast<T>(1) / (static_cast<T>(1) + std::exp(-in)); + break; + case ActivationLayerInfo::ActivationFunction::RELU: + tmp = std::max<T>(static_cast<T>(0), in); + break; + case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: + tmp = std::min<T>(a, std::max(static_cast<T>(0), in)); + break; + case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: + tmp = std::min<T>(a, std::max<T>(b, in)); + break; + case ActivationLayerInfo::ActivationFunction::LEAKY_RELU: + tmp = (in > 0) ? in : a * in; + break; + case ActivationLayerInfo::ActivationFunction::SOFT_RELU: + tmp = (in > soft_relu_thresh) ? in : std::log(static_cast<T>(1) + std::exp(in)); + break; + case ActivationLayerInfo::ActivationFunction::ELU: + tmp = (in >= 0) ? 
in : a * (std::exp(in) - 1); + break; + case ActivationLayerInfo::ActivationFunction::SQRT: + tmp = std::sqrt(in); + break; + case ActivationLayerInfo::ActivationFunction::SQUARE: + tmp = in * in; + break; + case ActivationLayerInfo::ActivationFunction::TANH: + tmp = a * std::tanh(b * in); + break; + case ActivationLayerInfo::ActivationFunction::IDENTITY: + tmp = in; + break; + case ActivationLayerInfo::ActivationFunction::HARD_SWISH: + tmp = in * ((std::min(std::max((in + 3), 0.0f), 6.0f)) * 0.166666667f); + break; + case ActivationLayerInfo::ActivationFunction::SWISH: + tmp = in / (static_cast<T>(1) + std::exp(-a * in)); + break; + case ActivationLayerInfo::ActivationFunction::GELU: + tmp = in * static_cast<T>(0.5f * (1.0f + erff(static_cast<float>(in) / 1.41421356237f))); + break; + default: + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + *(output_ptr + x) = tmp; } - *(output_ptr + x) = tmp; - } - }, - input, output); + }, + input, output); } } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/activation/generic/neon/lut.cpp b/src/cpu/kernels/activation/generic/neon/lut.cpp index c973e964e4..f289c80d4b 100644 --- a/src/cpu/kernels/activation/generic/neon/lut.cpp +++ b/src/cpu/kernels/activation/generic/neon/lut.cpp @@ -24,6 +24,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/cpu/kernels/lut/list.h" namespace arm_compute @@ -33,19 +34,22 @@ namespace cpu #ifdef __aarch64__ void neon_q8_activation_lut(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) { - ARM_COMPUTE_ERROR_ON(src->info()->data_type() != DataType::QASYMM8 && src->info()->data_type() != DataType::QASYMM8_SIGNED); + ARM_COMPUTE_ERROR_ON(src->info()->data_type() != DataType::QASYMM8 && + src->info()->data_type() != DataType::QASYMM8_SIGNED); const auto window_end_x = window.x().end(); Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); Iterator input(src, win_collapsed); Iterator output(dst, win_collapsed); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); - auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr()); - lut_u8_neon(act_info.lut().data(), 1u, window_end_x, &input_ptr, &output_ptr); - }, - input, output); + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); + auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr()); + lut_u8_neon(act_info.lut().data(), 1u, window_end_x, &input_ptr, &output_ptr); + }, + input, output); } #endif // __aarch64__ } // namespace cpu diff --git a/src/cpu/kernels/activation/generic/neon/qasymm8.cpp b/src/cpu/kernels/activation/generic/neon/qasymm8.cpp index e7c146e46f..1451301ea2 100644 --- a/src/cpu/kernels/activation/generic/neon/qasymm8.cpp +++ b/src/cpu/kernels/activation/generic/neon/qasymm8.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Window.h" #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/NEON/NEAsymm.h" #include "src/core/NEON/NEMath.h" #include "src/core/NEON/wrapper/wrapper.h" @@ -38,7 +39,10 @@ namespace arm_compute { namespace cpu { -void neon_qasymm8_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) +void neon_qasymm8_activation(const 
ITensor *src, + ITensor *dst, + const ActivationLayerInfo &act_info, + const Window &window) { constexpr int window_step_x = 16; const auto window_start_x = static_cast<int>(window.x().start()); @@ -85,206 +89,222 @@ void neon_qasymm8_activation(const ITensor *src, ITensor *dst, const ActivationL float32x4_t vs = vdupq_n_f32(s); float32x4_t vo = vdupq_n_f32(o); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast<const qasymm8_t *>(input.ptr()); - const auto output_ptr = reinterpret_cast<qasymm8_t *>(output.ptr()); + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast<const qasymm8_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<qasymm8_t *>(output.ptr()); - wrapper::traits::neon_bitvector_t<qasymm8_t, wrapper::traits::BitWidth::W128> tmp; + wrapper::traits::neon_bitvector_t<qasymm8_t, wrapper::traits::BitWidth::W128> tmp; - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto vin = wrapper::vloadq(input_ptr + x); - if(act == ActivationLayerInfo::ActivationFunction::RELU) - { - // Perform activation - tmp = vmaxq_u8(vconst_0, vin); - // Re-quantize to new output space - tmp = vmlaq_qasymm8<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo); - } - else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) - { - // Perform activation - tmp = vminq_u8(va, vmaxq_u8(vconst_0, vin)); - // Re-quantize to new output space - tmp = vmlaq_qasymm8<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo); - } - else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - // Perform activation - tmp = vminq_u8(va, vmaxq_u8(vb, vin)); - // Re-quantize to new output space - tmp = vmlaq_qasymm8<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo); - } + const auto vin = wrapper::vloadq(input_ptr + x); + if (act == ActivationLayerInfo::ActivationFunction::RELU) + { + // Perform activation + tmp = vmaxq_u8(vconst_0, vin); + // Re-quantize to new output space + tmp = vmlaq_qasymm8<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo); + } + else if (act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) + { + // Perform activation + tmp = vminq_u8(va, vmaxq_u8(vconst_0, vin)); + // Re-quantize to new output space + tmp = vmlaq_qasymm8<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo); + } + else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + { + // Perform activation + tmp = vminq_u8(va, vmaxq_u8(vb, vin)); + // Re-quantize to new output space + tmp = vmlaq_qasymm8<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo); + } #ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead. 
- else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) - { - // De-quantize - const auto vin_deq = vdequantize(vin, qi_in); - // Perform activation - const float32x4x4_t tmp_dep = + else if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC) { - { + // De-quantize + const auto vin_deq = vdequantize(vin, qi_in); + // Perform activation + const float32x4x4_t tmp_dep = {{ wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[0])))), wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[1])))), wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[2])))), wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[3])))), - } - }; - // Re-quantize to new output space - tmp = vquantize(tmp_dep, qi_out); - } + }}; + // Re-quantize to new output space + tmp = vquantize(tmp_dep, qi_out); + } #endif // __aarch64__ - else if(act == ActivationLayerInfo::ActivationFunction::TANH) - { - // De-quantize - const auto vin_deq = vdequantize(vin, qi_in); - // Perform activation - const float32x4x4_t tmp_dep = + else if (act == ActivationLayerInfo::ActivationFunction::TANH) { - { + // De-quantize + const auto vin_deq = vdequantize(vin, qi_in); + // Perform activation + const float32x4x4_t tmp_dep = {{ wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))), wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))), wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[2], vb_f32))), wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[3], vb_f32))), - } - }; - // Re-quantize to new output space - tmp = vquantize(tmp_dep, qi_out); - } + }}; + // Re-quantize to new output space + tmp = vquantize(tmp_dep, qi_out); + } #ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead. 
- else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) - { - // De-quantize - const auto vin_deq = vdequantize(vin, qi_in); - // Perform activation - const float32x4x4_t tmp_dep = + else if (act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) { - { - wrapper::vmul(vin_deq.val[0], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[0], const_3_f32))))), - wrapper::vmul(vin_deq.val[1], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[1], const_3_f32))))), - wrapper::vmul(vin_deq.val[2], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[2], const_3_f32))))), - wrapper::vmul(vin_deq.val[3], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[3], const_3_f32))))), - } - }; - // Re-quantize to new output space - tmp = vquantize(tmp_dep, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) - { - const auto vin_deq = vdequantize(vin, qi_in); - - const uint32x4x4_t pos_mask = + // De-quantize + const auto vin_deq = vdequantize(vin, qi_in); + // Perform activation + const float32x4x4_t tmp_dep = {{ + wrapper::vmul( + vin_deq.val[0], + wrapper::vmul( + const_inv_6_f32, + wrapper::vmin(const_6_f32, + wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[0], const_3_f32))))), + wrapper::vmul( + vin_deq.val[1], + wrapper::vmul( + const_inv_6_f32, + wrapper::vmin(const_6_f32, + wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[1], const_3_f32))))), + wrapper::vmul( + vin_deq.val[2], + wrapper::vmul( + const_inv_6_f32, + wrapper::vmin(const_6_f32, + wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[2], const_3_f32))))), + wrapper::vmul( + vin_deq.val[3], + wrapper::vmul( + const_inv_6_f32, + wrapper::vmin(const_6_f32, + wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[3], const_3_f32))))), + }}; + // Re-quantize to new output space + tmp = vquantize(tmp_dep, qi_out); + } + else if (act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) { - { + const auto vin_deq = vdequantize(vin, qi_in); + + const uint32x4x4_t pos_mask = {{ wrapper::vcgt(vin_deq.val[0], vconst_0_f32), wrapper::vcgt(vin_deq.val[1], vconst_0_f32), wrapper::vcgt(vin_deq.val[2], vconst_0_f32), wrapper::vcgt(vin_deq.val[3], vconst_0_f32), - } - }; + }}; - const float32x4x4_t tmp_dep = - { - { + const float32x4x4_t tmp_dep = {{ wrapper::vbsl(pos_mask.val[0], vin_deq.val[0], wrapper::vmul(va_f32, vin_deq.val[0])), wrapper::vbsl(pos_mask.val[1], vin_deq.val[1], wrapper::vmul(va_f32, vin_deq.val[1])), wrapper::vbsl(pos_mask.val[2], vin_deq.val[2], wrapper::vmul(va_f32, vin_deq.val[2])), wrapper::vbsl(pos_mask.val[3], vin_deq.val[3], wrapper::vmul(va_f32, vin_deq.val[3])), - } - }; + }}; - tmp = vquantize(tmp_dep, qi_out); - } + tmp = vquantize(tmp_dep, qi_out); + } #else // #ifndef __aarch64__ - else if (act == ActivationLayerInfo::ActivationFunction::GELU) - { - const auto vin_deq = vdequantize(vin, qi_in); - // Perform activation - const float32x4x4_t tmp_dep = + else if (act == ActivationLayerInfo::ActivationFunction::GELU) { - { - wrapper::vmul(vin_deq.val[0], wrapper::vmul(const_inv_2, wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul(vin_deq.val[0], const_inv_sqrt_2))))), - wrapper::vmul(vin_deq.val[1], wrapper::vmul(const_inv_2, wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul(vin_deq.val[1], const_inv_sqrt_2))))), - 
wrapper::vmul(vin_deq.val[2], wrapper::vmul(const_inv_2, wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul(vin_deq.val[2], const_inv_sqrt_2))))), - wrapper::vmul(vin_deq.val[3], wrapper::vmul(const_inv_2, wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul(vin_deq.val[3], const_inv_sqrt_2))))), - } - }; - // Re-quantize to new output space - tmp = vquantize(tmp_dep, qi_out); - } + const auto vin_deq = vdequantize(vin, qi_in); + // Perform activation + const float32x4x4_t tmp_dep = {{ + wrapper::vmul(vin_deq.val[0], + wrapper::vmul(const_inv_2, + wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul( + vin_deq.val[0], const_inv_sqrt_2))))), + wrapper::vmul(vin_deq.val[1], + wrapper::vmul(const_inv_2, + wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul( + vin_deq.val[1], const_inv_sqrt_2))))), + wrapper::vmul(vin_deq.val[2], + wrapper::vmul(const_inv_2, + wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul( + vin_deq.val[2], const_inv_sqrt_2))))), + wrapper::vmul(vin_deq.val[3], + wrapper::vmul(const_inv_2, + wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul( + vin_deq.val[3], const_inv_sqrt_2))))), + }}; + // Re-quantize to new output space + tmp = vquantize(tmp_dep, qi_out); + } #endif // __aarch64__ - else - { - ARM_COMPUTE_ERROR("Unsupported activation function"); + else + { + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + wrapper::vstore(output_ptr + x, tmp); } - wrapper::vstore(output_ptr + x, tmp); - } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - qasymm8_t in = *(reinterpret_cast<const qasymm8_t *>(input_ptr + x)); - qasymm8_t tmp = 0; - if(act == ActivationLayerInfo::ActivationFunction::RELU) - { - tmp = std::max(const_0, in); - tmp = utility::clamp<int32_t, qasymm8_t>(support::cpp11::lround(tmp * s + o)); - } - else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) + // Compute left-over elements + for (; x < window_end_x; ++x) { - tmp = std::min(a, std::max(const_0, in)); - tmp = utility::clamp<int32_t, qasymm8_t>(support::cpp11::lround(tmp * s + o)); - } - else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) - { - tmp = std::min(a, std::max(b, in)); - tmp = utility::clamp<int32_t, qasymm8_t>(support::cpp11::lround(tmp * s + o)); - } + qasymm8_t in = *(reinterpret_cast<const qasymm8_t *>(input_ptr + x)); + qasymm8_t tmp = 0; + if (act == ActivationLayerInfo::ActivationFunction::RELU) + { + tmp = std::max(const_0, in); + tmp = utility::clamp<int32_t, qasymm8_t>(support::cpp11::lround(tmp * s + o)); + } + else if (act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) + { + tmp = std::min(a, std::max(const_0, in)); + tmp = utility::clamp<int32_t, qasymm8_t>(support::cpp11::lround(tmp * s + o)); + } + else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + { + tmp = std::min(a, std::max(b, in)); + tmp = utility::clamp<int32_t, qasymm8_t>(support::cpp11::lround(tmp * s + o)); + } #ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead. 
- else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) - { - float tmp_f = dequantize_qasymm8(in, qi_in); - tmp_f = 1.f / (1.f + std::exp(-tmp_f)); - tmp = quantize_qasymm8(tmp_f, qi_out); - } + else if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC) + { + float tmp_f = dequantize_qasymm8(in, qi_in); + tmp_f = 1.f / (1.f + std::exp(-tmp_f)); + tmp = quantize_qasymm8(tmp_f, qi_out); + } #endif // __aarch64__ - else if(act == ActivationLayerInfo::ActivationFunction::TANH) - { - float tmp_f = dequantize_qasymm8(in, qi_in); - tmp_f = a_f32 * std::tanh(b_f32 * tmp_f); - tmp = quantize_qasymm8(tmp_f, qi_out); - } + else if (act == ActivationLayerInfo::ActivationFunction::TANH) + { + float tmp_f = dequantize_qasymm8(in, qi_in); + tmp_f = a_f32 * std::tanh(b_f32 * tmp_f); + tmp = quantize_qasymm8(tmp_f, qi_out); + } #ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead. - else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) - { - float tmp_f = dequantize_qasymm8(in, qi_in); - tmp_f = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f); - tmp = quantize_qasymm8(tmp_f, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) - { - float tmp_f = dequantize_qasymm8(in, qi_in); - tmp_f = tmp_f > 0 ? tmp_f : tmp_f * a_f32; - tmp = quantize_qasymm8(tmp_f, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::GELU) - { - float tmp_f = dequantize_qasymm8(in, qi_in); - tmp = tmp_f * 0.5f * (1.0f + std::erff(in / 1.41421356237f)); - tmp = quantize_qasymm8(tmp_f, qi_out); - } + else if (act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) + { + float tmp_f = dequantize_qasymm8(in, qi_in); + tmp_f = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f); + tmp = quantize_qasymm8(tmp_f, qi_out); + } + else if (act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) + { + float tmp_f = dequantize_qasymm8(in, qi_in); + tmp_f = tmp_f > 0 ? 
tmp_f : tmp_f * a_f32; + tmp = quantize_qasymm8(tmp_f, qi_out); + } + else if (act == ActivationLayerInfo::ActivationFunction::GELU) + { + float tmp_f = dequantize_qasymm8(in, qi_in); + tmp = tmp_f * 0.5f * (1.0f + std::erff(in / 1.41421356237f)); + tmp = quantize_qasymm8(tmp_f, qi_out); + } #endif // __aarch64__ - else - { - ARM_COMPUTE_ERROR("Unsupported activation function"); + else + { + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + *(output_ptr + x) = tmp; } - *(output_ptr + x) = tmp; - } - }, - input, output); + }, + input, output); } } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/activation/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/activation/generic/neon/qasymm8_signed.cpp index 52c396459b..a2f588245a 100644 --- a/src/cpu/kernels/activation/generic/neon/qasymm8_signed.cpp +++ b/src/cpu/kernels/activation/generic/neon/qasymm8_signed.cpp @@ -24,6 +24,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Window.h" #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/NEON/NEAsymm.h" #include "src/core/NEON/NEMath.h" #include "src/core/NEON/wrapper/wrapper.h" @@ -36,7 +37,10 @@ namespace arm_compute { namespace cpu { -void neon_qasymm8_signed_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) +void neon_qasymm8_signed_activation(const ITensor *src, + ITensor *dst, + const ActivationLayerInfo &act_info, + const Window &window) { constexpr int window_step_x = 16; const auto window_start_x = static_cast<int>(window.x().start()); @@ -76,191 +80,195 @@ void neon_qasymm8_signed_activation(const ITensor *src, ITensor *dst, const Acti float32x4_t vs = vdupq_n_f32(s); float32x4_t vo = vdupq_n_f32(o); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast<const qasymm8_signed_t *>(input.ptr()); - const auto output_ptr = reinterpret_cast<qasymm8_signed_t *>(output.ptr()); + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast<const qasymm8_signed_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<qasymm8_signed_t *>(output.ptr()); - wrapper::traits::neon_bitvector_t<qasymm8_signed_t, wrapper::traits::BitWidth::W128> tmp; + wrapper::traits::neon_bitvector_t<qasymm8_signed_t, wrapper::traits::BitWidth::W128> tmp; - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto vin = wrapper::vloadq(input_ptr + x); - if(act == ActivationLayerInfo::ActivationFunction::RELU) - { - // Perform activation - tmp = vmaxq_s8(vconst_0, vin); - // Re-quantize to new output space - tmp = vmlaq_qasymm8_signed<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo); - } - else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) - { - // Perform activation - tmp = vminq_s8(va, vmaxq_s8(vconst_0, vin)); - // Re-quantize to new output space - tmp = vmlaq_qasymm8_signed<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo); - } - else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - // Perform activation - tmp = vminq_s8(va, vmaxq_s8(vb, vin)); - // Re-quantize to new output space - tmp = vmlaq_qasymm8_signed<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo); - } + const auto vin = wrapper::vloadq(input_ptr + x); + if (act == 
ActivationLayerInfo::ActivationFunction::RELU) + { + // Perform activation + tmp = vmaxq_s8(vconst_0, vin); + // Re-quantize to new output space + tmp = vmlaq_qasymm8_signed<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo); + } + else if (act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) + { + // Perform activation + tmp = vminq_s8(va, vmaxq_s8(vconst_0, vin)); + // Re-quantize to new output space + tmp = vmlaq_qasymm8_signed<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo); + } + else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + { + // Perform activation + tmp = vminq_s8(va, vmaxq_s8(vb, vin)); + // Re-quantize to new output space + tmp = vmlaq_qasymm8_signed<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo); + } #ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead. - else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) - { - // De-quantize - const auto vin_deq = vdequantize(vin, qi_in); - // Perform activation - const float32x4x4_t tmp_dep = + else if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC) { - { + // De-quantize + const auto vin_deq = vdequantize(vin, qi_in); + // Perform activation + const float32x4x4_t tmp_dep = {{ wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[0])))), wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[1])))), wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[2])))), wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[3])))), - } - }; - // Re-quantize to new output space - tmp = vquantize_signed(tmp_dep, qi_out); - } + }}; + // Re-quantize to new output space + tmp = vquantize_signed(tmp_dep, qi_out); + } #endif // __aarch64__ - else if(act == ActivationLayerInfo::ActivationFunction::TANH) - { - // De-quantize - const auto vin_deq = vdequantize(vin, qi_in); - // Perform activation - const float32x4x4_t tmp_dep = + else if (act == ActivationLayerInfo::ActivationFunction::TANH) { - { + // De-quantize + const auto vin_deq = vdequantize(vin, qi_in); + // Perform activation + const float32x4x4_t tmp_dep = {{ wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))), wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))), wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[2], vb_f32))), wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[3], vb_f32))), - } - }; - // Re-quantize to new output space - tmp = vquantize_signed(tmp_dep, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) - { - // De-quantize - const auto vin_deq = vdequantize(vin, qi_in); - // Perform activation - const float32x4x4_t tmp_dep = + }}; + // Re-quantize to new output space + tmp = vquantize_signed(tmp_dep, qi_out); + } + else if (act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) { - { - wrapper::vmul(vin_deq.val[0], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[0], const_3_f32))))), - wrapper::vmul(vin_deq.val[1], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[1], const_3_f32))))), - wrapper::vmul(vin_deq.val[2], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[2], const_3_f32))))), - wrapper::vmul(vin_deq.val[3], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, 
wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[3], const_3_f32))))), - } - }; - // Re-quantize to new output space - tmp = vquantize_signed(tmp_dep, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) - { - const auto vin_deq = vdequantize(vin, qi_in); + // De-quantize + const auto vin_deq = vdequantize(vin, qi_in); + // Perform activation + const float32x4x4_t tmp_dep = {{ + wrapper::vmul( + vin_deq.val[0], + wrapper::vmul( + const_inv_6_f32, + wrapper::vmin(const_6_f32, + wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[0], const_3_f32))))), + wrapper::vmul( + vin_deq.val[1], + wrapper::vmul( + const_inv_6_f32, + wrapper::vmin(const_6_f32, + wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[1], const_3_f32))))), + wrapper::vmul( + vin_deq.val[2], + wrapper::vmul( + const_inv_6_f32, + wrapper::vmin(const_6_f32, + wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[2], const_3_f32))))), + wrapper::vmul( + vin_deq.val[3], + wrapper::vmul( + const_inv_6_f32, + wrapper::vmin(const_6_f32, + wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[3], const_3_f32))))), + }}; + // Re-quantize to new output space + tmp = vquantize_signed(tmp_dep, qi_out); + } + else if (act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) + { + const auto vin_deq = vdequantize(vin, qi_in); #ifdef __aarch64__ - const uint32x4x4_t pos_mask = - { - { + const uint32x4x4_t pos_mask = {{ wrapper::vcgtz(vin_deq.val[0]), wrapper::vcgtz(vin_deq.val[1]), wrapper::vcgtz(vin_deq.val[2]), wrapper::vcgtz(vin_deq.val[3]), - } - }; + }}; #else // __aarch64__ - const uint32x4x4_t pos_mask = - { - { + const uint32x4x4_t pos_mask = {{ wrapper::vcgt(vin_deq.val[0], vconst_0_f32), wrapper::vcgt(vin_deq.val[1], vconst_0_f32), wrapper::vcgt(vin_deq.val[2], vconst_0_f32), wrapper::vcgt(vin_deq.val[3], vconst_0_f32), - } - }; + }}; #endif // __aarch64__ - const float32x4x4_t tmp_dep = - { - { + const float32x4x4_t tmp_dep = {{ wrapper::vbsl(pos_mask.val[0], vin_deq.val[0], wrapper::vmul(va_f32, vin_deq.val[0])), wrapper::vbsl(pos_mask.val[1], vin_deq.val[1], wrapper::vmul(va_f32, vin_deq.val[1])), wrapper::vbsl(pos_mask.val[2], vin_deq.val[2], wrapper::vmul(va_f32, vin_deq.val[2])), wrapper::vbsl(pos_mask.val[3], vin_deq.val[3], wrapper::vmul(va_f32, vin_deq.val[3])), - } - }; + }}; - tmp = vquantize_signed(tmp_dep, qi_out); - } - else - { - ARM_COMPUTE_ERROR("Unsupported activation function"); + tmp = vquantize_signed(tmp_dep, qi_out); + } + else + { + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + wrapper::vstore(output_ptr + x, tmp); } - wrapper::vstore(output_ptr + x, tmp); - } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - qasymm8_signed_t in = *(reinterpret_cast<const qasymm8_signed_t *>(input_ptr + x)); - qasymm8_signed_t tmp = 0; - if(act == ActivationLayerInfo::ActivationFunction::RELU) - { - tmp = std::max(const_0, in); - tmp = utility::clamp<int32_t, qasymm8_signed_t>(support::cpp11::lround(tmp * s + o)); - } - else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) - { - tmp = std::min(a, std::max(const_0, in)); - tmp = utility::clamp<int32_t, qasymm8_signed_t>(support::cpp11::lround(tmp * s + o)); - } - else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + // Compute left-over elements + for (; x < window_end_x; ++x) { - tmp = std::min(a, std::max(b, in)); - tmp = utility::clamp<int32_t, qasymm8_signed_t>(support::cpp11::lround(tmp * s + o)); - } + qasymm8_signed_t in = *(reinterpret_cast<const 
qasymm8_signed_t *>(input_ptr + x)); + qasymm8_signed_t tmp = 0; + if (act == ActivationLayerInfo::ActivationFunction::RELU) + { + tmp = std::max(const_0, in); + tmp = utility::clamp<int32_t, qasymm8_signed_t>(support::cpp11::lround(tmp * s + o)); + } + else if (act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) + { + tmp = std::min(a, std::max(const_0, in)); + tmp = utility::clamp<int32_t, qasymm8_signed_t>(support::cpp11::lround(tmp * s + o)); + } + else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + { + tmp = std::min(a, std::max(b, in)); + tmp = utility::clamp<int32_t, qasymm8_signed_t>(support::cpp11::lround(tmp * s + o)); + } #ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead. - else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) - { - float tmp_f = dequantize_qasymm8_signed(in, qi_in); - tmp_f = 1.f / (1.f + std::exp(-tmp_f)); - tmp = quantize_qasymm8_signed(tmp_f, qi_out); - } + else if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC) + { + float tmp_f = dequantize_qasymm8_signed(in, qi_in); + tmp_f = 1.f / (1.f + std::exp(-tmp_f)); + tmp = quantize_qasymm8_signed(tmp_f, qi_out); + } #endif // __aarch64__ - else if(act == ActivationLayerInfo::ActivationFunction::TANH) - { - float tmp_f = dequantize_qasymm8_signed(in, qi_in); - tmp_f = a_f32 * std::tanh(b_f32 * tmp_f); - tmp = quantize_qasymm8_signed(tmp_f, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) - { - float tmp_f = dequantize_qasymm8_signed(in, qi_in); - tmp_f = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f); - tmp = quantize_qasymm8_signed(tmp_f, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) - { - float tmp_f = dequantize_qasymm8_signed(in, qi_in); - tmp_f = tmp_f > 0 ? tmp_f : tmp_f * a_f32; - tmp = quantize_qasymm8_signed(tmp_f, qi_out); - } - else - { - ARM_COMPUTE_ERROR("Unsupported activation function"); + else if (act == ActivationLayerInfo::ActivationFunction::TANH) + { + float tmp_f = dequantize_qasymm8_signed(in, qi_in); + tmp_f = a_f32 * std::tanh(b_f32 * tmp_f); + tmp = quantize_qasymm8_signed(tmp_f, qi_out); + } + else if (act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) + { + float tmp_f = dequantize_qasymm8_signed(in, qi_in); + tmp_f = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f); + tmp = quantize_qasymm8_signed(tmp_f, qi_out); + } + else if (act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) + { + float tmp_f = dequantize_qasymm8_signed(in, qi_in); + tmp_f = tmp_f > 0 ? tmp_f : tmp_f * a_f32; + tmp = quantize_qasymm8_signed(tmp_f, qi_out); + } + else + { + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + *(output_ptr + x) = tmp; } - *(output_ptr + x) = tmp; - } - }, - input, output); + }, + input, output); } } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/activation/generic/neon/qsymm16.cpp b/src/cpu/kernels/activation/generic/neon/qsymm16.cpp index 2aea6cba3c..891646ea00 100644 --- a/src/cpu/kernels/activation/generic/neon/qsymm16.cpp +++ b/src/cpu/kernels/activation/generic/neon/qsymm16.cpp @@ -21,11 +21,12 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ +#include "arm_compute/core/experimental/Types.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/experimental/Types.h" #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/NEON/NEMath.h" #include "src/core/NEON/NESymm.h" #include "src/core/NEON/wrapper/wrapper.h" @@ -38,7 +39,10 @@ namespace arm_compute { namespace cpu { -void neon_qsymm16_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) +void neon_qsymm16_activation(const ITensor *src, + ITensor *dst, + const ActivationLayerInfo &act_info, + const Window &window) { constexpr int window_step_x = 8; const auto window_start_x = static_cast<int>(window.x().start()); @@ -59,103 +63,94 @@ void neon_qsymm16_activation(const ITensor *src, ITensor *dst, const ActivationL const float a_f32 = act_info.a(); const float b_f32 = act_info.b(); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast<const qsymm16_t *>(input.ptr()); - const auto output_ptr = reinterpret_cast<qsymm16_t *>(output.ptr()); + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast<const qsymm16_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<qsymm16_t *>(output.ptr()); - wrapper::traits::neon_bitvector_t<qsymm16_t, wrapper::traits::BitWidth::W128> tmp; - ARM_COMPUTE_UNUSED(tmp); + wrapper::traits::neon_bitvector_t<qsymm16_t, wrapper::traits::BitWidth::W128> tmp; + ARM_COMPUTE_UNUSED(tmp); - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto vin = wrapper::vloadq(input_ptr + x); - if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - // De-quantize - const auto vin_deq = vdequantize_int16(vin, qi_in.scale); - // Perform activation - const float32x4x2_t tmp_dep = + const auto vin = wrapper::vloadq(input_ptr + x); + if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC) { - { + // De-quantize + const auto vin_deq = vdequantize_int16(vin, qi_in.scale); + // Perform activation + const float32x4x2_t tmp_dep = {{ wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[0])))), wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[1])))), - } - }; - // Re-quantize to new output space - tmp = vquantize_int16(tmp_dep, qi_out.scale); - } - else if(act == ActivationLayerInfo::ActivationFunction::TANH) - { - // De-quantize - const auto vin_deq = vdequantize_int16(vin, qi_in.scale); - // Perform activation - const float32x4x2_t tmp_dep = + }}; + // Re-quantize to new output space + tmp = vquantize_int16(tmp_dep, qi_out.scale); + } + else if (act == ActivationLayerInfo::ActivationFunction::TANH) { - { + // De-quantize + const auto vin_deq = vdequantize_int16(vin, qi_in.scale); + // Perform activation + const float32x4x2_t tmp_dep = {{ wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))), wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))), - } - }; - // Re-quantize to new output space - tmp = vquantize_int16(tmp_dep, qi_out.scale); - } + }}; + // Re-quantize to new output space + tmp = vquantize_int16(tmp_dep, qi_out.scale); + } - else if(act 
== ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) - { - // De-quantize - const auto vin_deq = vdequantize_int16(vin, qi_in.scale); - // Perform activation - const float32x4x2_t tmp_dep = + else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) { - { - wrapper::vmin(va_f32, wrapper::vmax(vb_f32, vin_deq.val[0])), - wrapper::vmin(va_f32, wrapper::vmax(vb_f32, vin_deq.val[1])) - } - }; - // Re-quantize to new output space - tmp = vquantize_int16(tmp_dep, qi_out.scale); - } - else - { - ARM_COMPUTE_ERROR("Unsupported activation function"); + // De-quantize + const auto vin_deq = vdequantize_int16(vin, qi_in.scale); + // Perform activation + const float32x4x2_t tmp_dep = {{wrapper::vmin(va_f32, wrapper::vmax(vb_f32, vin_deq.val[0])), + wrapper::vmin(va_f32, wrapper::vmax(vb_f32, vin_deq.val[1]))}}; + // Re-quantize to new output space + tmp = vquantize_int16(tmp_dep, qi_out.scale); + } + else + { + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + wrapper::vstore(output_ptr + x, tmp); } - wrapper::vstore(output_ptr + x, tmp); - } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - qsymm16_t in = *(reinterpret_cast<const qsymm16_t *>(input_ptr + x)); - qsymm16_t tmp = 0; - if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) - { - float tmp_f = dequantize_qsymm16(in, qi_in.scale); - tmp_f = 1.f / (1.f + std::exp(-tmp_f)); - tmp = quantize_qsymm16(tmp_f, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::TANH) + // Compute left-over elements + for (; x < window_end_x; ++x) { - float tmp_f = dequantize_qsymm16(in, qi_in.scale); - tmp_f = a_f32 * std::tanh(b_f32 * tmp_f); - tmp = quantize_qsymm16(tmp_f, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) - { - float tmp_f = dequantize_qsymm16(in, qi_in.scale); - tmp_f = std::min<float>(a_f32, std::max<float>(b_f32, tmp_f)); - tmp = quantize_qsymm16(tmp_f, qi_out); - } - else - { - ARM_COMPUTE_ERROR("Unsupported activation function"); + qsymm16_t in = *(reinterpret_cast<const qsymm16_t *>(input_ptr + x)); + qsymm16_t tmp = 0; + if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC) + { + float tmp_f = dequantize_qsymm16(in, qi_in.scale); + tmp_f = 1.f / (1.f + std::exp(-tmp_f)); + tmp = quantize_qsymm16(tmp_f, qi_out); + } + else if (act == ActivationLayerInfo::ActivationFunction::TANH) + { + float tmp_f = dequantize_qsymm16(in, qi_in.scale); + tmp_f = a_f32 * std::tanh(b_f32 * tmp_f); + tmp = quantize_qsymm16(tmp_f, qi_out); + } + else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + { + float tmp_f = dequantize_qsymm16(in, qi_in.scale); + tmp_f = std::min<float>(a_f32, std::max<float>(b_f32, tmp_f)); + tmp = quantize_qsymm16(tmp_f, qi_out); + } + else + { + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + *(output_ptr + x) = tmp; } - *(output_ptr + x) = tmp; - } - }, - input, output); + }, + input, output); } } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/activation/generic/sve/fp16.cpp b/src/cpu/kernels/activation/generic/sve/fp16.cpp index 4757c60d8f..97399e01e0 100644 --- a/src/cpu/kernels/activation/generic/sve/fp16.cpp +++ b/src/cpu/kernels/activation/generic/sve/fp16.cpp @@ -29,11 +29,11 @@ #include "arm_compute/core/Window.h" #include "arm_compute/function_info/ActivationLayerInfo.h" -#include <cmath> -#include <cstddef> - #include "src/core/NEON/SVEMath.h" + #include <arm_sve.h> +#include <cmath> +#include <cstddef> namespace arm_compute { @@ -59,77 
+59,87 @@ void sve_fp16_activation(const ITensor *src, ITensor *dst, const ActivationLayer const auto va = svdup_n_f16(act_info.a()); const auto vb = svdup_n_f16(act_info.b()); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast<const float16_t *>(input.ptr()); - const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr()); + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast<const float16_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr()); - svfloat16_t tmp; + svfloat16_t tmp; - int x = window_start_x; - svbool_t pg = svwhilelt_b16(x, window_end_x); - do - { - const auto vin = svld1_f16(pg, input_ptr + x); - switch(act) + int x = window_start_x; + svbool_t pg = svwhilelt_b16(x, window_end_x); + do { - case ActivationLayerInfo::ActivationFunction::ABS: - tmp = svabs_f16_z(pg, vin); - break; - case ActivationLayerInfo::ActivationFunction::LINEAR: - tmp = svmla_f16_z(pg, vb, va, vin); - break; - case ActivationLayerInfo::ActivationFunction::LOGISTIC: - tmp = svinv_f16_z(pg, svadd_f16_z(pg, const_1, svexp_f16_z(pg, svneg_f16_z(pg, vin)))); - break; - case ActivationLayerInfo::ActivationFunction::RELU: - tmp = svmax_f16_z(pg, const_0, vin); - break; - case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: - tmp = svmin_f16_z(pg, va, svmax_f16_z(pg, const_0, vin)); - break; - case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: - tmp = svmin_f16_z(pg, va, svmax_f16_z(pg, vb, vin)); - break; - case ActivationLayerInfo::ActivationFunction::LEAKY_RELU: - tmp = svadd_f16_z(pg, svmul_f16_z(pg, svmin_f16_z(pg, vin, const_0), va), svmax_f16_z(pg, vin, const_0)); - break; - case ActivationLayerInfo::ActivationFunction::SOFT_RELU: - tmp = svlog_f16_z(pg, svadd_f16_z(pg, const_1, svexp_f16_z(pg, vin))); - break; - case ActivationLayerInfo::ActivationFunction::ELU: - tmp = svsel_f16(svcmpgt_f16(pg, vin, const_0), vin, svmul_f16_z(pg, va, svsub_f16_z(pg, svexp_f16_z(pg, vin), const_1))); - break; - case ActivationLayerInfo::ActivationFunction::SQRT: - tmp = svsqrt_f16_z(pg, vin); - break; - case ActivationLayerInfo::ActivationFunction::SQUARE: - tmp = svmul_f16_z(pg, vin, vin); - break; - case ActivationLayerInfo::ActivationFunction::TANH: - tmp = svmul_f16_z(pg, va, svtanh_f16_z(pg, svmul_f16_z(pg, vb, vin))); - break; - case ActivationLayerInfo::ActivationFunction::IDENTITY: - tmp = vin; - break; - case ActivationLayerInfo::ActivationFunction::HARD_SWISH: - tmp = svmul_f16_z(pg, vin, svmul_f16_z(pg, const_inv_6, svmin_f16_z(pg, const_6, svmax_f16_z(pg, const_0, svadd_f16_z(pg, vin, const_3))))); - break; - case ActivationLayerInfo::ActivationFunction::SWISH: - tmp = svmul_f16_z(pg, vin, svinv_f16_z(pg, svadd_f16_z(pg, const_1, svexp_f16_z(pg, svneg_f16_z(pg, svmul_f16_z(pg, va, vin)))))); - break; - default: - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - svst1_f16(pg, output_ptr + x, tmp); + const auto vin = svld1_f16(pg, input_ptr + x); + switch (act) + { + case ActivationLayerInfo::ActivationFunction::ABS: + tmp = svabs_f16_z(pg, vin); + break; + case ActivationLayerInfo::ActivationFunction::LINEAR: + tmp = svmla_f16_z(pg, vb, va, vin); + break; + case ActivationLayerInfo::ActivationFunction::LOGISTIC: + tmp = svinv_f16_z(pg, svadd_f16_z(pg, const_1, svexp_f16_z(pg, svneg_f16_z(pg, vin)))); + break; + case ActivationLayerInfo::ActivationFunction::RELU: + tmp = svmax_f16_z(pg, const_0, vin); + break; + case 
ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: + tmp = svmin_f16_z(pg, va, svmax_f16_z(pg, const_0, vin)); + break; + case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: + tmp = svmin_f16_z(pg, va, svmax_f16_z(pg, vb, vin)); + break; + case ActivationLayerInfo::ActivationFunction::LEAKY_RELU: + tmp = svadd_f16_z(pg, svmul_f16_z(pg, svmin_f16_z(pg, vin, const_0), va), + svmax_f16_z(pg, vin, const_0)); + break; + case ActivationLayerInfo::ActivationFunction::SOFT_RELU: + tmp = svlog_f16_z(pg, svadd_f16_z(pg, const_1, svexp_f16_z(pg, vin))); + break; + case ActivationLayerInfo::ActivationFunction::ELU: + tmp = svsel_f16(svcmpgt_f16(pg, vin, const_0), vin, + svmul_f16_z(pg, va, svsub_f16_z(pg, svexp_f16_z(pg, vin), const_1))); + break; + case ActivationLayerInfo::ActivationFunction::SQRT: + tmp = svsqrt_f16_z(pg, vin); + break; + case ActivationLayerInfo::ActivationFunction::SQUARE: + tmp = svmul_f16_z(pg, vin, vin); + break; + case ActivationLayerInfo::ActivationFunction::TANH: + tmp = svmul_f16_z(pg, va, svtanh_f16_z(pg, svmul_f16_z(pg, vb, vin))); + break; + case ActivationLayerInfo::ActivationFunction::IDENTITY: + tmp = vin; + break; + case ActivationLayerInfo::ActivationFunction::HARD_SWISH: + tmp = svmul_f16_z( + pg, vin, + svmul_f16_z( + pg, const_inv_6, + svmin_f16_z(pg, const_6, svmax_f16_z(pg, const_0, svadd_f16_z(pg, vin, const_3))))); + break; + case ActivationLayerInfo::ActivationFunction::SWISH: + tmp = svmul_f16_z( + pg, vin, + svinv_f16_z(pg, svadd_f16_z(pg, const_1, + svexp_f16_z(pg, svneg_f16_z(pg, svmul_f16_z(pg, va, vin)))))); + break; + default: + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + svst1_f16(pg, output_ptr + x, tmp); - x += svcnth(); - pg = svwhilelt_b16(x, window_end_x); + x += svcnth(); + pg = svwhilelt_b16(x, window_end_x); - } - while(svptest_any(svptrue_b16(), pg)); - }, - input, output); + } while (svptest_any(svptrue_b16(), pg)); + }, + input, output); } } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/activation/generic/sve/fp32.cpp b/src/cpu/kernels/activation/generic/sve/fp32.cpp index 87f04c255a..d1b075d52c 100644 --- a/src/cpu/kernels/activation/generic/sve/fp32.cpp +++ b/src/cpu/kernels/activation/generic/sve/fp32.cpp @@ -26,13 +26,13 @@ #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Window.h" #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/NEON/SVEMath.h" +#include <arm_sve.h> #include <cmath> #include <cstddef> -#include <arm_sve.h> - namespace arm_compute { namespace cpu @@ -58,78 +58,89 @@ void sve_fp32_activation(const ITensor *src, ITensor *dst, const ActivationLayer const auto va = svdup_n_f32(act_info.a()); const auto vb = svdup_n_f32(act_info.b()); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast<const float *>(input.ptr()); - const auto output_ptr = reinterpret_cast<float *>(output.ptr()); + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast<const float *>(input.ptr()); + const auto output_ptr = reinterpret_cast<float *>(output.ptr()); - svfloat32_t tmp; + svfloat32_t tmp; - // Compute S elements per iteration - int x = window_start_x; - svbool_t pg = svwhilelt_b32(x, window_end_x); - do - { - const auto vin = svld1_f32(pg, input_ptr + x); - switch(act) + // Compute S elements per iteration + int x = window_start_x; + svbool_t pg = svwhilelt_b32(x, window_end_x); + do { - case 
ActivationLayerInfo::ActivationFunction::ABS: - tmp = svabs_f32_z(pg, vin); - break; - case ActivationLayerInfo::ActivationFunction::LINEAR: - tmp = svmla_f32_z(pg, vb, va, vin); - break; - case ActivationLayerInfo::ActivationFunction::LOGISTIC: - tmp = svinv_f32_z(pg, svadd_f32_z(pg, const_1, svexp_f32_z(pg, svneg_f32_z(pg, vin)))); - break; - case ActivationLayerInfo::ActivationFunction::RELU: - tmp = svmax_f32_z(pg, const_0, vin); - break; - case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: - tmp = svmin_f32_z(pg, va, svmax_f32_z(pg, const_0, vin)); - break; - case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: - tmp = svmin_f32_z(pg, va, svmax_f32_z(pg, vb, vin)); - break; - case ActivationLayerInfo::ActivationFunction::LEAKY_RELU: - tmp = svadd_f32_z(pg, svmul_f32_z(pg, svmin_f32_z(pg, vin, const_0), va), svmax_f32_z(pg, vin, const_0)); - break; - case ActivationLayerInfo::ActivationFunction::SOFT_RELU: - tmp = svsel_f32(svcmpgt_f32(pg, vin, soft_relu_thresh), vin, svlog_f32_z(pg, svadd_f32_z(pg, const_1, svexp_f32_z(pg, vin)))); - break; - case ActivationLayerInfo::ActivationFunction::ELU: - tmp = svsel_f32(svcmpgt_f32(pg, vin, const_0), vin, svmul_f32_z(pg, va, svsub_f32_z(pg, svexp_f32_z(pg, vin), const_1))); - break; - case ActivationLayerInfo::ActivationFunction::SQRT: - tmp = svsqrt_f32_z(pg, vin); - break; - case ActivationLayerInfo::ActivationFunction::SQUARE: - tmp = svmul_f32_z(pg, vin, vin); - break; - case ActivationLayerInfo::ActivationFunction::TANH: - tmp = svmul_f32_z(pg, va, svtanh_f32_z(pg, svmul_f32_z(pg, vb, vin))); - break; - case ActivationLayerInfo::ActivationFunction::IDENTITY: - tmp = vin; - break; - case ActivationLayerInfo::ActivationFunction::HARD_SWISH: - tmp = svmul_f32_z(pg, vin, svmul_f32_z(pg, const_inv_6, svmin_f32_z(pg, const_6, svmax_f32_z(pg, const_0, svadd_f32_z(pg, vin, const_3))))); - break; - case ActivationLayerInfo::ActivationFunction::SWISH: - tmp = svmul_f32_z(pg, vin, svinv_f32_z(pg, svadd_f32_z(pg, const_1, svexp_f32_z(pg, svneg_f32_z(pg, svmul_f32_z(pg, va, vin)))))); - break; - default: - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - svst1_f32(pg, output_ptr + x, tmp); + const auto vin = svld1_f32(pg, input_ptr + x); + switch (act) + { + case ActivationLayerInfo::ActivationFunction::ABS: + tmp = svabs_f32_z(pg, vin); + break; + case ActivationLayerInfo::ActivationFunction::LINEAR: + tmp = svmla_f32_z(pg, vb, va, vin); + break; + case ActivationLayerInfo::ActivationFunction::LOGISTIC: + tmp = svinv_f32_z(pg, svadd_f32_z(pg, const_1, svexp_f32_z(pg, svneg_f32_z(pg, vin)))); + break; + case ActivationLayerInfo::ActivationFunction::RELU: + tmp = svmax_f32_z(pg, const_0, vin); + break; + case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: + tmp = svmin_f32_z(pg, va, svmax_f32_z(pg, const_0, vin)); + break; + case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: + tmp = svmin_f32_z(pg, va, svmax_f32_z(pg, vb, vin)); + break; + case ActivationLayerInfo::ActivationFunction::LEAKY_RELU: + tmp = svadd_f32_z(pg, svmul_f32_z(pg, svmin_f32_z(pg, vin, const_0), va), + svmax_f32_z(pg, vin, const_0)); + break; + case ActivationLayerInfo::ActivationFunction::SOFT_RELU: + tmp = svsel_f32(svcmpgt_f32(pg, vin, soft_relu_thresh), vin, + svlog_f32_z(pg, svadd_f32_z(pg, const_1, svexp_f32_z(pg, vin)))); + break; + case ActivationLayerInfo::ActivationFunction::ELU: + tmp = svsel_f32(svcmpgt_f32(pg, vin, const_0), vin, + svmul_f32_z(pg, va, svsub_f32_z(pg, svexp_f32_z(pg, vin), const_1))); + break; + case 
ActivationLayerInfo::ActivationFunction::SQRT: + tmp = svsqrt_f32_z(pg, vin); + break; + case ActivationLayerInfo::ActivationFunction::SQUARE: + tmp = svmul_f32_z(pg, vin, vin); + break; + case ActivationLayerInfo::ActivationFunction::TANH: + tmp = svmul_f32_z(pg, va, svtanh_f32_z(pg, svmul_f32_z(pg, vb, vin))); + break; + case ActivationLayerInfo::ActivationFunction::IDENTITY: + tmp = vin; + break; + case ActivationLayerInfo::ActivationFunction::HARD_SWISH: + tmp = svmul_f32_z( + pg, vin, + svmul_f32_z( + pg, const_inv_6, + svmin_f32_z(pg, const_6, svmax_f32_z(pg, const_0, svadd_f32_z(pg, vin, const_3))))); + break; + case ActivationLayerInfo::ActivationFunction::SWISH: + tmp = svmul_f32_z( + pg, vin, + svinv_f32_z(pg, svadd_f32_z(pg, const_1, + svexp_f32_z(pg, svneg_f32_z(pg, svmul_f32_z(pg, va, vin)))))); + break; + default: + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + svst1_f32(pg, output_ptr + x, tmp); - x += svcntw(); - pg = svwhilelt_b32(x, window_end_x); + x += svcntw(); + pg = svwhilelt_b32(x, window_end_x); - } - while(svptest_any(svptrue_b32(), pg)); - }, - input, output); + } while (svptest_any(svptrue_b32(), pg)); + }, + input, output); } } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/activation/generic/sve2/lut.cpp b/src/cpu/kernels/activation/generic/sve2/lut.cpp index d65de8d649..2ed667debf 100644 --- a/src/cpu/kernels/activation/generic/sve2/lut.cpp +++ b/src/cpu/kernels/activation/generic/sve2/lut.cpp @@ -24,6 +24,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/cpu/kernels/lut/list.h" namespace arm_compute @@ -33,19 +34,22 @@ namespace cpu #ifdef __aarch64__ void sve2_q8_activation_lut(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) { - ARM_COMPUTE_ERROR_ON(src->info()->data_type() != DataType::QASYMM8 && src->info()->data_type() != DataType::QASYMM8_SIGNED); + ARM_COMPUTE_ERROR_ON(src->info()->data_type() != DataType::QASYMM8 && + src->info()->data_type() != DataType::QASYMM8_SIGNED); const auto window_end_x = window.x().end(); Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); Iterator input(src, win_collapsed); Iterator output(dst, win_collapsed); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = input.ptr(); - auto output_ptr = output.ptr(); - lut_u8_sve2(act_info.lut().data(), 1u, window_end_x, &input_ptr, &output_ptr); - }, - input, output); + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + const auto input_ptr = input.ptr(); + auto output_ptr = output.ptr(); + lut_u8_sve2(act_info.lut().data(), 1u, window_end_x, &input_ptr, &output_ptr); + }, + input, output); } #endif // __aarch64__ } // namespace cpu diff --git a/src/cpu/kernels/activation/generic/sve2/qasymm8.cpp b/src/cpu/kernels/activation/generic/sve2/qasymm8.cpp index bc9bc7aa3c..7efa9e4b72 100644 --- a/src/cpu/kernels/activation/generic/sve2/qasymm8.cpp +++ b/src/cpu/kernels/activation/generic/sve2/qasymm8.cpp @@ -26,18 +26,21 @@ #include "arm_compute/core/Window.h" #include "arm_compute/function_info/ActivationLayerInfo.h" -#include <cmath> -#include <cstddef> - #include "src/core/NEON/SVEAsymm.h" #include "src/core/NEON/SVEMath.h" + #include <arm_sve.h> +#include <cmath> +#include <cstddef> namespace arm_compute { namespace cpu { -void sve2_qasymm8_activation(const ITensor *src, ITensor *dst, const 
ActivationLayerInfo &act_info, const Window &window) +void sve2_qasymm8_activation(const ITensor *src, + ITensor *dst, + const ActivationLayerInfo &act_info, + const Window &window) { const auto window_start_x = static_cast<int>(window.x().start()); const auto window_end_x = static_cast<int>(window.x().end()); @@ -61,7 +64,7 @@ void sve2_qasymm8_activation(const ITensor *src, ITensor *dst, const ActivationL // Initialise scale/offset for re-quantization bool requant = true; - if(qi_in.scale == qi_out.scale && qi_in.offset == qi_out.offset) + if (qi_in.scale == qi_out.scale && qi_in.offset == qi_out.offset) { requant = false; } @@ -78,139 +81,160 @@ void sve2_qasymm8_activation(const ITensor *src, ITensor *dst, const ActivationL const auto vo_s32 = svdup_n_s32(o_s32); // Initialise scale/offset for re-quantization for leaky relu - int32_t s_leaky_s32 = round(s * act_info.a() * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN); - int32_t o_leaky_s32 = round((-qi_in.offset * s * act_info.a() + qi_out.offset) * (1 << 8), - arm_compute::RoundingPolicy::TO_NEAREST_EVEN); + int32_t s_leaky_s32 = round(s * act_info.a() * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN); + int32_t o_leaky_s32 = round((-qi_in.offset * s * act_info.a() + qi_out.offset) * (1 << 8), + arm_compute::RoundingPolicy::TO_NEAREST_EVEN); const auto vs_leaky_s32 = svdup_n_s32(s_leaky_s32); const auto vo_leaky_s32 = svdup_n_s32(o_leaky_s32); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); - const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr()); + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr()); - svuint8_t tmp; + svuint8_t tmp; - int x = window_start_x; - svbool_t pg = svwhilelt_b8(x, window_end_x); - do - { - const auto vin = svld1_u8(pg, input_ptr + x); - if(act == ActivationLayerInfo::ActivationFunction::RELU) - { - // Perform activation - tmp = svmax_u8_z(pg, vconst_0, vin); - // Re-quantize to new output space - tmp = requant ? svmla_qasymm8_z(pg, tmp, vs, vo) : tmp; - } - else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) - { - // Perform activation - tmp = svmin_u8_z(pg, va, svmax_u8_z(pg, vconst_0, vin)); - // Re-quantize to new output space - tmp = requant ? 
svmla_qasymm8_z(pg, tmp, vs, vo) : tmp; - } - else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) - { - // Perform activation - tmp = svmin_u8_z(pg, va, svmax_u8_z(pg, vb, vin)); - // Re-quantize to new output space - tmp = svmla_qasymm8_z(pg, tmp, vs, vo); - } - else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) - { - // De-quantize - const auto vin_deq = svdequantize_z(pg, vin, qi_in); - // Perform activation - const svfloat32x4_t tmp_dep = svcreate4_f32(svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 0))))), - svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 1))))), - svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 2))))), - svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 3)))))); - - // Re-quantize to new output space - tmp = svquantize_z(pg, tmp_dep, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::TANH) + int x = window_start_x; + svbool_t pg = svwhilelt_b8(x, window_end_x); + do { - // De-quantize - const auto vin_deq = svdequantize_z(pg, vin, qi_in); - // Perform activation - const svfloat32x4_t tmp_dep = svcreate4_f32(svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 0), vb_f32))), - svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 1), vb_f32))), - svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 2), vb_f32))), - svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 3), vb_f32)))); - - // Re-quantize to new output space - tmp = svquantize_z(pg, tmp_dep, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) - { - svbool_t p0, p1, p2, p3; - svint32x4_t tmp_dep; - - // Expand to int32 - const svint32x4_t vin_s32 = svcreate4_s32( - svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(vin))), - svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(vin))), - svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(vin))), - svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(vin)))); - - // Compare elements to input offset - if(qi_in.scale >= 0) + const auto vin = svld1_u8(pg, input_ptr + x); + if (act == ActivationLayerInfo::ActivationFunction::RELU) { - p0 = svcmplt_s32(pg, svget4_s32(vin_s32, 0), voffset_in); - p1 = svcmplt_s32(pg, svget4_s32(vin_s32, 1), voffset_in); - p2 = svcmplt_s32(pg, svget4_s32(vin_s32, 2), voffset_in); - p3 = svcmplt_s32(pg, svget4_s32(vin_s32, 3), voffset_in); + // Perform activation + tmp = svmax_u8_z(pg, vconst_0, vin); + // Re-quantize to new output space + tmp = requant ? svmla_qasymm8_z(pg, tmp, vs, vo) : tmp; } - else + else if (act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) { - p0 = svcmpgt_s32(pg, svget4_s32(vin_s32, 0), voffset_in); - p1 = svcmpgt_s32(pg, svget4_s32(vin_s32, 1), voffset_in); - p2 = svcmpgt_s32(pg, svget4_s32(vin_s32, 2), voffset_in); - p3 = svcmpgt_s32(pg, svget4_s32(vin_s32, 3), voffset_in); + // Perform activation + tmp = svmin_u8_z(pg, va, svmax_u8_z(pg, vconst_0, vin)); + // Re-quantize to new output space + tmp = requant ? 
svmla_qasymm8_z(pg, tmp, vs, vo) : tmp; } - - // Multiply negative elements and requantize if necessary - if(requant) + else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + { + // Perform activation + tmp = svmin_u8_z(pg, va, svmax_u8_z(pg, vb, vin)); + // Re-quantize to new output space + tmp = svmla_qasymm8_z(pg, tmp, vs, vo); + } + else if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC) + { + // De-quantize + const auto vin_deq = svdequantize_z(pg, vin, qi_in); + // Perform activation + const svfloat32x4_t tmp_dep = svcreate4_f32( + svdiv_f32_z( + pg, vconst_1, + svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 0))))), + svdiv_f32_z( + pg, vconst_1, + svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 1))))), + svdiv_f32_z( + pg, vconst_1, + svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 2))))), + svdiv_f32_z( + pg, vconst_1, + svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 3)))))); + + // Re-quantize to new output space + tmp = svquantize_z(pg, tmp_dep, qi_out); + } + else if (act == ActivationLayerInfo::ActivationFunction::TANH) { - tmp_dep = svcreate4_s32( - svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p0, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 0), svsel(p0, vs_leaky_s32, vs_s32)), 8), - svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p1, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 1), svsel(p1, vs_leaky_s32, vs_s32)), 8), - svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p2, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 2), svsel(p2, vs_leaky_s32, vs_s32)), 8), - svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p3, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 3), svsel(p3, vs_leaky_s32, vs_s32)), 8)); + // De-quantize + const auto vin_deq = svdequantize_z(pg, vin, qi_in); + // Perform activation + const svfloat32x4_t tmp_dep = svcreate4_f32( + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 0), vb_f32))), + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 1), vb_f32))), + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 2), vb_f32))), + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 3), vb_f32)))); + + // Re-quantize to new output space + tmp = svquantize_z(pg, tmp_dep, qi_out); + } + else if (act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) + { + svbool_t p0, p1, p2, p3; + svint32x4_t tmp_dep; + + // Expand to int32 + const svint32x4_t vin_s32 = svcreate4_s32(svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(vin))), + svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(vin))), + svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(vin))), + svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(vin)))); + + // Compare elements to input offset + if (qi_in.scale >= 0) + { + p0 = svcmplt_s32(pg, svget4_s32(vin_s32, 0), voffset_in); + p1 = svcmplt_s32(pg, svget4_s32(vin_s32, 1), voffset_in); + p2 = svcmplt_s32(pg, svget4_s32(vin_s32, 2), voffset_in); + p3 = svcmplt_s32(pg, svget4_s32(vin_s32, 3), voffset_in); + } + else + { + p0 = svcmpgt_s32(pg, svget4_s32(vin_s32, 0), voffset_in); + p1 = svcmpgt_s32(pg, svget4_s32(vin_s32, 1), voffset_in); + p2 = svcmpgt_s32(pg, svget4_s32(vin_s32, 2), voffset_in); + p3 = svcmpgt_s32(pg, svget4_s32(vin_s32, 3), voffset_in); + } + + // Multiply negative elements and requantize if necessary + if (requant) + { + tmp_dep = svcreate4_s32( + svasr_n_s32_m(pg, + svmla_s32_m(pg, svsel(p0, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 0), + 
svsel(p0, vs_leaky_s32, vs_s32)), + 8), + svasr_n_s32_m(pg, + svmla_s32_m(pg, svsel(p1, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 1), + svsel(p1, vs_leaky_s32, vs_s32)), + 8), + svasr_n_s32_m(pg, + svmla_s32_m(pg, svsel(p2, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 2), + svsel(p2, vs_leaky_s32, vs_s32)), + 8), + svasr_n_s32_m(pg, + svmla_s32_m(pg, svsel(p3, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 3), + svsel(p3, vs_leaky_s32, vs_s32)), + 8)); + } + else + { + tmp_dep = svcreate4_s32( + svasr_n_s32_m(p0, svmad_s32_m(p0, svget4_s32(vin_s32, 0), vs_leaky_s32, vo_leaky_s32), 8), + svasr_n_s32_m(p1, svmad_s32_m(p1, svget4_s32(vin_s32, 1), vs_leaky_s32, vo_leaky_s32), 8), + svasr_n_s32_m(p2, svmad_s32_m(p2, svget4_s32(vin_s32, 2), vs_leaky_s32, vo_leaky_s32), 8), + svasr_n_s32_m(p3, svmad_s32_m(p3, svget4_s32(vin_s32, 3), vs_leaky_s32, vo_leaky_s32), 8)); + } + + // Convert uint32 vectors to uint16 vectors (with saturation) + const auto v_low_u16 = svqxtunt_s32(svqxtunb_s32(svget4_s32(tmp_dep, 0)), svget4_s32(tmp_dep, 1)); + const auto v_high_u16 = svqxtunt_s32(svqxtunb_s32(svget4_s32(tmp_dep, 2)), svget4_s32(tmp_dep, 3)); + + // convert uint16 vectors to uint8 vectors (with saturation) + tmp = svqxtnt_u16(svqxtnb_u16(v_low_u16), v_high_u16); } else { - tmp_dep = svcreate4_s32( - svasr_n_s32_m(p0, svmad_s32_m(p0, svget4_s32(vin_s32, 0), vs_leaky_s32, vo_leaky_s32), 8), - svasr_n_s32_m(p1, svmad_s32_m(p1, svget4_s32(vin_s32, 1), vs_leaky_s32, vo_leaky_s32), 8), - svasr_n_s32_m(p2, svmad_s32_m(p2, svget4_s32(vin_s32, 2), vs_leaky_s32, vo_leaky_s32), 8), - svasr_n_s32_m(p3, svmad_s32_m(p3, svget4_s32(vin_s32, 3), vs_leaky_s32, vo_leaky_s32), 8)); + ARM_COMPUTE_ERROR("Unsupported activation function"); } - // Convert uint32 vectors to uint16 vectors (with saturation) - const auto v_low_u16 = svqxtunt_s32(svqxtunb_s32(svget4_s32(tmp_dep, 0)), svget4_s32(tmp_dep, 1)); - const auto v_high_u16 = svqxtunt_s32(svqxtunb_s32(svget4_s32(tmp_dep, 2)), svget4_s32(tmp_dep, 3)); - - // convert uint16 vectors to uint8 vectors (with saturation) - tmp = svqxtnt_u16(svqxtnb_u16(v_low_u16), v_high_u16); - } - else - { - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - - svst1_u8(pg, output_ptr + x, tmp); - - x += svcntb(); - pg = svwhilelt_b8(x, window_end_x); + svst1_u8(pg, output_ptr + x, tmp); - } - while(svptest_any(svptrue_b8(), pg)); + x += svcntb(); + pg = svwhilelt_b8(x, window_end_x); - }, - input, output); + } while (svptest_any(svptrue_b8(), pg)); + }, + input, output); } } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/activation/generic/sve2/qasymm8_signed.cpp b/src/cpu/kernels/activation/generic/sve2/qasymm8_signed.cpp index d20684f54d..e4667522dd 100644 --- a/src/cpu/kernels/activation/generic/sve2/qasymm8_signed.cpp +++ b/src/cpu/kernels/activation/generic/sve2/qasymm8_signed.cpp @@ -24,20 +24,23 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Window.h" #include "arm_compute/function_info/ActivationLayerInfo.h" -#include "src/core/NEON/wrapper/wrapper.h" - -#include <cmath> -#include <cstddef> #include "src/core/NEON/SVEAsymm.h" #include "src/core/NEON/SVEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" + #include <arm_sve.h> +#include <cmath> +#include <cstddef> namespace arm_compute { namespace cpu { -void sve2_qasymm8_signed_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) +void sve2_qasymm8_signed_activation(const ITensor *src, + ITensor *dst, + const ActivationLayerInfo 
&act_info, + const Window &window) { const auto window_start_x = static_cast<int>(window.x().start()); const auto window_end_x = static_cast<int>(window.x().end()); @@ -65,7 +68,7 @@ void sve2_qasymm8_signed_activation(const ITensor *src, ITensor *dst, const Acti // Initialise scale/offset for re-quantization bool requant = true; - if(qi_in.scale == qi_out.scale && qi_in.offset == qi_out.offset) + if (qi_in.scale == qi_out.scale && qi_in.offset == qi_out.offset) { requant = false; } @@ -82,151 +85,190 @@ void sve2_qasymm8_signed_activation(const ITensor *src, ITensor *dst, const Acti const auto vo_s32 = svdup_n_s32(o_s32); // Initialise scale/offset for re-quantization for leaky relu - int32_t s_leaky_s32 = round(s * act_info.a() * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN); - int32_t o_leaky_s32 = round((-qi_in.offset * s * act_info.a() + qi_out.offset) * (1 << 8), - arm_compute::RoundingPolicy::TO_NEAREST_EVEN); + int32_t s_leaky_s32 = round(s * act_info.a() * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN); + int32_t o_leaky_s32 = round((-qi_in.offset * s * act_info.a() + qi_out.offset) * (1 << 8), + arm_compute::RoundingPolicy::TO_NEAREST_EVEN); const auto vs_leaky_s32 = svdup_n_s32(s_leaky_s32); const auto vo_leaky_s32 = svdup_n_s32(o_leaky_s32); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast<const int8_t *>(input.ptr()); - const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr()); + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast<const int8_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr()); - svint8_t tmp; + svint8_t tmp; - int x = window_start_x; - svbool_t pg = svwhilelt_b8(x, window_end_x); - do - { - const auto vin = svld1_s8(pg, input_ptr + x); - if(act == ActivationLayerInfo::ActivationFunction::RELU) - { - // Perform activation - tmp = svmax_s8_z(pg, vconst_0, vin); - // Re-quantize to new output space - tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp; - } - else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) - { - // Perform activation - tmp = svmin_s8_z(pg, va, svmax_s8_z(pg, vconst_0, vin)); - // Re-quantize to new output space - tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp; - } - else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) - { - // Perform activation - tmp = svmin_s8_z(pg, va, svmax_s8_z(pg, vb, vin)); - // Re-quantize to new output space - tmp = requant ? 
svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp; - } - else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) - { - // De-quantize - const auto vin_deq = svdequantize_z(pg, vin, qi_in); - // Perform activation - const svfloat32x4_t tmp_dep = svcreate4_f32( - svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 0))))), - svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 1))))), - svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 2))))), - svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 3)))))); - // Re-quantize to new output space - tmp = svquantize_signed_z(pg, tmp_dep, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::TANH) - { - // De-quantize - const auto vin_deq = svdequantize_z(pg, vin, qi_in); - // Perform activation - const svfloat32x4_t tmp_dep = svcreate4_f32( - svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 0), vb_f32))), - svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 1), vb_f32))), - svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 2), vb_f32))), - svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 3), vb_f32)))); - // Re-quantize to new output space - tmp = svquantize_signed_z(pg, tmp_dep, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) - { - // De-quantize - const auto vin_deq = svdequantize_z(pg, vin, qi_in); - // Perform activation - const svfloat32x4_t tmp_dep = svcreate4_f32( - svmul_f32_z(pg, svget4_f32(vin_deq, 0), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 0), const_3_f32))))), - svmul_f32_z(pg, svget4_f32(vin_deq, 1), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 1), const_3_f32))))), - svmul_f32_z(pg, svget4_f32(vin_deq, 2), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 2), const_3_f32))))), - svmul_f32_z(pg, svget4_f32(vin_deq, 3), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 3), const_3_f32)))))); - // Re-quantize to new output space - tmp = svquantize_signed_z(pg, tmp_dep, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) + int x = window_start_x; + svbool_t pg = svwhilelt_b8(x, window_end_x); + do { - svbool_t p0, p1, p2, p3; - svint32x4_t tmp_dep; - - // Expand to int32 - const svint32x4_t vin_s32 = svcreate4_s32( - svmovlb_s32(svmovlb_s16(vin)), - svmovlt_s32(svmovlb_s16(vin)), - svmovlb_s32(svmovlt_s16(vin)), - svmovlt_s32(svmovlt_s16(vin))); - - // Compare elements to input offset - if(qi_in.scale >= 0) + const auto vin = svld1_s8(pg, input_ptr + x); + if (act == ActivationLayerInfo::ActivationFunction::RELU) { - p0 = svcmplt_s32(pg, svget4_s32(vin_s32, 0), voffset_in); - p1 = svcmplt_s32(pg, svget4_s32(vin_s32, 1), voffset_in); - p2 = svcmplt_s32(pg, svget4_s32(vin_s32, 2), voffset_in); - p3 = svcmplt_s32(pg, svget4_s32(vin_s32, 3), voffset_in); + // Perform activation + tmp = svmax_s8_z(pg, vconst_0, vin); + // Re-quantize to new output space + tmp = requant ? 
svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp; } - else + else if (act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) { - p0 = svcmpgt_s32(pg, svget4_s32(vin_s32, 0), voffset_in); - p1 = svcmpgt_s32(pg, svget4_s32(vin_s32, 1), voffset_in); - p2 = svcmpgt_s32(pg, svget4_s32(vin_s32, 2), voffset_in); - p3 = svcmpgt_s32(pg, svget4_s32(vin_s32, 3), voffset_in); + // Perform activation + tmp = svmin_s8_z(pg, va, svmax_s8_z(pg, vconst_0, vin)); + // Re-quantize to new output space + tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp; } - - // Multiply negative elements and requantize if necessary - if(requant) + else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) { - tmp_dep = svcreate4_s32( - svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p0, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 0), svsel(p0, vs_leaky_s32, vs_s32)), 8), - svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p1, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 1), svsel(p1, vs_leaky_s32, vs_s32)), 8), - svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p2, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 2), svsel(p2, vs_leaky_s32, vs_s32)), 8), - svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p3, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 3), svsel(p3, vs_leaky_s32, vs_s32)), 8)); + // Perform activation + tmp = svmin_s8_z(pg, va, svmax_s8_z(pg, vb, vin)); + // Re-quantize to new output space + tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp; } - else + else if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC) + { + // De-quantize + const auto vin_deq = svdequantize_z(pg, vin, qi_in); + // Perform activation + const svfloat32x4_t tmp_dep = svcreate4_f32( + svdiv_f32_z( + pg, vconst_1, + svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 0))))), + svdiv_f32_z( + pg, vconst_1, + svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 1))))), + svdiv_f32_z( + pg, vconst_1, + svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 2))))), + svdiv_f32_z( + pg, vconst_1, + svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 3)))))); + // Re-quantize to new output space + tmp = svquantize_signed_z(pg, tmp_dep, qi_out); + } + else if (act == ActivationLayerInfo::ActivationFunction::TANH) + { + // De-quantize + const auto vin_deq = svdequantize_z(pg, vin, qi_in); + // Perform activation + const svfloat32x4_t tmp_dep = svcreate4_f32( + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 0), vb_f32))), + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 1), vb_f32))), + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 2), vb_f32))), + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 3), vb_f32)))); + // Re-quantize to new output space + tmp = svquantize_signed_z(pg, tmp_dep, qi_out); + } + else if (act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) { - tmp_dep = svcreate4_s32( - svasr_n_s32_m(p0, svmad_s32_m(p0, svget4_s32(vin_s32, 0), vs_leaky_s32, vo_leaky_s32), 8), - svasr_n_s32_m(p1, svmad_s32_m(p1, svget4_s32(vin_s32, 1), vs_leaky_s32, vo_leaky_s32), 8), - svasr_n_s32_m(p2, svmad_s32_m(p2, svget4_s32(vin_s32, 2), vs_leaky_s32, vo_leaky_s32), 8), - svasr_n_s32_m(p3, svmad_s32_m(p3, svget4_s32(vin_s32, 3), vs_leaky_s32, vo_leaky_s32), 8)); + // De-quantize + const auto vin_deq = svdequantize_z(pg, vin, qi_in); + // Perform activation + const svfloat32x4_t tmp_dep = svcreate4_f32( + svmul_f32_z(pg, 
svget4_f32(vin_deq, 0), + svmul_f32_z(pg, const_inv_6_f32, + svmin_f32_z(pg, const_6_f32, + svmax_f32_z(pg, const_0_f32, + svadd_f32_z(pg, svget4_f32(vin_deq, 0), + const_3_f32))))), + svmul_f32_z(pg, svget4_f32(vin_deq, 1), + svmul_f32_z(pg, const_inv_6_f32, + svmin_f32_z(pg, const_6_f32, + svmax_f32_z(pg, const_0_f32, + svadd_f32_z(pg, svget4_f32(vin_deq, 1), + const_3_f32))))), + svmul_f32_z(pg, svget4_f32(vin_deq, 2), + svmul_f32_z(pg, const_inv_6_f32, + svmin_f32_z(pg, const_6_f32, + svmax_f32_z(pg, const_0_f32, + svadd_f32_z(pg, svget4_f32(vin_deq, 2), + const_3_f32))))), + svmul_f32_z(pg, svget4_f32(vin_deq, 3), + svmul_f32_z(pg, const_inv_6_f32, + svmin_f32_z(pg, const_6_f32, + svmax_f32_z(pg, const_0_f32, + svadd_f32_z(pg, svget4_f32(vin_deq, 3), + const_3_f32)))))); + // Re-quantize to new output space + tmp = svquantize_signed_z(pg, tmp_dep, qi_out); } + else if (act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) + { + svbool_t p0, p1, p2, p3; + svint32x4_t tmp_dep; - // Convert uint32 vectors to uint16 vectors (with saturation) - const auto v_low_s16 = svqxtnt_s32(svqxtnb_s32(svget4_s32(tmp_dep, 0)), svget4_s32(tmp_dep, 1)); - const auto v_high_s16 = svqxtnt_s32(svqxtnb_s32(svget4_s32(tmp_dep, 2)), svget4_s32(tmp_dep, 3)); + // Expand to int32 + const svint32x4_t vin_s32 = + svcreate4_s32(svmovlb_s32(svmovlb_s16(vin)), svmovlt_s32(svmovlb_s16(vin)), + svmovlb_s32(svmovlt_s16(vin)), svmovlt_s32(svmovlt_s16(vin))); - // convert uint16 vectors to uint8 vectors (with saturation) - tmp = svqxtnt_s16(svqxtnb_s16(v_low_s16), v_high_s16); - } - else - { - ARM_COMPUTE_ERROR("Unsupported activation function"); - } + // Compare elements to input offset + if (qi_in.scale >= 0) + { + p0 = svcmplt_s32(pg, svget4_s32(vin_s32, 0), voffset_in); + p1 = svcmplt_s32(pg, svget4_s32(vin_s32, 1), voffset_in); + p2 = svcmplt_s32(pg, svget4_s32(vin_s32, 2), voffset_in); + p3 = svcmplt_s32(pg, svget4_s32(vin_s32, 3), voffset_in); + } + else + { + p0 = svcmpgt_s32(pg, svget4_s32(vin_s32, 0), voffset_in); + p1 = svcmpgt_s32(pg, svget4_s32(vin_s32, 1), voffset_in); + p2 = svcmpgt_s32(pg, svget4_s32(vin_s32, 2), voffset_in); + p3 = svcmpgt_s32(pg, svget4_s32(vin_s32, 3), voffset_in); + } + + // Multiply negative elements and requantize if necessary + if (requant) + { + tmp_dep = svcreate4_s32( + svasr_n_s32_m(pg, + svmla_s32_m(pg, svsel(p0, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 0), + svsel(p0, vs_leaky_s32, vs_s32)), + 8), + svasr_n_s32_m(pg, + svmla_s32_m(pg, svsel(p1, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 1), + svsel(p1, vs_leaky_s32, vs_s32)), + 8), + svasr_n_s32_m(pg, + svmla_s32_m(pg, svsel(p2, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 2), + svsel(p2, vs_leaky_s32, vs_s32)), + 8), + svasr_n_s32_m(pg, + svmla_s32_m(pg, svsel(p3, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 3), + svsel(p3, vs_leaky_s32, vs_s32)), + 8)); + } + else + { + tmp_dep = svcreate4_s32( + svasr_n_s32_m(p0, svmad_s32_m(p0, svget4_s32(vin_s32, 0), vs_leaky_s32, vo_leaky_s32), 8), + svasr_n_s32_m(p1, svmad_s32_m(p1, svget4_s32(vin_s32, 1), vs_leaky_s32, vo_leaky_s32), 8), + svasr_n_s32_m(p2, svmad_s32_m(p2, svget4_s32(vin_s32, 2), vs_leaky_s32, vo_leaky_s32), 8), + svasr_n_s32_m(p3, svmad_s32_m(p3, svget4_s32(vin_s32, 3), vs_leaky_s32, vo_leaky_s32), 8)); + } + + // Convert uint32 vectors to uint16 vectors (with saturation) + const auto v_low_s16 = svqxtnt_s32(svqxtnb_s32(svget4_s32(tmp_dep, 0)), svget4_s32(tmp_dep, 1)); + const auto v_high_s16 = svqxtnt_s32(svqxtnb_s32(svget4_s32(tmp_dep, 2)), svget4_s32(tmp_dep, 
3)); + + // convert uint16 vectors to uint8 vectors (with saturation) + tmp = svqxtnt_s16(svqxtnb_s16(v_low_s16), v_high_s16); + } + else + { + ARM_COMPUTE_ERROR("Unsupported activation function"); + } - svst1_s8(pg, output_ptr + x, tmp); + svst1_s8(pg, output_ptr + x, tmp); - x += svcntb(); - pg = svwhilelt_b8(x, window_end_x); + x += svcntb(); + pg = svwhilelt_b8(x, window_end_x); - } - while(svptest_any(svptrue_b8(), pg)); - }, - input, output); + } while (svptest_any(svptrue_b8(), pg)); + }, + input, output); } } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/activation/generic/sve2/qsymm16.cpp b/src/cpu/kernels/activation/generic/sve2/qsymm16.cpp index 5154fac8a7..f955893307 100644 --- a/src/cpu/kernels/activation/generic/sve2/qsymm16.cpp +++ b/src/cpu/kernels/activation/generic/sve2/qsymm16.cpp @@ -21,24 +21,27 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ +#include "arm_compute/core/experimental/Types.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/experimental/Types.h" #include "arm_compute/function_info/ActivationLayerInfo.h" -#include <cmath> -#include <cstddef> - #include "src/core/NEON/SVEMath.h" #include "src/core/NEON/SVESymm.h" + #include <arm_sve.h> +#include <cmath> +#include <cstddef> namespace arm_compute { namespace cpu { -void sve2_qsymm16_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) +void sve2_qsymm16_activation(const ITensor *src, + ITensor *dst, + const ActivationLayerInfo &act_info, + const Window &window) { const auto window_start_x = static_cast<int>(window.x().start()); const auto window_end_x = static_cast<int>(window.x().end()); @@ -56,62 +59,70 @@ void sve2_qsymm16_activation(const ITensor *src, ITensor *dst, const ActivationL const auto va_f32 = svdup_n_f32(act_info.a()); const auto vb_f32 = svdup_n_f32(act_info.b()); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast<const int16_t *>(input.ptr()); - const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr()); + execute_window_loop( + win_collapsed, + [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast<const int16_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr()); - svint16_t tmp; + svint16_t tmp; - int x = window_start_x; - svbool_t pg = svwhilelt_b16(x, window_end_x); - do - { - const auto vin = svld1_s16(pg, input_ptr + x); - if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) - { - // De-quantize - auto vin_deq = svdequantize_qsymm16_z(pg, vin, qi_in.scale); - // Perform activation - const svfloat32x2_t tmp_dep = svcreate2_f32(svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget2_f32(vin_deq, 0))))), - svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget2_f32(vin_deq, 1)))))); - // Re-quantize to new output space - tmp = svquantize_qsymm16_z(pg, tmp_dep, qi_out.scale); - } - else if(act == ActivationLayerInfo::ActivationFunction::TANH) - { - // De-quantize - auto vin_deq = svdequantize_qsymm16_z(pg, vin, qi_in.scale); - // Perform activation - const svfloat32x2_t tmp_dep = svcreate2_f32(svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget2_f32(vin_deq, 0), vb_f32))), - svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget2_f32(vin_deq, 1), vb_f32)))); - 
// Re-quantize to new output space - tmp = svquantize_qsymm16_z(pg, tmp_dep, qi_out.scale); - } - else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) - { - // De-quantize - auto vin_deq = svdequantize_qsymm16_z(pg, vin, qi_in.scale); - // Perform activation - const svfloat32x2_t tmp_dep = svcreate2_f32(svmin_f32_z(pg, va_f32, svmax_f32_z(pg, vb_f32, svget2_f32(vin_deq, 0))), - svmin_f32_z(pg, va_f32, svmax_f32_z(pg, vb_f32, svget2_f32(vin_deq, 1)))); - // Re-quantize to new output space - tmp = svquantize_qsymm16_z(pg, tmp_dep, qi_out.scale); - } - else + int x = window_start_x; + svbool_t pg = svwhilelt_b16(x, window_end_x); + do { - ARM_COMPUTE_ERROR("Unsupported activation function"); - } + const auto vin = svld1_s16(pg, input_ptr + x); + if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC) + { + // De-quantize + auto vin_deq = svdequantize_qsymm16_z(pg, vin, qi_in.scale); + // Perform activation + const svfloat32x2_t tmp_dep = svcreate2_f32( + svdiv_f32_z( + pg, vconst_1, + svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget2_f32(vin_deq, 0))))), + svdiv_f32_z( + pg, vconst_1, + svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget2_f32(vin_deq, 1)))))); + // Re-quantize to new output space + tmp = svquantize_qsymm16_z(pg, tmp_dep, qi_out.scale); + } + else if (act == ActivationLayerInfo::ActivationFunction::TANH) + { + // De-quantize + auto vin_deq = svdequantize_qsymm16_z(pg, vin, qi_in.scale); + // Perform activation + const svfloat32x2_t tmp_dep = svcreate2_f32( + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget2_f32(vin_deq, 0), vb_f32))), + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget2_f32(vin_deq, 1), vb_f32)))); + // Re-quantize to new output space + tmp = svquantize_qsymm16_z(pg, tmp_dep, qi_out.scale); + } + else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + { + // De-quantize + auto vin_deq = svdequantize_qsymm16_z(pg, vin, qi_in.scale); + // Perform activation + const svfloat32x2_t tmp_dep = + svcreate2_f32(svmin_f32_z(pg, va_f32, svmax_f32_z(pg, vb_f32, svget2_f32(vin_deq, 0))), + svmin_f32_z(pg, va_f32, svmax_f32_z(pg, vb_f32, svget2_f32(vin_deq, 1)))); + // Re-quantize to new output space + tmp = svquantize_qsymm16_z(pg, tmp_dep, qi_out.scale); + } + else + { + ARM_COMPUTE_ERROR("Unsupported activation function"); + } - svst1_s16(pg, output_ptr + x, tmp); + svst1_s16(pg, output_ptr + x, tmp); - x += svcnth(); - pg = svwhilelt_b16(x, window_end_x); + x += svcnth(); + pg = svwhilelt_b16(x, window_end_x); - } - while(svptest_any(svptrue_b16(), pg)); - }, - input, output); + } while (svptest_any(svptrue_b16(), pg)); + }, + input, output); } } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/add/generic/neon/fp16.cpp b/src/cpu/kernels/add/generic/neon/fp16.cpp index fca7b2cd9f..e7679c14e3 100644 --- a/src/cpu/kernels/add/generic/neon/fp16.cpp +++ b/src/cpu/kernels/add/generic/neon/fp16.cpp @@ -30,10 +30,11 @@ namespace arm_compute { namespace cpu { -void add_fp16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void add_fp16_neon( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { return add_same_neon<float16_t>(src0, src1, dst, policy, window); } -} +} // namespace cpu } // namespace arm_compute #endif /* (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git 
a/src/cpu/kernels/add/generic/neon/fp32.cpp b/src/cpu/kernels/add/generic/neon/fp32.cpp index 1f599b1968..11a970bef4 100644 --- a/src/cpu/kernels/add/generic/neon/fp32.cpp +++ b/src/cpu/kernels/add/generic/neon/fp32.cpp @@ -28,9 +28,10 @@ namespace arm_compute { namespace cpu { -void add_fp32_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void add_fp32_neon( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { return add_same_neon<float>(src0, src1, dst, policy, window); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/add/generic/neon/impl.cpp b/src/cpu/kernels/add/generic/neon/impl.cpp index 2dde13544a..34938cc4c4 100644 --- a/src/cpu/kernels/add/generic/neon/impl.cpp +++ b/src/cpu/kernels/add/generic/neon/impl.cpp @@ -23,8 +23,10 @@ */ #include "src/cpu/kernels/add/generic/neon/impl.h" + #include "arm_compute/core/Helpers.h" #include "arm_compute/core/utils/misc/Traits.h" + #include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute { @@ -40,7 +42,10 @@ bool add_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo return add_sub_q8_neon_fixedpoint_possible(src0, src1, dst, true); } -bool add_sub_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, bool is_addition) +bool add_sub_q8_neon_fixedpoint_possible(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + bool is_addition) { const auto iq0 = src0->quantization_info().uniform(); const auto iq1 = src1->quantization_info().uniform(); @@ -49,7 +54,7 @@ bool add_sub_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorI const auto scale0 = iq0.scale / oq.scale; const auto scale1 = iq1.scale / oq.scale; - if(scale0 < -15.f || scale0 > 15.f || scale1 < -15.f || scale1 > 15.f) + if (scale0 < -15.f || scale0 > 15.f || scale1 < -15.f || scale1 > 15.f) { // The scale factor cannot be stored as 5.11 signed fixed-point number. return false; @@ -57,9 +62,10 @@ bool add_sub_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorI const auto offset = float(oq.offset) - scale0 * float(iq0.offset) - scale1 * float(iq1.offset); - const auto max_acc = is_addition ? ((std::abs(scale0) + std::abs(scale1)) * 256.f + std::abs(offset)) : ((std::abs(scale0) - std::abs(scale1)) * 256.f + std::abs(offset)); + const auto max_acc = is_addition ? ((std::abs(scale0) + std::abs(scale1)) * 256.f + std::abs(offset)) + : ((std::abs(scale0) - std::abs(scale1)) * 256.f + std::abs(offset)); - if(max_acc > 1048575.f) // 2^20 - 1 + if (max_acc > 1048575.f) // 2^20 - 1 { // It might not be possible to store the result as 21.11 signed fixed-point number. 
return false; @@ -69,13 +75,19 @@ bool add_sub_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorI } template <typename ScalarType> -void add_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void add_q8_neon_fixedpoint( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { add_sub_q8_neon_fixedpoint<ScalarType>(src0, src1, dst, policy, window, true /*is_addition*/); } template <typename ScalarType> -void add_sub_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_addition) +void add_sub_q8_neon_fixedpoint(const ITensor *src0, + const ITensor *src1, + ITensor *dst, + const ConvertPolicy &policy, + const Window &window, + bool is_addition) { ARM_COMPUTE_UNUSED(policy); @@ -103,7 +115,7 @@ void add_sub_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITenso const auto oq_info = dst->info()->quantization_info().uniform(); const auto in0_scale = iq0_info.scale / oq_info.scale; const auto in1_scale = is_addition ? (iq1_info.scale / oq_info.scale) : (-(iq1_info.scale / oq_info.scale)); - const auto offset = float(oq_info.offset) - in0_scale * float(iq0_info.offset) - in1_scale * float(iq1_info.offset); + const auto offset = float(oq_info.offset) - in0_scale * float(iq0_info.offset) - in1_scale * float(iq1_info.offset); constexpr float _2pow11 = 2048; const auto in0_scale_5p11 = static_cast<int16_t>(support::cpp11::lround(in0_scale * _2pow11)); @@ -112,7 +124,7 @@ void add_sub_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITenso constexpr uint8_t shift_amount_remainder = 3; - if(is_broadcast_across_x) + if (is_broadcast_across_x) { // Prefix: a = non-broadcast, b = broadcast. @@ -138,68 +150,75 @@ void add_sub_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITenso Iterator out_it(dst, win); execute_window_loop( - win, [&](const Coordinates &) - { - const auto a_ptr = reinterpret_cast<const ScalarType *>(a_input_it.ptr()); - const auto b_ptr = reinterpret_cast<const ScalarType *>(b_input_it.ptr()); - const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr()); - - const auto b_val = *b_ptr; - const auto b_scaled = b_scale * b_val; - const auto b_scaled_21p11 = static_cast<int32_t>(support::cpp11::lround(b_scaled * _2pow11)); - const auto b_scaled_offseted_21p11 = b_scaled_21p11 + offset_21p11; - const auto b_vscaled_offseted_21p11 = wrapper::vdup_n(b_scaled_offseted_21p11, wrapper::traits::vector_128_tag()); + win, + [&](const Coordinates &) + { + const auto a_ptr = reinterpret_cast<const ScalarType *>(a_input_it.ptr()); + const auto b_ptr = reinterpret_cast<const ScalarType *>(b_input_it.ptr()); + const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr()); + + const auto b_val = *b_ptr; + const auto b_scaled = b_scale * b_val; + const auto b_scaled_21p11 = static_cast<int32_t>(support::cpp11::lround(b_scaled * _2pow11)); + const auto b_scaled_offseted_21p11 = b_scaled_21p11 + offset_21p11; + const auto b_vscaled_offseted_21p11 = + wrapper::vdup_n(b_scaled_offseted_21p11, wrapper::traits::vector_128_tag()); #ifndef __aarch64__ - const auto b_scaled_offseted = b_scaled + offset; + const auto b_scaled_offseted = b_scaled + offset; #endif // __aarch64__ - int x = window_start_x; - - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - // Load the input. 
- const auto a_vin_8p0 = wrapper::vloadq(a_ptr + x); - - // Widen the non-broadcast elements to signed 16-bit regardless of the input signedness. - const auto a_vin_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(a_vin_8p0))); - const auto a_vin_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(a_vin_8p0))); - - // Multiply the non-broadcast elements by the scale factor, add the scaled broadcast elements and the offset. - // Widen and store the result in 32-bit integer. - const auto vout_21p11_00 = wrapper::vmlal(b_vscaled_offseted_21p11, wrapper::vgetlow(a_vin_16p0_0), a_vscale_5p11); - const auto vout_21p11_01 = wrapper::vmlal(b_vscaled_offseted_21p11, wrapper::vgethigh(a_vin_16p0_0), a_vscale_5p11); - const auto vout_21p11_10 = wrapper::vmlal(b_vscaled_offseted_21p11, wrapper::vgetlow(a_vin_16p0_1), a_vscale_5p11); - const auto vout_21p11_11 = wrapper::vmlal(b_vscaled_offseted_21p11, wrapper::vgethigh(a_vin_16p0_1), a_vscale_5p11); - - // Remove 3 bits of the fractional part, round, narrow to 16-bit and saturate the result. - const auto vout_8p8_0 = wrapper::vcombine( - wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_00), - wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_01)); - const auto vout_8p8_1 = wrapper::vcombine( - wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_10), - wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_11)); - - // Remove 8 bits of the fractional part, round, narrow to 8-bit and saturate the result. - const auto vout_8p0 = wrapper::vcombine( - wrapper::vqrshrn<8>(vout_8p8_0), - wrapper::vqrshrn<8>(vout_8p8_1)); - - // Store the result. - wrapper::vstore(out_ptr + x, vout_8p0); - } - - // Process the left-over elements. - for(; x < window_end_x; ++x) - { + int x = window_start_x; + + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + // Load the input. + const auto a_vin_8p0 = wrapper::vloadq(a_ptr + x); + + // Widen the non-broadcast elements to signed 16-bit regardless of the input signedness. + const auto a_vin_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(a_vin_8p0))); + const auto a_vin_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(a_vin_8p0))); + + // Multiply the non-broadcast elements by the scale factor, add the scaled broadcast elements and the offset. + // Widen and store the result in 32-bit integer. + const auto vout_21p11_00 = + wrapper::vmlal(b_vscaled_offseted_21p11, wrapper::vgetlow(a_vin_16p0_0), a_vscale_5p11); + const auto vout_21p11_01 = + wrapper::vmlal(b_vscaled_offseted_21p11, wrapper::vgethigh(a_vin_16p0_0), a_vscale_5p11); + const auto vout_21p11_10 = + wrapper::vmlal(b_vscaled_offseted_21p11, wrapper::vgetlow(a_vin_16p0_1), a_vscale_5p11); + const auto vout_21p11_11 = + wrapper::vmlal(b_vscaled_offseted_21p11, wrapper::vgethigh(a_vin_16p0_1), a_vscale_5p11); + + // Remove 3 bits of the fractional part, round, narrow to 16-bit and saturate the result. + const auto vout_8p8_0 = + wrapper::vcombine(wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_00), + wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_01)); + const auto vout_8p8_1 = + wrapper::vcombine(wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_10), + wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_11)); + + // Remove 8 bits of the fractional part, round, narrow to 8-bit and saturate the result. 
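Aside (not part of the patch): the per-element arithmetic in these loops is — widen the 8-bit input to 16 bits, multiply-accumulate against the 5.11 scale into a 21.11 accumulator (wrapper::vmlal), then drop the 11 fractional bits via a rounding 3-bit narrow followed by a rounding 8-bit narrow, saturating at each step. A scalar sketch of the same computation, using one 11-bit rounding shift instead of the two-stage saturating narrow, so it approximates rather than reproduces the exact kernel semantics; the helper name and input values are illustrative:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <iostream>

    // Scalar model of the 8.0 x 5.11 -> 21.11 multiply-accumulate used by the vector loop.
    int8_t add_fixedpoint_scalar(int8_t a, int8_t b, float scale0, float scale1, float offset)
    {
        constexpr float two_pow_11 = 2048.f;
        const auto scale0_5p11  = static_cast<int16_t>(std::lround(scale0 * two_pow_11));
        const auto scale1_5p11  = static_cast<int16_t>(std::lround(scale1 * two_pow_11));
        const auto offset_21p11 = static_cast<int32_t>(std::lround(offset * two_pow_11));

        // 8-bit value times 5.11 scale gives a 21.11 product; the offset is already in 21.11.
        const int32_t acc_21p11 = int32_t(a) * scale0_5p11 + int32_t(b) * scale1_5p11 + offset_21p11;

        // Drop the 11 fractional bits with rounding, then saturate to the signed 8-bit output range.
        const int32_t rounded = (acc_21p11 + (1 << 10)) >> 11;
        return static_cast<int8_t>(std::clamp(rounded, -128, 127));
    }

    int main()
    {
        // 100 * 0.5 + 50 * 0.25 + 2.0 = 64.5 -> 65 after rounding.
        std::cout << int(add_fixedpoint_scalar(100, 50, 0.5f, 0.25f, 2.f)) << '\n';
        return 0;
    }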
+ const auto vout_8p0 = + wrapper::vcombine(wrapper::vqrshrn<8>(vout_8p8_0), wrapper::vqrshrn<8>(vout_8p8_1)); + + // Store the result. + wrapper::vstore(out_ptr + x, vout_8p0); + } + + // Process the left-over elements. + for (; x < window_end_x; ++x) + { #ifdef __aarch64__ - out_ptr[x] = wrapper::vqrshrn<8>(wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(int32_t(a_ptr[x]) * a_scale_5p11 + b_scaled_offseted_21p11)); + out_ptr[x] = wrapper::vqrshrn<8>(wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>( + int32_t(a_ptr[x]) * a_scale_5p11 + b_scaled_offseted_21p11)); #else // __aarch64__ - out_ptr[x] = utility::clamp<int, ScalarType>(support::cpp11::lround(float(a_ptr[x]) * a_scale + b_scaled_offseted)); + out_ptr[x] = utility::clamp<int, ScalarType>( + support::cpp11::lround(float(a_ptr[x]) * a_scale + b_scaled_offseted)); #endif // __aarch64__ - } - }, - b_input_it, a_input_it, out_it); + } + }, + b_input_it, a_input_it, out_it); } else { @@ -216,70 +235,85 @@ void add_sub_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITenso Iterator out_it(dst, win); execute_window_loop( - win, [&](const Coordinates &) - { - const auto in0_ptr = reinterpret_cast<const ScalarType *>(in0_it.ptr()); - const auto in1_ptr = reinterpret_cast<const ScalarType *>(in1_it.ptr()); - const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr()); - - int x = window_start_x; - - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - // Load the inputs. - const auto vin0_8p0 = wrapper::vloadq(in0_ptr + x); - const auto vin1_8p0 = wrapper::vloadq(in1_ptr + x); - - // Widen the input elements to signed 16-bit regardless of the input signedness. - const auto vin0_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin0_8p0))); - const auto vin0_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin0_8p0))); - const auto vin1_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin1_8p0))); - const auto vin1_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin1_8p0))); - - // Multiply the input elements by the scale factor and add the offset. - // Widen and store the result in 32-bit integer. - const auto vscaled0_offseted_21p11_00 = wrapper::vmlal(voffset_21p11, wrapper::vgetlow(vin0_16p0_0), vscale0_5p11); - const auto vscaled0_offseted_21p11_01 = wrapper::vmlal(voffset_21p11, wrapper::vgethigh(vin0_16p0_0), vscale0_5p11); - const auto vscaled0_offseted_21p11_10 = wrapper::vmlal(voffset_21p11, wrapper::vgetlow(vin0_16p0_1), vscale0_5p11); - const auto vscaled0_offseted_21p11_11 = wrapper::vmlal(voffset_21p11, wrapper::vgethigh(vin0_16p0_1), vscale0_5p11); - - const auto vout_21p11_00 = wrapper::vmlal(vscaled0_offseted_21p11_00, wrapper::vgetlow(vin1_16p0_0), vscale1_5p11); - const auto vout_21p11_01 = wrapper::vmlal(vscaled0_offseted_21p11_01, wrapper::vgethigh(vin1_16p0_0), vscale1_5p11); - const auto vout_21p11_10 = wrapper::vmlal(vscaled0_offseted_21p11_10, wrapper::vgetlow(vin1_16p0_1), vscale1_5p11); - const auto vout_21p11_11 = wrapper::vmlal(vscaled0_offseted_21p11_11, wrapper::vgethigh(vin1_16p0_1), vscale1_5p11); - - // Remove 3 bits of the fractional part, round, narrow to 16-bit and saturate the result. 
- const auto vout_8p8_0 = wrapper::vcombine( - wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_00), - wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_01)); - const auto vout_8p8_1 = wrapper::vcombine( - wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_10), - wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_11)); - - // Remove 8 bits of the fractional part, round, narrow to 8-bit and saturate the result. - const auto vout_8p0 = wrapper::vcombine( - wrapper::vqrshrn<8>(vout_8p8_0), - wrapper::vqrshrn<8>(vout_8p8_1)); - - // Store the result. - wrapper::vstore(out_ptr + x, vout_8p0); - } - - // Process the left-over elements. - for(; x < window_end_x; ++x) + win, + [&](const Coordinates &) { + const auto in0_ptr = reinterpret_cast<const ScalarType *>(in0_it.ptr()); + const auto in1_ptr = reinterpret_cast<const ScalarType *>(in1_it.ptr()); + const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr()); + + int x = window_start_x; + + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + // Load the inputs. + const auto vin0_8p0 = wrapper::vloadq(in0_ptr + x); + const auto vin1_8p0 = wrapper::vloadq(in1_ptr + x); + + // Widen the input elements to signed 16-bit regardless of the input signedness. + const auto vin0_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin0_8p0))); + const auto vin0_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin0_8p0))); + const auto vin1_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin1_8p0))); + const auto vin1_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin1_8p0))); + + // Multiply the input elements by the scale factor and add the offset. + // Widen and store the result in 32-bit integer. + const auto vscaled0_offseted_21p11_00 = + wrapper::vmlal(voffset_21p11, wrapper::vgetlow(vin0_16p0_0), vscale0_5p11); + const auto vscaled0_offseted_21p11_01 = + wrapper::vmlal(voffset_21p11, wrapper::vgethigh(vin0_16p0_0), vscale0_5p11); + const auto vscaled0_offseted_21p11_10 = + wrapper::vmlal(voffset_21p11, wrapper::vgetlow(vin0_16p0_1), vscale0_5p11); + const auto vscaled0_offseted_21p11_11 = + wrapper::vmlal(voffset_21p11, wrapper::vgethigh(vin0_16p0_1), vscale0_5p11); + + const auto vout_21p11_00 = + wrapper::vmlal(vscaled0_offseted_21p11_00, wrapper::vgetlow(vin1_16p0_0), vscale1_5p11); + const auto vout_21p11_01 = + wrapper::vmlal(vscaled0_offseted_21p11_01, wrapper::vgethigh(vin1_16p0_0), vscale1_5p11); + const auto vout_21p11_10 = + wrapper::vmlal(vscaled0_offseted_21p11_10, wrapper::vgetlow(vin1_16p0_1), vscale1_5p11); + const auto vout_21p11_11 = + wrapper::vmlal(vscaled0_offseted_21p11_11, wrapper::vgethigh(vin1_16p0_1), vscale1_5p11); + + // Remove 3 bits of the fractional part, round, narrow to 16-bit and saturate the result. + const auto vout_8p8_0 = + wrapper::vcombine(wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_00), + wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_01)); + const auto vout_8p8_1 = + wrapper::vcombine(wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_10), + wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_11)); + + // Remove 8 bits of the fractional part, round, narrow to 8-bit and saturate the result. + const auto vout_8p0 = + wrapper::vcombine(wrapper::vqrshrn<8>(vout_8p8_0), wrapper::vqrshrn<8>(vout_8p8_1)); + + // Store the result. 
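Aside (not part of the patch): in the float-based qasymm8/qasymm8_signed paths that follow, the AArch64 build converts with round-to-nearest (vcvtnq_s32_f32 in the vector loop, support::cpp11::lround in the tail) while the 32-bit fallback rounds toward zero (vcvtq_s32_f32, support::cpp11::trunc), so the two builds can legitimately differ by one unit for the same input. A small illustration with the standard-library equivalents; the input value is made up:

    #include <cmath>
    #include <iostream>

    int main()
    {
        const float result = 12.7f; // illustrative dequantised value

        const long  rounded   = std::lround(result); // round-to-nearest path: 13
        const float truncated = std::trunc(result);  // round-toward-zero fallback: 12

        std::cout << rounded << ' ' << truncated << '\n';
        return 0;
    }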
+ wrapper::vstore(out_ptr + x, vout_8p0); + } + + // Process the left-over elements. + for (; x < window_end_x; ++x) + { #ifdef __aarch64__ - out_ptr[x] = wrapper::vqrshrn<8>(wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(int32_t(in0_ptr[x]) * in0_scale_5p11 + int32_t(in1_ptr[x]) * in1_scale_5p11 + offset_21p11)); + out_ptr[x] = wrapper::vqrshrn<8>(wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>( + int32_t(in0_ptr[x]) * in0_scale_5p11 + int32_t(in1_ptr[x]) * in1_scale_5p11 + offset_21p11)); #else // __aarch64__ - out_ptr[x] = utility::clamp<int, ScalarType>(support::cpp11::lround(float(in0_ptr[x]) * in0_scale + float(in1_ptr[x]) * in1_scale + offset)); + out_ptr[x] = utility::clamp<int, ScalarType>( + support::cpp11::lround(float(in0_ptr[x]) * in0_scale + float(in1_ptr[x]) * in1_scale + offset)); #endif // __aarch64__ - } - }, - in0_it, in1_it, out_it); + } + }, + in0_it, in1_it, out_it); } } -void add_sub_qasymm8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_addition) +void add_sub_qasymm8_neon(const ITensor *src0, + const ITensor *src1, + ITensor *dst, + const ConvertPolicy &policy, + const Window &window, + bool is_addition) { ARM_COMPUTE_UNUSED(policy); @@ -304,7 +338,7 @@ void add_sub_qasymm8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst const auto scale2 = is_addition ? (iq2_info.scale / oq_info.scale) : (-(iq2_info.scale / oq_info.scale)); const auto offset = float(oq_info.offset) - scale1 * float(iq1_info.offset) - scale2 * float(iq2_info.offset); - if(is_broadcast_across_x) + if (is_broadcast_across_x) { const bool is_broadcast_input_2 = input2_win.x().step() == 0; Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; @@ -324,63 +358,64 @@ void add_sub_qasymm8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst Iterator output(dst, win); execute_window_loop( - win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = non_broadcast_input.ptr(); - const auto output_ptr = output.ptr(); - - const auto broadcast_value = *broadcast_input.ptr(); - const auto bf = vdupq_n_f32(float(broadcast_value) * scale2 + offset); - const auto bfs = float(broadcast_value) * bf_scale + offset; - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + win, + [&](const Coordinates &) { - const uint8x16_t a = vld1q_u8(non_broadcast_input_ptr + x); + const auto non_broadcast_input_ptr = non_broadcast_input.ptr(); + const auto output_ptr = output.ptr(); + + const auto broadcast_value = *broadcast_input.ptr(); + const auto bf = vdupq_n_f32(float(broadcast_value) * scale2 + offset); + const auto bfs = float(broadcast_value) * bf_scale + offset; - const auto a_u16_0 = vmovl_u8(vget_low_u8(a)); - const auto a_u16_1 = vmovl_u8(vget_high_u8(a)); + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t a = vld1q_u8(non_broadcast_input_ptr + x); - const auto af_0 = vmlaq_f32(bf, vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_u16_0))), vscale1); - const auto af_1 = vmlaq_f32(bf, vcvtq_f32_u32(vmovl_u16(vget_high_u16(a_u16_0))), vscale1); - const auto af_2 = vmlaq_f32(bf, vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_u16_1))), vscale1); - const auto af_3 = vmlaq_f32(bf, vcvtq_f32_u32(vmovl_u16(vget_high_u16(a_u16_1))), vscale1); + const auto a_u16_0 = vmovl_u8(vget_low_u8(a)); + const auto a_u16_1 = 
vmovl_u8(vget_high_u8(a)); - int32x4_t rf_0{}; - int32x4_t rf_1{}; - int32x4_t rf_2{}; - int32x4_t rf_3{}; + const auto af_0 = vmlaq_f32(bf, vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_u16_0))), vscale1); + const auto af_1 = vmlaq_f32(bf, vcvtq_f32_u32(vmovl_u16(vget_high_u16(a_u16_0))), vscale1); + const auto af_2 = vmlaq_f32(bf, vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_u16_1))), vscale1); + const auto af_3 = vmlaq_f32(bf, vcvtq_f32_u32(vmovl_u16(vget_high_u16(a_u16_1))), vscale1); + + int32x4_t rf_0{}; + int32x4_t rf_1{}; + int32x4_t rf_2{}; + int32x4_t rf_3{}; #ifdef __aarch64__ - rf_0 = vcvtnq_s32_f32(af_0); - rf_1 = vcvtnq_s32_f32(af_1); - rf_2 = vcvtnq_s32_f32(af_2); - rf_3 = vcvtnq_s32_f32(af_3); + rf_0 = vcvtnq_s32_f32(af_0); + rf_1 = vcvtnq_s32_f32(af_1); + rf_2 = vcvtnq_s32_f32(af_2); + rf_3 = vcvtnq_s32_f32(af_3); #else //__aarch64__ - rf_0 = vcvtq_s32_f32(af_0); - rf_1 = vcvtq_s32_f32(af_1); - rf_2 = vcvtq_s32_f32(af_2); - rf_3 = vcvtq_s32_f32(af_3); + rf_0 = vcvtq_s32_f32(af_0); + rf_1 = vcvtq_s32_f32(af_1); + rf_2 = vcvtq_s32_f32(af_2); + rf_3 = vcvtq_s32_f32(af_3); #endif //__aarch64__ - const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1))); - const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3))); - vst1q_u8(output_ptr + x, vcombine_u8(pa, pb)); - } + const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1))); + const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3))); + vst1q_u8(output_ptr + x, vcombine_u8(pa, pb)); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto result = float(non_broadcast_input_ptr[x]) * af_scale + bfs; + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto result = float(non_broadcast_input_ptr[x]) * af_scale + bfs; #ifdef __aarch64__ - output_ptr[x] = utility::clamp<int, uint8_t>(support::cpp11::lround(result)); + output_ptr[x] = utility::clamp<int, uint8_t>(support::cpp11::lround(result)); #else // __aarch64__ - output_ptr[x] = utility::clamp<int, uint8_t>(support::cpp11::trunc(result)); + output_ptr[x] = utility::clamp<int, uint8_t>(support::cpp11::trunc(result)); #endif // __aarch64__ - } - }, - broadcast_input, non_broadcast_input, output); + } + }, + broadcast_input, non_broadcast_input, output); } else { @@ -397,72 +432,78 @@ void add_sub_qasymm8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst const auto voffset = vdupq_n_f32(offset); execute_window_loop( - win, [&](const Coordinates &) - { - const auto input1_ptr = input1.ptr(); - const auto input2_ptr = input2.ptr(); - const auto output_ptr = output.ptr(); - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + win, + [&](const Coordinates &) { - const uint8x16_t a = vld1q_u8(input1_ptr + x); - const uint8x16_t b = vld1q_u8(input2_ptr + x); - - const auto a_u16_0 = vmovl_u8(vget_low_u8(a)); - const auto a_u16_1 = vmovl_u8(vget_high_u8(a)); - const auto b_u16_0 = vmovl_u8(vget_low_u8(b)); - const auto b_u16_1 = vmovl_u8(vget_high_u8(b)); - - const auto af_0 = vmlaq_f32(voffset, vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_u16_0))), vscale1); - const auto af_1 = vmlaq_f32(voffset, vcvtq_f32_u32(vmovl_u16(vget_high_u16(a_u16_0))), vscale1); - const auto af_2 = vmlaq_f32(voffset, vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_u16_1))), vscale1); - const auto af_3 = vmlaq_f32(voffset, vcvtq_f32_u32(vmovl_u16(vget_high_u16(a_u16_1))), vscale1); - - const auto bf_0 = 
vmlaq_f32(af_0, vcvtq_f32_u32(vmovl_u16(vget_low_u16(b_u16_0))), vscale2); - const auto bf_1 = vmlaq_f32(af_1, vcvtq_f32_u32(vmovl_u16(vget_high_u16(b_u16_0))), vscale2); - const auto bf_2 = vmlaq_f32(af_2, vcvtq_f32_u32(vmovl_u16(vget_low_u16(b_u16_1))), vscale2); - const auto bf_3 = vmlaq_f32(af_3, vcvtq_f32_u32(vmovl_u16(vget_high_u16(b_u16_1))), vscale2); - - int32x4_t rf_0{}; - int32x4_t rf_1{}; - int32x4_t rf_2{}; - int32x4_t rf_3{}; + const auto input1_ptr = input1.ptr(); + const auto input2_ptr = input2.ptr(); + const auto output_ptr = output.ptr(); + + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t a = vld1q_u8(input1_ptr + x); + const uint8x16_t b = vld1q_u8(input2_ptr + x); + + const auto a_u16_0 = vmovl_u8(vget_low_u8(a)); + const auto a_u16_1 = vmovl_u8(vget_high_u8(a)); + const auto b_u16_0 = vmovl_u8(vget_low_u8(b)); + const auto b_u16_1 = vmovl_u8(vget_high_u8(b)); + + const auto af_0 = vmlaq_f32(voffset, vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_u16_0))), vscale1); + const auto af_1 = vmlaq_f32(voffset, vcvtq_f32_u32(vmovl_u16(vget_high_u16(a_u16_0))), vscale1); + const auto af_2 = vmlaq_f32(voffset, vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_u16_1))), vscale1); + const auto af_3 = vmlaq_f32(voffset, vcvtq_f32_u32(vmovl_u16(vget_high_u16(a_u16_1))), vscale1); + + const auto bf_0 = vmlaq_f32(af_0, vcvtq_f32_u32(vmovl_u16(vget_low_u16(b_u16_0))), vscale2); + const auto bf_1 = vmlaq_f32(af_1, vcvtq_f32_u32(vmovl_u16(vget_high_u16(b_u16_0))), vscale2); + const auto bf_2 = vmlaq_f32(af_2, vcvtq_f32_u32(vmovl_u16(vget_low_u16(b_u16_1))), vscale2); + const auto bf_3 = vmlaq_f32(af_3, vcvtq_f32_u32(vmovl_u16(vget_high_u16(b_u16_1))), vscale2); + + int32x4_t rf_0{}; + int32x4_t rf_1{}; + int32x4_t rf_2{}; + int32x4_t rf_3{}; #ifdef __aarch64__ - rf_0 = vcvtnq_s32_f32(bf_0); - rf_1 = vcvtnq_s32_f32(bf_1); - rf_2 = vcvtnq_s32_f32(bf_2); - rf_3 = vcvtnq_s32_f32(bf_3); + rf_0 = vcvtnq_s32_f32(bf_0); + rf_1 = vcvtnq_s32_f32(bf_1); + rf_2 = vcvtnq_s32_f32(bf_2); + rf_3 = vcvtnq_s32_f32(bf_3); #else //__aarch64__ - rf_0 = vcvtq_s32_f32(bf_0); - rf_1 = vcvtq_s32_f32(bf_1); - rf_2 = vcvtq_s32_f32(bf_2); - rf_3 = vcvtq_s32_f32(bf_3); + rf_0 = vcvtq_s32_f32(bf_0); + rf_1 = vcvtq_s32_f32(bf_1); + rf_2 = vcvtq_s32_f32(bf_2); + rf_3 = vcvtq_s32_f32(bf_3); #endif //__aarch64__ - const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1))); - const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3))); - vst1q_u8(output_ptr + x, vcombine_u8(pa, pb)); - } + const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1))); + const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3))); + vst1q_u8(output_ptr + x, vcombine_u8(pa, pb)); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto result = float(input1_ptr[x]) * scale1 + float(input2_ptr[x]) * scale2 + offset; + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto result = float(input1_ptr[x]) * scale1 + float(input2_ptr[x]) * scale2 + offset; #ifdef __aarch64__ - output_ptr[x] = utility::clamp<int, uint8_t>(support::cpp11::lround(result)); + output_ptr[x] = utility::clamp<int, uint8_t>(support::cpp11::lround(result)); #else // __aarch64__ - output_ptr[x] = utility::clamp<int, uint8_t>(support::cpp11::trunc(result)); + output_ptr[x] = utility::clamp<int, uint8_t>(support::cpp11::trunc(result)); #endif // 
__aarch64__ - } - }, - input1, input2, output); + } + }, + input1, input2, output); } } -void add_sub_qasymm8_signed_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_addition) +void add_sub_qasymm8_signed_neon(const ITensor *src0, + const ITensor *src1, + ITensor *dst, + const ConvertPolicy &policy, + const Window &window, + bool is_addition) { ARM_COMPUTE_UNUSED(policy); @@ -487,7 +528,7 @@ void add_sub_qasymm8_signed_neon(const ITensor *src0, const ITensor *src1, ITens const auto scale2 = is_addition ? (iq2_info.scale / oq_info.scale) : (-(iq2_info.scale / oq_info.scale)); const auto offset = float(oq_info.offset) - scale1 * float(iq1_info.offset) - scale2 * float(iq2_info.offset); - if(is_broadcast_across_x) + if (is_broadcast_across_x) { const bool is_broadcast_input_2 = input2_win.x().step() == 0; Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; @@ -507,63 +548,64 @@ void add_sub_qasymm8_signed_neon(const ITensor *src0, const ITensor *src1, ITens Iterator output(dst, win); execute_window_loop( - win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr()); - - const auto broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr()); - const auto bf = vdupq_n_f32(float(broadcast_value) * scale2 + offset); - const auto bfs = float(broadcast_value) * bf_scale + offset; - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + win, + [&](const Coordinates &) { - const int8x16_t a = vld1q_s8(non_broadcast_input_ptr + x); + const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr()); - const auto a_s16_0 = vmovl_s8(vget_low_s8(a)); - const auto a_s16_1 = vmovl_s8(vget_high_s8(a)); + const auto broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr()); + const auto bf = vdupq_n_f32(float(broadcast_value) * scale2 + offset); + const auto bfs = float(broadcast_value) * bf_scale + offset; - const auto af_0 = vmlaq_f32(bf, vcvtq_f32_s32(vmovl_s16(vget_low_s16(a_s16_0))), vscale1); - const auto af_1 = vmlaq_f32(bf, vcvtq_f32_s32(vmovl_s16(vget_high_s16(a_s16_0))), vscale1); - const auto af_2 = vmlaq_f32(bf, vcvtq_f32_s32(vmovl_s16(vget_low_s16(a_s16_1))), vscale1); - const auto af_3 = vmlaq_f32(bf, vcvtq_f32_s32(vmovl_s16(vget_high_s16(a_s16_1))), vscale1); + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int8x16_t a = vld1q_s8(non_broadcast_input_ptr + x); - int32x4_t rf_0{}; - int32x4_t rf_1{}; - int32x4_t rf_2{}; - int32x4_t rf_3{}; + const auto a_s16_0 = vmovl_s8(vget_low_s8(a)); + const auto a_s16_1 = vmovl_s8(vget_high_s8(a)); + + const auto af_0 = vmlaq_f32(bf, vcvtq_f32_s32(vmovl_s16(vget_low_s16(a_s16_0))), vscale1); + const auto af_1 = vmlaq_f32(bf, vcvtq_f32_s32(vmovl_s16(vget_high_s16(a_s16_0))), vscale1); + const auto af_2 = vmlaq_f32(bf, vcvtq_f32_s32(vmovl_s16(vget_low_s16(a_s16_1))), vscale1); + const auto af_3 = vmlaq_f32(bf, vcvtq_f32_s32(vmovl_s16(vget_high_s16(a_s16_1))), vscale1); + + int32x4_t rf_0{}; + int32x4_t rf_1{}; + int32x4_t rf_2{}; + int32x4_t rf_3{}; #ifdef __aarch64__ - rf_0 = vcvtnq_s32_f32(af_0); - rf_1 = vcvtnq_s32_f32(af_1); - rf_2 = 
vcvtnq_s32_f32(af_2); - rf_3 = vcvtnq_s32_f32(af_3); + rf_0 = vcvtnq_s32_f32(af_0); + rf_1 = vcvtnq_s32_f32(af_1); + rf_2 = vcvtnq_s32_f32(af_2); + rf_3 = vcvtnq_s32_f32(af_3); #else //__aarch64__ - rf_0 = vcvtq_s32_f32(af_0); - rf_1 = vcvtq_s32_f32(af_1); - rf_2 = vcvtq_s32_f32(af_2); - rf_3 = vcvtq_s32_f32(af_3); + rf_0 = vcvtq_s32_f32(af_0); + rf_1 = vcvtq_s32_f32(af_1); + rf_2 = vcvtq_s32_f32(af_2); + rf_3 = vcvtq_s32_f32(af_3); #endif //__aarch64__ - const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1))); - const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3))); - vst1q_s8(output_ptr + x, vcombine_s8(pa, pb)); - } + const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1))); + const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3))); + vst1q_s8(output_ptr + x, vcombine_s8(pa, pb)); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto result = float(non_broadcast_input_ptr[x]) * af_scale + bfs; + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto result = float(non_broadcast_input_ptr[x]) * af_scale + bfs; #ifdef __aarch64__ - output_ptr[x] = utility::clamp<int, int8_t>(support::cpp11::lround(result)); + output_ptr[x] = utility::clamp<int, int8_t>(support::cpp11::lround(result)); #else // __aarch64__ - output_ptr[x] = utility::clamp<int, int8_t>(support::cpp11::trunc(result)); + output_ptr[x] = utility::clamp<int, int8_t>(support::cpp11::trunc(result)); #endif // __aarch64__ - } - }, - broadcast_input, non_broadcast_input, output); + } + }, + broadcast_input, non_broadcast_input, output); } else { @@ -580,79 +622,102 @@ void add_sub_qasymm8_signed_neon(const ITensor *src0, const ITensor *src1, ITens const auto voffset = vdupq_n_f32(offset); execute_window_loop( - win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr()); - const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr()); - const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + win, + [&](const Coordinates &) { - const int8x16_t a = vld1q_s8(input1_ptr + x); - const int8x16_t b = vld1q_s8(input2_ptr + x); - - const auto a_s16_0 = vmovl_s8(vget_low_s8(a)); - const auto a_s16_1 = vmovl_s8(vget_high_s8(a)); - const auto b_s16_0 = vmovl_s8(vget_low_s8(b)); - const auto b_s16_1 = vmovl_s8(vget_high_s8(b)); - - const auto af_0 = vmlaq_f32(voffset, vcvtq_f32_s32(vmovl_s16(vget_low_s16(a_s16_0))), vscale1); - const auto af_1 = vmlaq_f32(voffset, vcvtq_f32_s32(vmovl_s16(vget_high_s16(a_s16_0))), vscale1); - const auto af_2 = vmlaq_f32(voffset, vcvtq_f32_s32(vmovl_s16(vget_low_s16(a_s16_1))), vscale1); - const auto af_3 = vmlaq_f32(voffset, vcvtq_f32_s32(vmovl_s16(vget_high_s16(a_s16_1))), vscale1); - - const auto bf_0 = vmlaq_f32(af_0, vcvtq_f32_s32(vmovl_s16(vget_low_s16(b_s16_0))), vscale2); - const auto bf_1 = vmlaq_f32(af_1, vcvtq_f32_s32(vmovl_s16(vget_high_s16(b_s16_0))), vscale2); - const auto bf_2 = vmlaq_f32(af_2, vcvtq_f32_s32(vmovl_s16(vget_low_s16(b_s16_1))), vscale2); - const auto bf_3 = vmlaq_f32(af_3, vcvtq_f32_s32(vmovl_s16(vget_high_s16(b_s16_1))), vscale2); - - int32x4_t rf_0{}; - int32x4_t rf_1{}; - int32x4_t rf_2{}; - int32x4_t rf_3{}; + const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const 
int8_t *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr()); + + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int8x16_t a = vld1q_s8(input1_ptr + x); + const int8x16_t b = vld1q_s8(input2_ptr + x); + + const auto a_s16_0 = vmovl_s8(vget_low_s8(a)); + const auto a_s16_1 = vmovl_s8(vget_high_s8(a)); + const auto b_s16_0 = vmovl_s8(vget_low_s8(b)); + const auto b_s16_1 = vmovl_s8(vget_high_s8(b)); + + const auto af_0 = vmlaq_f32(voffset, vcvtq_f32_s32(vmovl_s16(vget_low_s16(a_s16_0))), vscale1); + const auto af_1 = vmlaq_f32(voffset, vcvtq_f32_s32(vmovl_s16(vget_high_s16(a_s16_0))), vscale1); + const auto af_2 = vmlaq_f32(voffset, vcvtq_f32_s32(vmovl_s16(vget_low_s16(a_s16_1))), vscale1); + const auto af_3 = vmlaq_f32(voffset, vcvtq_f32_s32(vmovl_s16(vget_high_s16(a_s16_1))), vscale1); + + const auto bf_0 = vmlaq_f32(af_0, vcvtq_f32_s32(vmovl_s16(vget_low_s16(b_s16_0))), vscale2); + const auto bf_1 = vmlaq_f32(af_1, vcvtq_f32_s32(vmovl_s16(vget_high_s16(b_s16_0))), vscale2); + const auto bf_2 = vmlaq_f32(af_2, vcvtq_f32_s32(vmovl_s16(vget_low_s16(b_s16_1))), vscale2); + const auto bf_3 = vmlaq_f32(af_3, vcvtq_f32_s32(vmovl_s16(vget_high_s16(b_s16_1))), vscale2); + + int32x4_t rf_0{}; + int32x4_t rf_1{}; + int32x4_t rf_2{}; + int32x4_t rf_3{}; #ifdef __aarch64__ - rf_0 = vcvtnq_s32_f32(bf_0); - rf_1 = vcvtnq_s32_f32(bf_1); - rf_2 = vcvtnq_s32_f32(bf_2); - rf_3 = vcvtnq_s32_f32(bf_3); + rf_0 = vcvtnq_s32_f32(bf_0); + rf_1 = vcvtnq_s32_f32(bf_1); + rf_2 = vcvtnq_s32_f32(bf_2); + rf_3 = vcvtnq_s32_f32(bf_3); #else //__aarch64__ - rf_0 = vcvtq_s32_f32(bf_0); - rf_1 = vcvtq_s32_f32(bf_1); - rf_2 = vcvtq_s32_f32(bf_2); - rf_3 = vcvtq_s32_f32(bf_3); + rf_0 = vcvtq_s32_f32(bf_0); + rf_1 = vcvtq_s32_f32(bf_1); + rf_2 = vcvtq_s32_f32(bf_2); + rf_3 = vcvtq_s32_f32(bf_3); #endif //__aarch64__ - const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1))); - const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3))); - vst1q_s8(output_ptr + x, vcombine_s8(pa, pb)); - } + const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1))); + const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3))); + vst1q_s8(output_ptr + x, vcombine_s8(pa, pb)); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto result = float(input1_ptr[x]) * scale1 + float(input2_ptr[x]) * scale2 + offset; + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto result = float(input1_ptr[x]) * scale1 + float(input2_ptr[x]) * scale2 + offset; #ifdef __aarch64__ - output_ptr[x] = utility::clamp<int, int8_t>(support::cpp11::lround(result)); + output_ptr[x] = utility::clamp<int, int8_t>(support::cpp11::lround(result)); #else // __aarch64__ - output_ptr[x] = utility::clamp<int, int8_t>(support::cpp11::trunc(result)); + output_ptr[x] = utility::clamp<int, int8_t>(support::cpp11::trunc(result)); #endif // __aarch64__ - } - }, - input1, input2, output); + } + }, + input1, input2, output); } } -template void add_q8_neon_fixedpoint<int8_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); -template void add_q8_neon_fixedpoint<uint8_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); - -template void add_sub_q8_neon_fixedpoint<int8_t>(const ITensor *src0, const ITensor *src1, 
ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_addition); -template void add_sub_q8_neon_fixedpoint<uint8_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_addition); - -void add_sub_qasymm8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_addition); -void add_sub_qasymm8_signed_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_addition); +template void add_q8_neon_fixedpoint<int8_t>( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); +template void add_q8_neon_fixedpoint<uint8_t>( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); + +template void add_sub_q8_neon_fixedpoint<int8_t>(const ITensor *src0, + const ITensor *src1, + ITensor *dst, + const ConvertPolicy &policy, + const Window &window, + bool is_addition); +template void add_sub_q8_neon_fixedpoint<uint8_t>(const ITensor *src0, + const ITensor *src1, + ITensor *dst, + const ConvertPolicy &policy, + const Window &window, + bool is_addition); + +void add_sub_qasymm8_neon(const ITensor *src0, + const ITensor *src1, + ITensor *dst, + const ConvertPolicy &policy, + const Window &window, + bool is_addition); +void add_sub_qasymm8_signed_neon(const ITensor *src0, + const ITensor *src1, + ITensor *dst, + const ConvertPolicy &policy, + const Window &window, + bool is_addition); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/add/generic/neon/impl.h b/src/cpu/kernels/add/generic/neon/impl.h index fb786c5bc1..faa99baffe 100644 --- a/src/cpu/kernels/add/generic/neon/impl.h +++ b/src/cpu/kernels/add/generic/neon/impl.h @@ -26,8 +26,9 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/Traits.h" +#include "arm_compute/core/Window.h" + #include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute @@ -35,7 +36,8 @@ namespace arm_compute namespace cpu { template <typename ScalarType> -void add_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void add_same_neon( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { /** SIMD vector tag type. */ using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<ScalarType, wrapper::traits::BitWidth::W128>; @@ -53,7 +55,7 @@ void add_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const const auto window_end_x = static_cast<int>(window.x().end()); const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); - if(is_broadcast_across_x) + if (is_broadcast_across_x) { const bool is_broadcast_input_2 = input2_win.x().step() == 0; Window broadcast_win = is_broadcast_input_2 ? 
input2_win : input1_win; @@ -69,31 +71,36 @@ void add_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const Iterator output(dst, win); execute_window_loop( - win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast<const ScalarType *>(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr()); - - const ScalarType broadcast_value = *reinterpret_cast<const ScalarType *>(broadcast_input.ptr()); - const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{}); - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + win, + [&](const Coordinates &) { - const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x); - const auto res = (policy == ConvertPolicy::SATURATE) ? wrapper::vqadd(broadcast_value_vec, non_broadcast_v) : wrapper::vadd(broadcast_value_vec, non_broadcast_v); - wrapper::vstore(output_ptr + x, res); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto non_broadcast_v = *(non_broadcast_input_ptr + x); - *(output_ptr + x) = (policy == ConvertPolicy::SATURATE) ? wrapper::add_sat(broadcast_value, non_broadcast_v) : broadcast_value + non_broadcast_v; - } - }, - broadcast_input, non_broadcast_input, output); + const auto non_broadcast_input_ptr = reinterpret_cast<const ScalarType *>(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr()); + + const ScalarType broadcast_value = *reinterpret_cast<const ScalarType *>(broadcast_input.ptr()); + const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{}); + + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x); + const auto res = (policy == ConvertPolicy::SATURATE) + ? wrapper::vqadd(broadcast_value_vec, non_broadcast_v) + : wrapper::vadd(broadcast_value_vec, non_broadcast_v); + wrapper::vstore(output_ptr + x, res); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto non_broadcast_v = *(non_broadcast_input_ptr + x); + *(output_ptr + x) = (policy == ConvertPolicy::SATURATE) + ? wrapper::add_sat(broadcast_value, non_broadcast_v) + : broadcast_value + non_broadcast_v; + } + }, + broadcast_input, non_broadcast_input, output); } else { @@ -106,31 +113,34 @@ void add_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const Iterator output(dst, win); execute_window_loop( - win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr()); - const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr()); - const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto val1 = wrapper::vloadq(input1_ptr + x); - const auto val2 = wrapper::vloadq(input2_ptr + x); - const auto res = (policy == ConvertPolicy::SATURATE) ? wrapper::vqadd(val1, val2) : wrapper::vadd(val1, val2); - wrapper::vstore(output_ptr + x, res); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) + win, + [&](const Coordinates &) { - const auto val1 = *(input1_ptr + x); - const auto val2 = *(input2_ptr + x); - *(output_ptr + x) = (policy == ConvertPolicy::SATURATE) ? 
wrapper::add_sat(val1, val2) : val1 + val2; - } - }, - input1, input2, output); + const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr()); + + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto val1 = wrapper::vloadq(input1_ptr + x); + const auto val2 = wrapper::vloadq(input2_ptr + x); + const auto res = + (policy == ConvertPolicy::SATURATE) ? wrapper::vqadd(val1, val2) : wrapper::vadd(val1, val2); + wrapper::vstore(output_ptr + x, res); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto val1 = *(input1_ptr + x); + const auto val2 = *(input2_ptr + x); + *(output_ptr + x) = + (policy == ConvertPolicy::SATURATE) ? wrapper::add_sat(val1, val2) : val1 + val2; + } + }, + input1, input2, output); } } @@ -138,17 +148,36 @@ bool add_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo bool sub_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst); -bool add_sub_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, bool is_addition); - -void add_sub_qasymm8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_addition); - -void add_sub_qasymm8_signed_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_addition); +bool add_sub_q8_neon_fixedpoint_possible(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + bool is_addition); + +void add_sub_qasymm8_neon(const ITensor *src0, + const ITensor *src1, + ITensor *dst, + const ConvertPolicy &policy, + const Window &window, + bool is_addition); + +void add_sub_qasymm8_signed_neon(const ITensor *src0, + const ITensor *src1, + ITensor *dst, + const ConvertPolicy &policy, + const Window &window, + bool is_addition); template <typename ScalarType> -void add_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); +void add_q8_neon_fixedpoint( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); template <typename ScalarType> -void add_sub_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_addition); +void add_sub_q8_neon_fixedpoint(const ITensor *src0, + const ITensor *src1, + ITensor *dst, + const ConvertPolicy &policy, + const Window &window, + bool is_addition); } // namespace cpu } // namespace arm_compute #endif // SRC_CORE_NEON_KERNELS_ADD_IMPL_H diff --git a/src/cpu/kernels/add/generic/neon/integer.cpp b/src/cpu/kernels/add/generic/neon/integer.cpp index 5698d6d552..f0bcebc9d2 100644 --- a/src/cpu/kernels/add/generic/neon/integer.cpp +++ b/src/cpu/kernels/add/generic/neon/integer.cpp @@ -28,19 +28,22 @@ namespace arm_compute { namespace cpu { -void add_u8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void add_u8_neon( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { return add_same_neon<uint8_t>(src0, src1, dst, policy, window); } -void add_s16_neon(const 
ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void add_s16_neon( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { return add_same_neon<int16_t>(src0, src1, dst, policy, window); } -void add_s32_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void add_s32_neon( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { return add_same_neon<int32_t>(src0, src1, dst, policy, window); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/add/generic/neon/qasymm8.cpp b/src/cpu/kernels/add/generic/neon/qasymm8.cpp index 69cca956c8..8195d229d9 100644 --- a/src/cpu/kernels/add/generic/neon/qasymm8.cpp +++ b/src/cpu/kernels/add/generic/neon/qasymm8.cpp @@ -23,15 +23,17 @@ */ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" + #include "src/cpu/kernels/add/generic/neon/impl.h" namespace arm_compute { namespace cpu { -void add_qasymm8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void add_qasymm8_neon( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { add_sub_qasymm8_neon(src0, src1, dst, policy, window, true /*is_addition*/); } } // namespace cpu -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/kernels/add/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/add/generic/neon/qasymm8_signed.cpp index dfdf8fe85b..7e23096239 100644 --- a/src/cpu/kernels/add/generic/neon/qasymm8_signed.cpp +++ b/src/cpu/kernels/add/generic/neon/qasymm8_signed.cpp @@ -23,15 +23,17 @@ */ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" + #include "src/cpu/kernels/add/generic/neon/impl.h" namespace arm_compute { namespace cpu { -void add_qasymm8_signed_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void add_qasymm8_signed_neon( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { add_sub_qasymm8_signed_neon(src0, src1, dst, policy, window, true /*is_addition*/); } } // namespace cpu -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/kernels/add/generic/neon/qsymm16.cpp b/src/cpu/kernels/add/generic/neon/qsymm16.cpp index e76e408d6e..ac2de0557a 100644 --- a/src/cpu/kernels/add/generic/neon/qsymm16.cpp +++ b/src/cpu/kernels/add/generic/neon/qsymm16.cpp @@ -25,14 +25,16 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" + #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" namespace arm_compute { namespace cpu { -void add_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void add_qsymm16_neon( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { ARM_COMPUTE_UNUSED(policy); @@ -57,7 +59,7 @@ void add_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, co const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale); const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale); - if(is_broadcast_across_x) + if (is_broadcast_across_x) { const bool is_broadcast_input_2 = input2_win.x().step() == 0; Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; @@ -65,7 +67,7 @@ void add_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, co const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0; const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0; const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform(); - const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); + const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); // Clear X Dimension on execution window as we handle manually non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); @@ -74,48 +76,50 @@ void add_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, co Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); Iterator output(dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast<const int16_t *>(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr()); - - const int16_t broadcast_value = *reinterpret_cast<const int16_t *>(broadcast_input.ptr()); - const int16x8_t broadcast_value_vec = vdupq_n_s16(broadcast_value); - - const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(broadcast_value_vec))), vscale2); - const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(broadcast_value_vec))), vscale2); - const float bfs = static_cast<int32_t>(broadcast_value) * broadcast_qinfo.scale; - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const int16x8_t a = vld1q_s16(non_broadcast_input_ptr + x); - const auto af_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1); - const auto af_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1); - - int32x4_t rf_0{}; - int32x4_t rf_1{}; + const auto non_broadcast_input_ptr = reinterpret_cast<const int16_t *>(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast<int16_t 
*>(output.ptr()); + + const int16_t broadcast_value = *reinterpret_cast<const int16_t *>(broadcast_input.ptr()); + const int16x8_t broadcast_value_vec = vdupq_n_s16(broadcast_value); + + const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(broadcast_value_vec))), vscale2); + const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(broadcast_value_vec))), vscale2); + const float bfs = static_cast<int32_t>(broadcast_value) * broadcast_qinfo.scale; + + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int16x8_t a = vld1q_s16(non_broadcast_input_ptr + x); + const auto af_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1); + const auto af_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1); + + int32x4_t rf_0{}; + int32x4_t rf_1{}; #ifdef __aarch64__ - rf_0 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo)); - rf_1 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo)); + rf_0 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo)); + rf_1 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo)); #else //__aarch64__ - rf_0 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo)); - rf_1 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo)); + rf_0 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo)); + rf_1 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo)); #endif //__aarch64__ - const int16x8_t pa = vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)); - vst1q_s16(output_ptr + x, pa); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const float afs = static_cast<int32_t>(*(non_broadcast_input_ptr + x)) * non_broadcast_qinfo.scale; - *(output_ptr + x) = quantize_qsymm16((afs + bfs), oq_info); - } - }, - broadcast_input, non_broadcast_input, output); + const int16x8_t pa = vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)); + vst1q_s16(output_ptr + x, pa); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const float afs = static_cast<int32_t>(*(non_broadcast_input_ptr + x)) * non_broadcast_qinfo.scale; + *(output_ptr + x) = quantize_qsymm16((afs + bfs), oq_info); + } + }, + broadcast_input, non_broadcast_input, output); } else { @@ -127,48 +131,50 @@ void add_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, co Iterator input2(src1, input2_win); Iterator output(dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr()); - const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr()); - const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const int16x8_t a = vld1q_s16(input1_ptr + x); - const int16x8_t b = vld1q_s16(input2_ptr + x); - - const auto af_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1); - const auto af_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1); - const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(b))), vscale2); - const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(b))), vscale2); - - int32x4_t rf_0{}; - int32x4_t rf_1{}; + const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const int16_t 
*>(input2.ptr()); + const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr()); + + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int16x8_t a = vld1q_s16(input1_ptr + x); + const int16x8_t b = vld1q_s16(input2_ptr + x); + + const auto af_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1); + const auto af_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1); + const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(b))), vscale2); + const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(b))), vscale2); + + int32x4_t rf_0{}; + int32x4_t rf_1{}; #ifdef __aarch64__ - rf_0 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo)); - rf_1 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo)); + rf_0 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo)); + rf_1 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo)); #else //__aarch64__ - rf_0 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo)); - rf_1 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo)); + rf_0 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo)); + rf_1 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo)); #endif //__aarch64__ - const int16x8_t pa = vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)); - vst1q_s16(output_ptr + x, pa); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const float afs = static_cast<int32_t>((*(input1_ptr + x))) * iq1_info.scale; - const float bfs = static_cast<int32_t>((*(input2_ptr + x))) * iq2_info.scale; - *(output_ptr + x) = quantize_qsymm16((afs + bfs), dst->info()->quantization_info()); - } - }, - input1, input2, output); + const int16x8_t pa = vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)); + vst1q_s16(output_ptr + x, pa); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const float afs = static_cast<int32_t>((*(input1_ptr + x))) * iq1_info.scale; + const float bfs = static_cast<int32_t>((*(input2_ptr + x))) * iq2_info.scale; + *(output_ptr + x) = quantize_qsymm16((afs + bfs), dst->info()->quantization_info()); + } + }, + input1, input2, output); } } } // namespace cpu -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/kernels/add/generic/sve/fp16.cpp b/src/cpu/kernels/add/generic/sve/fp16.cpp index 581f3abded..01dfe6c44b 100644 --- a/src/cpu/kernels/add/generic/sve/fp16.cpp +++ b/src/cpu/kernels/add/generic/sve/fp16.cpp @@ -31,10 +31,11 @@ namespace arm_compute { namespace cpu { -void add_fp16_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void add_fp16_sve( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { return add_same_sve<float16_t>(src0, src1, dst, policy, window); } -} +} // namespace cpu } // namespace arm_compute #endif /* (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/add/generic/sve/fp32.cpp b/src/cpu/kernels/add/generic/sve/fp32.cpp index b37799113a..56771a5411 100644 --- a/src/cpu/kernels/add/generic/sve/fp32.cpp +++ b/src/cpu/kernels/add/generic/sve/fp32.cpp @@ -24,15 +24,17 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" + #include "src/cpu/kernels/add/generic/sve/impl.h" namespace arm_compute { namespace cpu { -void add_fp32_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void add_fp32_sve( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { return add_same_sve<float>(src0, src1, dst, policy, window); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/add/generic/sve/impl.cpp b/src/cpu/kernels/add/generic/sve/impl.cpp index e8606436fd..ca850fcef4 100644 --- a/src/cpu/kernels/add/generic/sve/impl.cpp +++ b/src/cpu/kernels/add/generic/sve/impl.cpp @@ -23,17 +23,21 @@ */ #include "src/cpu/kernels/add/generic/sve/impl.h" + #include "arm_compute/core/Helpers.h" #include "arm_compute/core/utils/misc/Traits.h" + #include "src/core/NEON/SVEMath.h" #include "src/core/NEON/wrapper/intrinsics/intrinsics.h" + #include <arm_sve.h> namespace arm_compute { namespace cpu { template <typename ScalarType> -void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void add_same_sve( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { const auto all_true_pg = wrapper::svptrue<ScalarType>(); const auto window_start_x = static_cast<int>(window.x().start()); @@ -53,7 +57,7 @@ void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const Iterator input2(src1, window.broadcast_if_dimension_le_one(src1->info()->tensor_shape())); Iterator output(dst, window); - if(is_broadcast_across_x) + if (is_broadcast_across_x) { const bool is_broadcast_input_2 = input2_win.x().step() == 0; Window broadcast_win = is_broadcast_input_2 ? 
input2_win : input1_win; @@ -68,28 +72,30 @@ void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); Iterator output(dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast<const ScalarType *>(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr()); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast<const ScalarType *>(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr()); - const ScalarType broadcast_value = *reinterpret_cast<const ScalarType *>(broadcast_input.ptr()); - const auto broadcast_value_vec = wrapper::svdup_n(broadcast_value); + const ScalarType broadcast_value = *reinterpret_cast<const ScalarType *>(broadcast_input.ptr()); + const auto broadcast_value_vec = wrapper::svdup_n(broadcast_value); - int x = window_start_x; - svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x); - do - { - const auto non_broadcast_v = svld1(pg, non_broadcast_input_ptr + x); - auto res = is_sat ? wrapper::svqadd(broadcast_value_vec, non_broadcast_v) : svadd_z(pg, broadcast_value_vec, non_broadcast_v); - svst1(pg, output_ptr + x, res); - - x += wrapper::svcnt<ScalarType>(); - pg = wrapper::svwhilelt<ScalarType>(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - broadcast_input, non_broadcast_input, output); + int x = window_start_x; + svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x); + do + { + const auto non_broadcast_v = svld1(pg, non_broadcast_input_ptr + x); + auto res = is_sat ? wrapper::svqadd(broadcast_value_vec, non_broadcast_v) + : svadd_z(pg, broadcast_value_vec, non_broadcast_v); + svst1(pg, output_ptr + x, res); + + x += wrapper::svcnt<ScalarType>(); + pg = wrapper::svwhilelt<ScalarType>(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); + }, + broadcast_input, non_broadcast_input, output); } else { @@ -101,35 +107,41 @@ void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const Iterator input2(src1, input2_win); Iterator output(dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr()); - const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr()); - const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr()); - - int x = window_start_x; - svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x); - do + execute_window_loop( + win, + [&](const Coordinates &) { - const auto val1 = svld1(pg, input1_ptr + x); - const auto val2 = svld1(pg, input2_ptr + x); - const auto res = is_sat ? wrapper::svqadd(val1, val2) : svadd_z(pg, val1, val2); - svst1(pg, output_ptr + x, res); - - x += wrapper::svcnt<ScalarType>(); - pg = wrapper::svwhilelt<ScalarType>(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - input1, input2, output); + const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr()); + + int x = window_start_x; + svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x); + do + { + const auto val1 = svld1(pg, input1_ptr + x); + const auto val2 = svld1(pg, input2_ptr + x); + const auto res = is_sat ? 
wrapper::svqadd(val1, val2) : svadd_z(pg, val1, val2); + svst1(pg, output_ptr + x, res); + + x += wrapper::svcnt<ScalarType>(); + pg = wrapper::svwhilelt<ScalarType>(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); + }, + input1, input2, output); } } -template void add_same_sve<float>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); -template void add_same_sve<uint8_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); -template void add_same_sve<int16_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); -template void add_same_sve<int32_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); +template void add_same_sve<float>( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); +template void add_same_sve<uint8_t>( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); +template void add_same_sve<int16_t>( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); +template void add_same_sve<int32_t>( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) -template void add_same_sve<float16_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); +template void add_same_sve<float16_t>( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); #endif /* (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/add/generic/sve/impl.h b/src/cpu/kernels/add/generic/sve/impl.h index 0136f14246..6a95d66826 100644 --- a/src/cpu/kernels/add/generic/sve/impl.h +++ b/src/cpu/kernels/add/generic/sve/impl.h @@ -33,7 +33,8 @@ namespace arm_compute namespace cpu { template <typename ScalarType> -void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); +void add_same_sve( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); } // namespace cpu } // namespace arm_compute #endif // SRC_CORE_SVE_KERNELS_ADD_IMPL_H diff --git a/src/cpu/kernels/add/generic/sve/integer.cpp b/src/cpu/kernels/add/generic/sve/integer.cpp index 3642dccd7b..4d17f2adbd 100644 --- a/src/cpu/kernels/add/generic/sve/integer.cpp +++ b/src/cpu/kernels/add/generic/sve/integer.cpp @@ -24,25 +24,29 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" + #include "src/cpu/kernels/add/generic/sve/impl.h" namespace arm_compute { namespace cpu { -void add_u8_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void add_u8_sve( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { return add_same_sve<uint8_t>(src0, src1, dst, policy, window); } -void add_s16_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void add_s16_sve( + const ITensor *src0, const ITensor *src1, ITensor *dst, const 
ConvertPolicy &policy, const Window &window) { return add_same_sve<int16_t>(src0, src1, dst, policy, window); } -void add_s32_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void add_s32_sve( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { return add_same_sve<int32_t>(src0, src1, dst, policy, window); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/add/generic/sve2/qasymm8.cpp b/src/cpu/kernels/add/generic/sve2/qasymm8.cpp index 1dec214aa0..40add9d51b 100644 --- a/src/cpu/kernels/add/generic/sve2/qasymm8.cpp +++ b/src/cpu/kernels/add/generic/sve2/qasymm8.cpp @@ -26,15 +26,18 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/Traits.h" + #include "src/core/NEON/SVEMath.h" #include "src/core/NEON/wrapper/intrinsics/intrinsics.h" + #include <arm_sve.h> namespace arm_compute { namespace cpu { -void add_qasymm8_sve2(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void add_qasymm8_sve2( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { ARM_COMPUTE_UNUSED(policy); @@ -58,7 +61,7 @@ void add_qasymm8_sve2(const ITensor *src0, const ITensor *src1, ITensor *dst, co const auto invvscaleo = svdup_n_f32(1.f / oq_info.scale); const auto voffseto = svdup_n_f32(oq_info.offset); - if(is_broadcast_across_x) + if (is_broadcast_across_x) { const bool is_broadcast_input_2 = input2_win.x().step() == 0; Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; @@ -78,48 +81,89 @@ void add_qasymm8_sve2(const ITensor *src0, const ITensor *src1, ITensor *dst, co Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); Iterator output(dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr()); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr()); - const uint8_t broadcast_value = *reinterpret_cast<const uint8_t *>(broadcast_input.ptr()); - const svuint8_t broadcast_value_vec = svdup_n_u8(broadcast_value); + const uint8_t broadcast_value = *reinterpret_cast<const uint8_t *>(broadcast_input.ptr()); + const svuint8_t broadcast_value_vec = svdup_n_u8(broadcast_value); - int x = window_start_x; - svbool_t pg = svwhilelt_b8(x, window_end_x); + int x = window_start_x; + svbool_t pg = svwhilelt_b8(x, window_end_x); - const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(broadcast_value_vec))), voffset2)), vscale2); - const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(broadcast_value_vec))), voffset2)), vscale2); - const auto bf_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(broadcast_value_vec))), voffset2)), vscale2); - const auto bf_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(broadcast_value_vec))), voffset2)), vscale2); + const auto bf_0 = svmul_f32_z( + pg, + 
svcvt_f32_s32_z( + pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(broadcast_value_vec))), + voffset2)), + vscale2); + const auto bf_1 = svmul_f32_z( + pg, + svcvt_f32_s32_z( + pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(broadcast_value_vec))), + voffset2)), + vscale2); + const auto bf_2 = svmul_f32_z( + pg, + svcvt_f32_s32_z( + pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(broadcast_value_vec))), + voffset2)), + vscale2); + const auto bf_3 = svmul_f32_z( + pg, + svcvt_f32_s32_z( + pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(broadcast_value_vec))), + voffset2)), + vscale2); - do - { - const svuint8_t a = svld1_u8(pg, non_broadcast_input_ptr + x); - - const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(a))), voffset1)), vscale1); - const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(a))), voffset1)), vscale1); - const auto af_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(a))), voffset1)), vscale1); - const auto af_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(a))), voffset1)), vscale1); - - const auto rf_0 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo)); - const auto rf_1 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo)); - const auto rf_2 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo)); - const auto rf_3 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo)); - - const auto pa = svqxtnt_u32(svqxtnb_u32(rf_0), rf_1); - const auto pb = svqxtnt_u32(svqxtnb_u32(rf_2), rf_3); - - const auto res = svqxtnt_u16(svqxtnb_u16(pa), pb); - svst1_u8(pg, output_ptr + x, res); - - x += svcntb(); - pg = svwhilelt_b8(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - broadcast_input, non_broadcast_input, output); + do + { + const svuint8_t a = svld1_u8(pg, non_broadcast_input_ptr + x); + + const auto af_0 = svmul_f32_z( + pg, + svcvt_f32_s32_z(pg, + svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(a))), voffset1)), + vscale1); + const auto af_1 = svmul_f32_z( + pg, + svcvt_f32_s32_z(pg, + svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(a))), voffset1)), + vscale1); + const auto af_2 = svmul_f32_z( + pg, + svcvt_f32_s32_z(pg, + svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(a))), voffset1)), + vscale1); + const auto af_3 = svmul_f32_z( + pg, + svcvt_f32_s32_z(pg, + svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(a))), voffset1)), + vscale1); + + const auto rf_0 = + svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo)); + const auto rf_1 = + svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo)); + const auto rf_2 = + svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo)); + const auto rf_3 = + svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo)); + + const auto pa = svqxtnt_u32(svqxtnb_u32(rf_0), rf_1); + const auto pb = svqxtnt_u32(svqxtnb_u32(rf_2), rf_3); + + const auto res = svqxtnt_u16(svqxtnb_u16(pa), pb); + svst1_u8(pg, output_ptr + x, res); + + x += svcntb(); + pg = svwhilelt_b8(x, window_end_x); + } while 
(svptest_any(all_true_pg, pg)); + }, + broadcast_input, non_broadcast_input, output); } else { @@ -136,45 +180,82 @@ void add_qasymm8_sve2(const ITensor *src0, const ITensor *src1, ITensor *dst, co const auto voffset1 = svdup_n_s32(iq1_info.offset); const auto voffset2 = svdup_n_s32(iq2_info.offset); - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr()); - const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr()); - const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr()); - - int x = window_start_x; - svbool_t pg = svwhilelt_b8(x, window_end_x); - do + execute_window_loop( + win, + [&](const Coordinates &) { - const auto a = svld1_u8(pg, input1_ptr + x); - const auto b = svld1_u8(pg, input2_ptr + x); - const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(a))), voffset1)), vscale1); - const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(a))), voffset1)), vscale1); - const auto af_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(a))), voffset1)), vscale1); - const auto af_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(a))), voffset1)), vscale1); - - const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(b))), voffset2)), vscale2); - const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(b))), voffset2)), vscale2); - const auto bf_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(b))), voffset2)), vscale2); - const auto bf_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(b))), voffset2)), vscale2); - - const auto rf_0 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo)); - const auto rf_1 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo)); - const auto rf_2 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo)); - const auto rf_3 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo)); - - const auto pa = svqxtnt_u32(svqxtnb_u32(rf_0), rf_1); - const auto pb = svqxtnt_u32(svqxtnb_u32(rf_2), rf_3); - const auto res = svqxtnt_u16(svqxtnb_u16(pa), pb); - - svst1_u8(pg, output_ptr + x, res); - - x += svcntb(); - pg = svwhilelt_b8(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - input1, input2, output); + const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr()); + + int x = window_start_x; + svbool_t pg = svwhilelt_b8(x, window_end_x); + do + { + const auto a = svld1_u8(pg, input1_ptr + x); + const auto b = svld1_u8(pg, input2_ptr + x); + const auto af_0 = svmul_f32_z( + pg, + svcvt_f32_s32_z(pg, + svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(a))), voffset1)), + vscale1); + const auto af_1 = svmul_f32_z( + pg, + svcvt_f32_s32_z(pg, + svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(a))), voffset1)), + vscale1); + const auto af_2 = svmul_f32_z( + pg, + svcvt_f32_s32_z(pg, + svsub_s32_z(pg, 
svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(a))), voffset1)), + vscale1); + const auto af_3 = svmul_f32_z( + pg, + svcvt_f32_s32_z(pg, + svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(a))), voffset1)), + vscale1); + + const auto bf_0 = svmul_f32_z( + pg, + svcvt_f32_s32_z(pg, + svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(b))), voffset2)), + vscale2); + const auto bf_1 = svmul_f32_z( + pg, + svcvt_f32_s32_z(pg, + svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(b))), voffset2)), + vscale2); + const auto bf_2 = svmul_f32_z( + pg, + svcvt_f32_s32_z(pg, + svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(b))), voffset2)), + vscale2); + const auto bf_3 = svmul_f32_z( + pg, + svcvt_f32_s32_z(pg, + svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(b))), voffset2)), + vscale2); + + const auto rf_0 = + svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo)); + const auto rf_1 = + svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo)); + const auto rf_2 = + svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo)); + const auto rf_3 = + svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo)); + + const auto pa = svqxtnt_u32(svqxtnb_u32(rf_0), rf_1); + const auto pb = svqxtnt_u32(svqxtnb_u32(rf_2), rf_3); + const auto res = svqxtnt_u16(svqxtnb_u16(pa), pb); + + svst1_u8(pg, output_ptr + x, res); + + x += svcntb(); + pg = svwhilelt_b8(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); + }, + input1, input2, output); } } } // namespace cpu diff --git a/src/cpu/kernels/add/generic/sve2/qasymm8_signed.cpp b/src/cpu/kernels/add/generic/sve2/qasymm8_signed.cpp index dae8899753..2e585115e1 100644 --- a/src/cpu/kernels/add/generic/sve2/qasymm8_signed.cpp +++ b/src/cpu/kernels/add/generic/sve2/qasymm8_signed.cpp @@ -26,15 +26,18 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/Traits.h" + #include "src/core/NEON/SVEMath.h" #include "src/core/NEON/wrapper/intrinsics/intrinsics.h" + #include <arm_sve.h> namespace arm_compute { namespace cpu { -void add_qasymm8_signed_sve2(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void add_qasymm8_signed_sve2( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { ARM_COMPUTE_UNUSED(policy); @@ -57,7 +60,7 @@ void add_qasymm8_signed_sve2(const ITensor *src0, const ITensor *src1, ITensor * const auto invvscaleo = svdup_n_f32(1.f / oq_info.scale); const auto voffseto = svdup_n_f32(oq_info.offset); - if(is_broadcast_across_x) + if (is_broadcast_across_x) { const bool is_broadcast_input_2 = input2_win.x().step() == 0; Window broadcast_win = is_broadcast_input_2 ? 
input2_win : input1_win; @@ -78,46 +81,63 @@ void add_qasymm8_signed_sve2(const ITensor *src0, const ITensor *src1, ITensor * Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); Iterator output(dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr()); - - const int8_t broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr()); - const auto broadcast_value_vec = svdup_n_s8(broadcast_value); - - int x = window_start_x; - svbool_t pg = svwhilelt_b8(x, window_end_x); - const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(broadcast_value_vec)), voffset2)), vscale2); - const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(broadcast_value_vec)), voffset2)), vscale2); - const auto bf_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(broadcast_value_vec)), voffset2)), vscale2); - const auto bf_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(broadcast_value_vec)), voffset2)), vscale2); - - do + execute_window_loop( + win, + [&](const Coordinates &) { - const auto a = svld1_s8(pg, non_broadcast_input_ptr + x); - const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(a)), voffset1)), vscale1); - const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(a)), voffset1)), vscale1); - const auto af_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(a)), voffset1)), vscale1); - const auto af_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(a)), voffset1)), vscale1); - - const auto rf_0 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo)); - const auto rf_1 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo)); - const auto rf_2 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo)); - const auto rf_3 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo)); - - const auto pa = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1); - const auto pb = svqxtnt_s32(svqxtnb_s32(rf_2), rf_3); - const auto res = svqxtnt_s16(svqxtnb_s16(pa), pb); - - svst1_s8(pg, output_ptr + x, res); - - x += svcntb(); - pg = svwhilelt_b8(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - broadcast_input, non_broadcast_input, output); + const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr()); + + const int8_t broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr()); + const auto broadcast_value_vec = svdup_n_s8(broadcast_value); + + int x = window_start_x; + svbool_t pg = svwhilelt_b8(x, window_end_x); + const auto bf_0 = svmul_f32_z( + pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(broadcast_value_vec)), voffset2)), + vscale2); + const auto bf_1 = svmul_f32_z( + pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(broadcast_value_vec)), voffset2)), + vscale2); + const auto bf_2 = svmul_f32_z( + pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(broadcast_value_vec)), voffset2)), + vscale2); + const auto bf_3 = svmul_f32_z( + pg, 
svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(broadcast_value_vec)), voffset2)), + vscale2); + + do + { + const auto a = svld1_s8(pg, non_broadcast_input_ptr + x); + const auto af_0 = svmul_f32_z( + pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(a)), voffset1)), vscale1); + const auto af_1 = svmul_f32_z( + pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(a)), voffset1)), vscale1); + const auto af_2 = svmul_f32_z( + pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(a)), voffset1)), vscale1); + const auto af_3 = svmul_f32_z( + pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(a)), voffset1)), vscale1); + + const auto rf_0 = + svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo)); + const auto rf_1 = + svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo)); + const auto rf_2 = + svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo)); + const auto rf_3 = + svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo)); + + const auto pa = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1); + const auto pb = svqxtnt_s32(svqxtnb_s32(rf_2), rf_3); + const auto res = svqxtnt_s16(svqxtnb_s16(pa), pb); + + svst1_s8(pg, output_ptr + x, res); + + x += svcntb(); + pg = svwhilelt_b8(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); + }, + broadcast_input, non_broadcast_input, output); } else { @@ -134,46 +154,59 @@ void add_qasymm8_signed_sve2(const ITensor *src0, const ITensor *src1, ITensor * const auto voffset1 = svdup_n_s32(iq1_info.offset); const auto voffset2 = svdup_n_s32(iq2_info.offset); - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr()); - const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr()); - const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr()); - - int x = window_start_x; - svbool_t pg = svwhilelt_b8(x, window_end_x); - do + execute_window_loop( + win, + [&](const Coordinates &) { - const auto a = svld1_s8(pg, input1_ptr + x); - const auto b = svld1_s8(pg, input2_ptr + x); - - const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(a)), voffset1)), vscale1); - const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(a)), voffset1)), vscale1); - const auto af_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(a)), voffset1)), vscale1); - const auto af_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(a)), voffset1)), vscale1); - - const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(b)), voffset2)), vscale2); - const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(b)), voffset2)), vscale2); - const auto bf_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(b)), voffset2)), vscale2); - const auto bf_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(b)), voffset2)), vscale2); - - const auto rf_0 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo)); - const auto rf_1 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo)); - const auto rf_2 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo)); - const auto rf_3 = 
svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo)); - - const auto pa = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1); - const auto pb = svqxtnt_s32(svqxtnb_s32(rf_2), rf_3); - const auto res = svqxtnt_s16(svqxtnb_s16(pa), pb); - - svst1_s8(pg, output_ptr + x, res); - - x += svcntb(); - pg = svwhilelt_b8(x, window_end_x); - } - while(svptest_any(svptrue_b8(), pg)); - }, - input1, input2, output); + const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr()); + + int x = window_start_x; + svbool_t pg = svwhilelt_b8(x, window_end_x); + do + { + const auto a = svld1_s8(pg, input1_ptr + x); + const auto b = svld1_s8(pg, input2_ptr + x); + + const auto af_0 = svmul_f32_z( + pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(a)), voffset1)), vscale1); + const auto af_1 = svmul_f32_z( + pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(a)), voffset1)), vscale1); + const auto af_2 = svmul_f32_z( + pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(a)), voffset1)), vscale1); + const auto af_3 = svmul_f32_z( + pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(a)), voffset1)), vscale1); + + const auto bf_0 = svmul_f32_z( + pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(b)), voffset2)), vscale2); + const auto bf_1 = svmul_f32_z( + pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(b)), voffset2)), vscale2); + const auto bf_2 = svmul_f32_z( + pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(b)), voffset2)), vscale2); + const auto bf_3 = svmul_f32_z( + pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(b)), voffset2)), vscale2); + + const auto rf_0 = + svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo)); + const auto rf_1 = + svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo)); + const auto rf_2 = + svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo)); + const auto rf_3 = + svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo)); + + const auto pa = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1); + const auto pb = svqxtnt_s32(svqxtnb_s32(rf_2), rf_3); + const auto res = svqxtnt_s16(svqxtnb_s16(pa), pb); + + svst1_s8(pg, output_ptr + x, res); + + x += svcntb(); + pg = svwhilelt_b8(x, window_end_x); + } while (svptest_any(svptrue_b8(), pg)); + }, + input1, input2, output); } } } // namespace cpu diff --git a/src/cpu/kernels/add/generic/sve2/qsymm16.cpp b/src/cpu/kernels/add/generic/sve2/qsymm16.cpp index 8c48ded942..17a42c2138 100644 --- a/src/cpu/kernels/add/generic/sve2/qsymm16.cpp +++ b/src/cpu/kernels/add/generic/sve2/qsymm16.cpp @@ -26,15 +26,18 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/Traits.h" + #include "src/core/NEON/SVEMath.h" #include "src/core/NEON/wrapper/intrinsics/intrinsics.h" + #include <arm_sve.h> namespace arm_compute { namespace cpu { -void add_qsymm16_sve2(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void add_qsymm16_sve2( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { ARM_COMPUTE_UNUSED(policy); @@ -59,7 +62,7 @@ void add_qsymm16_sve2(const ITensor *src0, const ITensor 
*src1, ITensor *dst, co const auto invvscaleo = svdup_n_f32(1.f / oq_info.scale); const auto all_true_pg = svptrue_b16(); - if(is_broadcast_across_x) + if (is_broadcast_across_x) { const bool is_broadcast_input_2 = input2_win.x().step() == 0; Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; @@ -74,39 +77,40 @@ void add_qsymm16_sve2(const ITensor *src0, const ITensor *src1, ITensor *dst, co Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); Iterator output(dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast<const int16_t *>(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr()); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast<const int16_t *>(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr()); - const int16_t broadcast_value = *reinterpret_cast<const int16_t *>(broadcast_input.ptr()); - const auto broadcast_value_vec = svdup_n_s16(broadcast_value); + const int16_t broadcast_value = *reinterpret_cast<const int16_t *>(broadcast_input.ptr()); + const auto broadcast_value_vec = svdup_n_s16(broadcast_value); - int x = window_start_x; - svbool_t pg = svwhilelt_b16(x, window_end_x); + int x = window_start_x; + svbool_t pg = svwhilelt_b16(x, window_end_x); - const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(broadcast_value_vec)), vscale2); - const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(broadcast_value_vec)), vscale2); + const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(broadcast_value_vec)), vscale2); + const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(broadcast_value_vec)), vscale2); - do - { - const auto a = svld1_s16(pg, non_broadcast_input_ptr + x); - const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(a)), vscale1); - const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(a)), vscale1); + do + { + const auto a = svld1_s16(pg, non_broadcast_input_ptr + x); + const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(a)), vscale1); + const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(a)), vscale1); - const auto rf_0 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_0, bf_0), invvscaleo)); - const auto rf_1 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_1, bf_1), invvscaleo)); + const auto rf_0 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_0, bf_0), invvscaleo)); + const auto rf_1 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_1, bf_1), invvscaleo)); - const auto res = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1); + const auto res = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1); - svst1_s16(pg, output_ptr + x, res); + svst1_s16(pg, output_ptr + x, res); - x += svcnth(); - pg = svwhilelt_b16(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - broadcast_input, non_broadcast_input, output); + x += svcnth(); + pg = svwhilelt_b16(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); + }, + broadcast_input, non_broadcast_input, output); } else { @@ -118,37 +122,38 @@ void add_qsymm16_sve2(const ITensor *src0, const ITensor *src1, ITensor *dst, co Iterator input2(src1, input2_win); Iterator output(dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr()); - const auto input2_ptr = reinterpret_cast<const 
int16_t *>(input2.ptr()); - const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr()); - - int x = window_start_x; - svbool_t pg = svwhilelt_b16(x, window_end_x); - do + execute_window_loop( + win, + [&](const Coordinates &) { - auto a = svld1_s16(pg, input1_ptr + x); - auto b = svld1_s16(pg, input2_ptr + x); - - const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(a)), vscale1); - const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(a)), vscale1); - - const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(b)), vscale2); - const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(b)), vscale2); - - const auto rf_0 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_0, bf_0), invvscaleo)); - const auto rf_1 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_1, bf_1), invvscaleo)); - - const auto res = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1); - svst1_s16(pg, output_ptr + x, res); - - x += svcnth(); - pg = svwhilelt_b16(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - input1, input2, output); + const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr()); + + int x = window_start_x; + svbool_t pg = svwhilelt_b16(x, window_end_x); + do + { + auto a = svld1_s16(pg, input1_ptr + x); + auto b = svld1_s16(pg, input2_ptr + x); + + const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(a)), vscale1); + const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(a)), vscale1); + + const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(b)), vscale2); + const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(b)), vscale2); + + const auto rf_0 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_0, bf_0), invvscaleo)); + const auto rf_1 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_1, bf_1), invvscaleo)); + + const auto res = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1); + svst1_s16(pg, output_ptr + x, res); + + x += svcnth(); + pg = svwhilelt_b16(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); + }, + input1, input2, output); } } } // namespace cpu diff --git a/src/cpu/kernels/add/list.h b/src/cpu/kernels/add/list.h index 7cdb70fd9e..1040c39a41 100644 --- a/src/cpu/kernels/add/list.h +++ b/src/cpu/kernels/add/list.h @@ -31,8 +31,9 @@ namespace arm_compute { namespace cpu { -#define DECLARE_ADD_KERNEL(func_name) \ - void func_name(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +#define DECLARE_ADD_KERNEL(func_name) \ + void func_name(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, \ + const Window &window) DECLARE_ADD_KERNEL(add_qasymm8_neon); DECLARE_ADD_KERNEL(add_qasymm8_signed_neon); @@ -55,4 +56,4 @@ DECLARE_ADD_KERNEL(add_qsymm16_sve2); } // namespace cpu } // namespace arm_compute -#endif // SRC_CORE_KERNELS_ADD_LIST_H
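For reference, the SVE/SVE2 add kernels above all share the same predicated tail-loop structure: build a while-lt predicate over the remaining elements, do predicated loads, add (saturating or not), store, advance by the vector length, and repeat while any lane is still active. The quantized variants (qasymm8, qasymm8_signed, qsymm16) follow the same loop but dequantize each input with its own scale/offset, add in float, and requantize with the output scale/offset before a saturating narrow. A minimal standalone sketch of the basic pattern, illustrative only and not part of this change (assumes an AArch64 toolchain with SVE enabled):

// Sketch of the predicated SVE add loop used by add_same_sve (illustrative only).
#include <arm_sve.h>
#include <cstdint>

void add_f32_sve_sketch(const float *in0, const float *in1, float *out, int64_t len)
{
    int64_t  x  = 0;
    svbool_t pg = svwhilelt_b32(x, len); // predicate covering the remaining elements
    do
    {
        const svfloat32_t a = svld1_f32(pg, in0 + x); // predicated loads
        const svfloat32_t b = svld1_f32(pg, in1 + x);
        svst1_f32(pg, out + x, svadd_f32_z(pg, a, b)); // predicated store of the sum
        x += svcntw();                   // advance by the number of 32-bit lanes
        pg = svwhilelt_b32(x, len);
    } while (svptest_any(svptrue_b32(), pg)); // continue while any lane is active
}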
\ No newline at end of file +#endif // SRC_CORE_KERNELS_ADD_LIST_H diff --git a/src/cpu/kernels/addmuladd/generic/neon/fp16.cpp b/src/cpu/kernels/addmuladd/generic/neon/fp16.cpp index d8e5f694a8..b4b81aa78b 100644 --- a/src/cpu/kernels/addmuladd/generic/neon/fp16.cpp +++ b/src/cpu/kernels/addmuladd/generic/neon/fp16.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Window.h" #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/cpu/CpuTypes.h" #include <cstddef> @@ -38,16 +39,20 @@ namespace { using arm_compute::float16_t; -void a64_add_bn_clamp_direct_fp16_2x32( - float16_t *out, size_t out_stride, - float16_t *out_direct, size_t out_direct_stride, - const float16_t *in0, size_t in0_stride, - const float16_t *in1, size_t in1_stride, - const float16_t *bn_mul, - const float16_t *bn_add, - const float16_t minval, - const float16_t maxval, - size_t width, size_t height) +void a64_add_bn_clamp_direct_fp16_2x32(float16_t *out, + size_t out_stride, + float16_t *out_direct, + size_t out_direct_stride, + const float16_t *in0, + size_t in0_stride, + const float16_t *in1, + size_t in1_stride, + const float16_t *bn_mul, + const float16_t *bn_add, + const float16_t minval, + const float16_t maxval, + size_t width, + size_t height) { struct KernelArgs { @@ -858,9 +863,14 @@ void a64_add_bn_clamp_direct_fp16_2x32( "subs x20, x20, #0x2\n" "bgt 8b\n" "58:" // odd columns skip - : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out), [out_direct] "+&r"(out_direct), [width] "+&r"(width) - : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride), [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)), [out_direct_stride] "r"(out_direct_stride), [out_stride] "r"(out_stride) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"); + : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out), + [out_direct] "+&r"(out_direct), [width] "+&r"(width) + : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride), + [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)), + [out_direct_stride] "r"(out_direct_stride), [out_stride] "r"(out_stride) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", + "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"); } } // namespace @@ -869,8 +879,15 @@ namespace arm_compute { namespace cpu { -void add_mul_add_fp16_neon(const ITensor *input1, const ITensor *input2, const ITensor *bn_mul, const ITensor *bn_add, - ITensor *add_output, ITensor *final_output, ConvertPolicy policy, const ActivationLayerInfo &act_info, const Window &window) +void add_mul_add_fp16_neon(const ITensor *input1, + const ITensor *input2, + const ITensor *bn_mul, + const ITensor *bn_add, + ITensor *add_output, + ITensor *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info, + const Window &window) { 
ARM_COMPUTE_UNUSED(policy); @@ -882,16 +899,16 @@ void add_mul_add_fp16_neon(const ITensor *input1, const ITensor *input2, const I float16_t minval = std::numeric_limits<half>::lowest(); float16_t maxval = std::numeric_limits<half>::max(); - if(act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU) + if (act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU) { minval = static_cast<float16_t>(0.f); } - else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) + else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) { minval = static_cast<float16_t>(0.f); maxval = static_cast<float16_t>(act_info.a()); } - else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) { minval = static_cast<float16_t>(act_info.b()); maxval = static_cast<float16_t>(act_info.a()); @@ -909,42 +926,37 @@ void add_mul_add_fp16_neon(const ITensor *input1, const ITensor *input2, const I const size_t width = window.num_iterations(0); const size_t height = window.num_iterations(1); - if(add_output != nullptr) + if (add_output != nullptr) { Iterator add_out_it(add_output, window); execute_window_loop( - win, [&](const Coordinates &) - { - a64_add_bn_clamp_direct_fp16_2x32( - reinterpret_cast<float16_t *>(out_it.ptr()), out_stride, - reinterpret_cast<float16_t *>(add_out_it.ptr()), out_direct_stride, - reinterpret_cast<float16_t *>(in1_it.ptr()), in0_stride, - reinterpret_cast<float16_t *>(in2_it.ptr()), in1_stride, - reinterpret_cast<float16_t *>(bn_mul->buffer()), - reinterpret_cast<float16_t *>(bn_add->buffer()), - minval, - maxval, - width, height); - }, - in1_it, in2_it, add_out_it, out_it); + win, + [&](const Coordinates &) + { + a64_add_bn_clamp_direct_fp16_2x32(reinterpret_cast<float16_t *>(out_it.ptr()), out_stride, + reinterpret_cast<float16_t *>(add_out_it.ptr()), out_direct_stride, + reinterpret_cast<float16_t *>(in1_it.ptr()), in0_stride, + reinterpret_cast<float16_t *>(in2_it.ptr()), in1_stride, + reinterpret_cast<float16_t *>(bn_mul->buffer()), + reinterpret_cast<float16_t *>(bn_add->buffer()), minval, maxval, + width, height); + }, + in1_it, in2_it, add_out_it, out_it); } else { execute_window_loop( - win, [&](const Coordinates &) - { - a64_add_bn_clamp_direct_fp16_2x32( - reinterpret_cast<float16_t *>(out_it.ptr()), out_stride, - nullptr, out_direct_stride, - reinterpret_cast<float16_t *>(in1_it.ptr()), in0_stride, - reinterpret_cast<float16_t *>(in2_it.ptr()), in1_stride, - reinterpret_cast<float16_t *>(bn_mul->buffer()), - reinterpret_cast<float16_t *>(bn_add->buffer()), - minval, - maxval, - width, height); - }, - in1_it, in2_it, out_it); + win, + [&](const Coordinates &) + { + a64_add_bn_clamp_direct_fp16_2x32(reinterpret_cast<float16_t *>(out_it.ptr()), out_stride, nullptr, + out_direct_stride, reinterpret_cast<float16_t *>(in1_it.ptr()), + in0_stride, reinterpret_cast<float16_t *>(in2_it.ptr()), in1_stride, + reinterpret_cast<float16_t *>(bn_mul->buffer()), + reinterpret_cast<float16_t *>(bn_add->buffer()), minval, maxval, + width, height); + }, + in1_it, in2_it, out_it); } } } // namespace cpu diff --git a/src/cpu/kernels/addmuladd/generic/neon/fp32.cpp b/src/cpu/kernels/addmuladd/generic/neon/fp32.cpp index b0c487ec56..f0444b6acd 100644 --- a/src/cpu/kernels/addmuladd/generic/neon/fp32.cpp +++ b/src/cpu/kernels/addmuladd/generic/neon/fp32.cpp @@ -35,16 +35,20 @@ #ifdef 
__aarch64__ namespace { -void a64_add_bn_clamp_direct_fp32_2x16( - float *out, size_t out_stride, - float *out_direct, size_t out_direct_stride, - const float *in0, size_t in0_stride, - const float *in1, size_t in1_stride, - const float *bn_mul, - const float *bn_add, - const float minval, - const float maxval, - size_t width, size_t height) +void a64_add_bn_clamp_direct_fp32_2x16(float *out, + size_t out_stride, + float *out_direct, + size_t out_direct_stride, + const float *in0, + size_t in0_stride, + const float *in1, + size_t in1_stride, + const float *bn_mul, + const float *bn_add, + const float minval, + const float maxval, + size_t width, + size_t height) { struct KernelArgs { @@ -631,18 +635,30 @@ void a64_add_bn_clamp_direct_fp32_2x16( "subs x20, x20, #0x2\n" "bgt 8b\n" "34:" // odd columns skip - : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out), [out_direct] "+&r"(out_direct), [width] "+&r"(width) - : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride), [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)), [out_direct_stride] "r"(out_direct_stride), [out_stride] "r"(out_stride) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"); -} + : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out), + [out_direct] "+&r"(out_direct), [width] "+&r"(width) + : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride), + [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)), + [out_direct_stride] "r"(out_direct_stride), [out_stride] "r"(out_stride) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", + "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"); } +} // namespace namespace arm_compute { namespace cpu { -void add_mul_add_fp32_neon(const ITensor *input1, const ITensor *input2, const ITensor *bn_mul, const ITensor *bn_add, - ITensor *add_output, ITensor *final_output, ConvertPolicy policy, const ActivationLayerInfo &act_info, const Window &window) +void add_mul_add_fp32_neon(const ITensor *input1, + const ITensor *input2, + const ITensor *bn_mul, + const ITensor *bn_add, + ITensor *add_output, + ITensor *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info, + const Window &window) { ARM_COMPUTE_UNUSED(policy); @@ -654,16 +670,16 @@ void add_mul_add_fp32_neon(const ITensor *input1, const ITensor *input2, const I float minval = std::numeric_limits<float>::lowest(); float maxval = std::numeric_limits<float>::max(); - if(act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU) + if (act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU) { minval = 0.f; } - else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) + else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) { minval = 0.f; maxval = act_info.a(); } - else 
if(act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) { minval = act_info.b(); maxval = act_info.a(); @@ -681,42 +697,34 @@ void add_mul_add_fp32_neon(const ITensor *input1, const ITensor *input2, const I const size_t width = window.num_iterations(0); const size_t height = window.num_iterations(1); - if(add_output != nullptr) + if (add_output != nullptr) { Iterator add_out_it(add_output, window); execute_window_loop( - win, [&](const Coordinates &) - { - a64_add_bn_clamp_direct_fp32_2x16( - reinterpret_cast<float *>(out_it.ptr()), out_stride, - reinterpret_cast<float *>(add_out_it.ptr()), out_direct_stride, - reinterpret_cast<float *>(in1_it.ptr()), in0_stride, - reinterpret_cast<float *>(in2_it.ptr()), in1_stride, - reinterpret_cast<float *>(bn_mul->buffer()), - reinterpret_cast<float *>(bn_add->buffer()), - minval, - maxval, - width, height); - }, - in1_it, in2_it, add_out_it, out_it); + win, + [&](const Coordinates &) + { + a64_add_bn_clamp_direct_fp32_2x16( + reinterpret_cast<float *>(out_it.ptr()), out_stride, reinterpret_cast<float *>(add_out_it.ptr()), + out_direct_stride, reinterpret_cast<float *>(in1_it.ptr()), in0_stride, + reinterpret_cast<float *>(in2_it.ptr()), in1_stride, reinterpret_cast<float *>(bn_mul->buffer()), + reinterpret_cast<float *>(bn_add->buffer()), minval, maxval, width, height); + }, + in1_it, in2_it, add_out_it, out_it); } else { execute_window_loop( - win, [&](const Coordinates &) - { - a64_add_bn_clamp_direct_fp32_2x16( - reinterpret_cast<float *>(out_it.ptr()), out_stride, - nullptr, out_direct_stride, - reinterpret_cast<float *>(in1_it.ptr()), in0_stride, - reinterpret_cast<float *>(in2_it.ptr()), in1_stride, - reinterpret_cast<float *>(bn_mul->buffer()), - reinterpret_cast<float *>(bn_add->buffer()), - minval, - maxval, - width, height); - }, - in1_it, in2_it, out_it); + win, + [&](const Coordinates &) + { + a64_add_bn_clamp_direct_fp32_2x16( + reinterpret_cast<float *>(out_it.ptr()), out_stride, nullptr, out_direct_stride, + reinterpret_cast<float *>(in1_it.ptr()), in0_stride, reinterpret_cast<float *>(in2_it.ptr()), + in1_stride, reinterpret_cast<float *>(bn_mul->buffer()), + reinterpret_cast<float *>(bn_add->buffer()), minval, maxval, width, height); + }, + in1_it, in2_it, out_it); } } } // namespace cpu diff --git a/src/cpu/kernels/addmuladd/generic/neon/qasymm8.cpp b/src/cpu/kernels/addmuladd/generic/neon/qasymm8.cpp index f7448a6717..035805c944 100644 --- a/src/cpu/kernels/addmuladd/generic/neon/qasymm8.cpp +++ b/src/cpu/kernels/addmuladd/generic/neon/qasymm8.cpp @@ -36,22 +36,30 @@ #ifdef __aarch64__ namespace { -void a64_add_bn_clamp_direct_u8_fp32_2x16( - uint8_t *out, size_t out_stride, - uint8_t *out_direct, size_t out_direct_stride, - const uint8_t *in0, size_t in0_stride, - const uint8_t *in1, size_t in1_stride, - const float *bn_mul, - const float *bn_add, - const uint8_t minval, - const uint8_t maxval, - int32_t out_zeropt, float out_scale, - int32_t out_direct_zeropt, float out_direct_scale, - int32_t in0_zeropt, float in0_scale, - int32_t in1_zeropt, float in1_scale, - size_t width, size_t height) +void a64_add_bn_clamp_direct_u8_fp32_2x16(uint8_t *out, + size_t out_stride, + uint8_t *out_direct, + size_t out_direct_stride, + const uint8_t *in0, + size_t in0_stride, + const uint8_t *in1, + size_t in1_stride, + const float *bn_mul, + const float *bn_add, + const uint8_t minval, + const uint8_t maxval, + int32_t 
out_zeropt, + float out_scale, + int32_t out_direct_zeropt, + float out_direct_scale, + int32_t in0_zeropt, + float in0_scale, + int32_t in1_zeropt, + float in1_scale, + size_t width, + size_t height) { - float scales[4] = { in0_scale, in1_scale, 1.0f / out_scale, 1.0f / out_direct_scale }; + float scales[4] = {in0_scale, in1_scale, 1.0f / out_scale, 1.0f / out_direct_scale}; struct KernelArgs { const float *scales; @@ -709,9 +717,19 @@ void a64_add_bn_clamp_direct_u8_fp32_2x16( "subs x23, x23, #0x2\n" "bgt 6b\n" "32:" // odd columns skip - : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out), [out_direct] "+&r"(out_direct), [width] "+&r"(width) - : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride), [offsetof_in0_zeropt] "I"(offsetof(KernelArgs, in0_zeropt)), [offsetof_in1_zeropt] "I"(offsetof(KernelArgs, in1_zeropt)), [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)), [offsetof_out_direct_zeropt] "I"(offsetof(KernelArgs, out_direct_zeropt)), [offsetof_out_zeropt] "I"(offsetof(KernelArgs, out_zeropt)), [offsetof_scales] "I"(offsetof(KernelArgs, scales)), [out_direct_stride] "r"(out_direct_stride), [out_stride] "r"(out_stride) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"); + : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out), + [out_direct] "+&r"(out_direct), [width] "+&r"(width) + : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride), + [offsetof_in0_zeropt] "I"(offsetof(KernelArgs, in0_zeropt)), + [offsetof_in1_zeropt] "I"(offsetof(KernelArgs, in1_zeropt)), + [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)), + [offsetof_out_direct_zeropt] "I"(offsetof(KernelArgs, out_direct_zeropt)), + [offsetof_out_zeropt] "I"(offsetof(KernelArgs, out_zeropt)), + [offsetof_scales] "I"(offsetof(KernelArgs, scales)), [out_direct_stride] "r"(out_direct_stride), + [out_stride] "r"(out_stride) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", + "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"); } } // namespace @@ -720,8 +738,15 @@ namespace arm_compute { namespace cpu { -void add_mul_add_u8_neon(const ITensor *input1, const ITensor *input2, const ITensor *bn_mul, const ITensor *bn_add, - ITensor *add_output, ITensor *final_output, ConvertPolicy policy, const ActivationLayerInfo &act_info, const Window &window) +void add_mul_add_u8_neon(const ITensor *input1, + const ITensor *input2, + const ITensor *bn_mul, + const ITensor *bn_add, + ITensor *add_output, + ITensor *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info, + const Window &window) { ARM_COMPUTE_UNUSED(policy); @@ -739,24 +764,25 @@ void add_mul_add_u8_neon(const ITensor *input1, const ITensor *input2, const ITe uint8_t maxval = std::numeric_limits<uint8_t>::max(); const UniformQuantizationInfo final_output_qinfo = 
final_output_info->quantization_info().uniform(); - if(act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU) + if (act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU) { minval = quantize_qasymm8(0.f, final_output_qinfo); } - else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) + else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) { minval = quantize_qasymm8(0.f, final_output_qinfo); maxval = quantize_qasymm8(act_info.a(), final_output_qinfo); } - else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) { minval = quantize_qasymm8(act_info.b(), final_output_qinfo); maxval = quantize_qasymm8(act_info.a(), final_output_qinfo); } - const UniformQuantizationInfo in1_qinfo = input1_info->quantization_info().uniform(); - const UniformQuantizationInfo in2_qinfo = input2_info->quantization_info().uniform(); - const UniformQuantizationInfo add_output_qinfo = (add_output != nullptr) ? add_output_info->quantization_info().uniform() : UniformQuantizationInfo(); + const UniformQuantizationInfo in1_qinfo = input1_info->quantization_info().uniform(); + const UniformQuantizationInfo in2_qinfo = input2_info->quantization_info().uniform(); + const UniformQuantizationInfo add_output_qinfo = + (add_output != nullptr) ? add_output_info->quantization_info().uniform() : UniformQuantizationInfo(); const int32_t in1_offset = in1_qinfo.offset; const int32_t in2_offset = in2_qinfo.offset; @@ -783,50 +809,35 @@ void add_mul_add_u8_neon(const ITensor *input1, const ITensor *input2, const ITe const size_t width = window.num_iterations(0); const size_t height = window.num_iterations(1); - if(add_output != nullptr) + if (add_output != nullptr) { Iterator add_out_it(add_output, window); execute_window_loop( - win, [&](const Coordinates &) - { - a64_add_bn_clamp_direct_u8_fp32_2x16( - reinterpret_cast<uint8_t *>(out_it.ptr()), out_stride, - reinterpret_cast<uint8_t *>(add_out_it.ptr()), out_direct_stride, - reinterpret_cast<uint8_t *>(in1_it.ptr()), in0_stride, - reinterpret_cast<uint8_t *>(in2_it.ptr()), in1_stride, - bn_mul_buffer, - bn_add_buffer, - minval, - maxval, - out_offset, out_scale, - out_direct_offset, out_direct_scale, - in1_offset, in1_scale, - in2_offset, in2_scale, - width, height); - }, - in1_it, in2_it, add_out_it, out_it); + win, + [&](const Coordinates &) + { + a64_add_bn_clamp_direct_u8_fp32_2x16( + reinterpret_cast<uint8_t *>(out_it.ptr()), out_stride, + reinterpret_cast<uint8_t *>(add_out_it.ptr()), out_direct_stride, + reinterpret_cast<uint8_t *>(in1_it.ptr()), in0_stride, reinterpret_cast<uint8_t *>(in2_it.ptr()), + in1_stride, bn_mul_buffer, bn_add_buffer, minval, maxval, out_offset, out_scale, out_direct_offset, + out_direct_scale, in1_offset, in1_scale, in2_offset, in2_scale, width, height); + }, + in1_it, in2_it, add_out_it, out_it); } else { execute_window_loop( - win, [&](const Coordinates &) - { - a64_add_bn_clamp_direct_u8_fp32_2x16( - reinterpret_cast<uint8_t *>(out_it.ptr()), out_stride, - nullptr, out_direct_stride, - reinterpret_cast<uint8_t *>(in1_it.ptr()), in0_stride, - reinterpret_cast<uint8_t *>(in2_it.ptr()), in1_stride, - bn_mul_buffer, - bn_add_buffer, - minval, - maxval, - out_offset, out_scale, - out_direct_offset, out_direct_scale, - in1_offset, in1_scale, - in2_offset, in2_scale, - width, height); - }, - in1_it, in2_it, out_it); + win, + 
[&](const Coordinates &) + { + a64_add_bn_clamp_direct_u8_fp32_2x16( + reinterpret_cast<uint8_t *>(out_it.ptr()), out_stride, nullptr, out_direct_stride, + reinterpret_cast<uint8_t *>(in1_it.ptr()), in0_stride, reinterpret_cast<uint8_t *>(in2_it.ptr()), + in1_stride, bn_mul_buffer, bn_add_buffer, minval, maxval, out_offset, out_scale, out_direct_offset, + out_direct_scale, in1_offset, in1_scale, in2_offset, in2_scale, width, height); + }, + in1_it, in2_it, out_it); } } } // namespace cpu diff --git a/src/cpu/kernels/addmuladd/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/addmuladd/generic/neon/qasymm8_signed.cpp index 1ae2cb76a9..e1a45b467b 100644 --- a/src/cpu/kernels/addmuladd/generic/neon/qasymm8_signed.cpp +++ b/src/cpu/kernels/addmuladd/generic/neon/qasymm8_signed.cpp @@ -36,22 +36,30 @@ #ifdef __aarch64__ namespace { -void a64_add_bn_clamp_direct_s8_fp32_2x16( - int8_t *out, size_t out_stride, - int8_t *out_direct, size_t out_direct_stride, - const int8_t *in0, size_t in0_stride, - const int8_t *in1, size_t in1_stride, - const float *bn_mul, - const float *bn_add, - const int8_t minval, - const int8_t maxval, - int32_t out_zeropt, float out_scale, - int32_t out_direct_zeropt, float out_direct_scale, - int32_t in0_zeropt, float in0_scale, - int32_t in1_zeropt, float in1_scale, - size_t width, size_t height) +void a64_add_bn_clamp_direct_s8_fp32_2x16(int8_t *out, + size_t out_stride, + int8_t *out_direct, + size_t out_direct_stride, + const int8_t *in0, + size_t in0_stride, + const int8_t *in1, + size_t in1_stride, + const float *bn_mul, + const float *bn_add, + const int8_t minval, + const int8_t maxval, + int32_t out_zeropt, + float out_scale, + int32_t out_direct_zeropt, + float out_direct_scale, + int32_t in0_zeropt, + float in0_scale, + int32_t in1_zeropt, + float in1_scale, + size_t width, + size_t height) { - float scales[4] = { in0_scale, in1_scale, 1.0f / out_scale, 1.0f / out_direct_scale }; + float scales[4] = {in0_scale, in1_scale, 1.0f / out_scale, 1.0f / out_direct_scale}; struct KernelArgs { const float *scales; @@ -709,9 +717,19 @@ void a64_add_bn_clamp_direct_s8_fp32_2x16( "subs x23, x23, #0x2\n" "bgt 6b\n" "32:" // odd columns skip - : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out), [out_direct] "+&r"(out_direct), [width] "+&r"(width) - : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride), [offsetof_in0_zeropt] "I"(offsetof(KernelArgs, in0_zeropt)), [offsetof_in1_zeropt] "I"(offsetof(KernelArgs, in1_zeropt)), [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)), [offsetof_out_direct_zeropt] "I"(offsetof(KernelArgs, out_direct_zeropt)), [offsetof_out_zeropt] "I"(offsetof(KernelArgs, out_zeropt)), [offsetof_scales] "I"(offsetof(KernelArgs, scales)), [out_direct_stride] "r"(out_direct_stride), [out_stride] "r"(out_stride) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"); + : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out), + [out_direct] "+&r"(out_direct), [width] "+&r"(width) + : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride), + [offsetof_in0_zeropt] 
"I"(offsetof(KernelArgs, in0_zeropt)), + [offsetof_in1_zeropt] "I"(offsetof(KernelArgs, in1_zeropt)), + [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)), + [offsetof_out_direct_zeropt] "I"(offsetof(KernelArgs, out_direct_zeropt)), + [offsetof_out_zeropt] "I"(offsetof(KernelArgs, out_zeropt)), + [offsetof_scales] "I"(offsetof(KernelArgs, scales)), [out_direct_stride] "r"(out_direct_stride), + [out_stride] "r"(out_stride) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", + "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"); } } // namespace @@ -720,8 +738,15 @@ namespace arm_compute { namespace cpu { -void add_mul_add_s8_neon(const ITensor *input1, const ITensor *input2, const ITensor *bn_mul, const ITensor *bn_add, - ITensor *add_output, ITensor *final_output, ConvertPolicy policy, const ActivationLayerInfo &act_info, const Window &window) +void add_mul_add_s8_neon(const ITensor *input1, + const ITensor *input2, + const ITensor *bn_mul, + const ITensor *bn_add, + ITensor *add_output, + ITensor *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info, + const Window &window) { ARM_COMPUTE_UNUSED(policy); @@ -739,24 +764,25 @@ void add_mul_add_s8_neon(const ITensor *input1, const ITensor *input2, const ITe int8_t maxval = std::numeric_limits<int8_t>::max(); const UniformQuantizationInfo final_output_qinfo = final_output_info->quantization_info().uniform(); - if(act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU) + if (act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU) { minval = quantize_qasymm8_signed(0.f, final_output_qinfo); } - else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) + else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) { minval = quantize_qasymm8_signed(0.f, final_output_qinfo); maxval = quantize_qasymm8_signed(act_info.a(), final_output_qinfo); } - else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) { minval = quantize_qasymm8_signed(act_info.b(), final_output_qinfo); maxval = quantize_qasymm8_signed(act_info.a(), final_output_qinfo); } - const UniformQuantizationInfo in1_qinfo = input1_info->quantization_info().uniform(); - const UniformQuantizationInfo in2_qinfo = input2_info->quantization_info().uniform(); - const UniformQuantizationInfo add_output_qinfo = (add_output != nullptr) ? add_output_info->quantization_info().uniform() : UniformQuantizationInfo(); + const UniformQuantizationInfo in1_qinfo = input1_info->quantization_info().uniform(); + const UniformQuantizationInfo in2_qinfo = input2_info->quantization_info().uniform(); + const UniformQuantizationInfo add_output_qinfo = + (add_output != nullptr) ? 
add_output_info->quantization_info().uniform() : UniformQuantizationInfo(); const int32_t in1_offset = in1_qinfo.offset; const int32_t in2_offset = in2_qinfo.offset; @@ -783,50 +809,35 @@ void add_mul_add_s8_neon(const ITensor *input1, const ITensor *input2, const ITe const size_t width = window.num_iterations(0); const size_t height = window.num_iterations(1); - if(add_output != nullptr) + if (add_output != nullptr) { Iterator add_out_it(add_output, window); execute_window_loop( - win, [&](const Coordinates &) - { - a64_add_bn_clamp_direct_s8_fp32_2x16( - reinterpret_cast<int8_t *>(out_it.ptr()), out_stride, - reinterpret_cast<int8_t *>(add_out_it.ptr()), out_direct_stride, - reinterpret_cast<int8_t *>(in1_it.ptr()), in0_stride, - reinterpret_cast<int8_t *>(in2_it.ptr()), in1_stride, - bn_mul_buffer, - bn_add_buffer, - minval, - maxval, - out_offset, out_scale, - out_direct_offset, out_direct_scale, - in1_offset, in1_scale, - in2_offset, in2_scale, - width, height); - }, - in1_it, in2_it, add_out_it, out_it); + win, + [&](const Coordinates &) + { + a64_add_bn_clamp_direct_s8_fp32_2x16( + reinterpret_cast<int8_t *>(out_it.ptr()), out_stride, reinterpret_cast<int8_t *>(add_out_it.ptr()), + out_direct_stride, reinterpret_cast<int8_t *>(in1_it.ptr()), in0_stride, + reinterpret_cast<int8_t *>(in2_it.ptr()), in1_stride, bn_mul_buffer, bn_add_buffer, minval, maxval, + out_offset, out_scale, out_direct_offset, out_direct_scale, in1_offset, in1_scale, in2_offset, + in2_scale, width, height); + }, + in1_it, in2_it, add_out_it, out_it); } else { execute_window_loop( - win, [&](const Coordinates &) - { - a64_add_bn_clamp_direct_s8_fp32_2x16( - reinterpret_cast<int8_t *>(out_it.ptr()), out_stride, - nullptr, out_direct_stride, - reinterpret_cast<int8_t *>(in1_it.ptr()), in0_stride, - reinterpret_cast<int8_t *>(in2_it.ptr()), in1_stride, - bn_mul_buffer, - bn_add_buffer, - minval, - maxval, - out_offset, out_scale, - out_direct_offset, out_direct_scale, - in1_offset, in1_scale, - in2_offset, in2_scale, - width, height); - }, - in1_it, in2_it, out_it); + win, + [&](const Coordinates &) + { + a64_add_bn_clamp_direct_s8_fp32_2x16( + reinterpret_cast<int8_t *>(out_it.ptr()), out_stride, nullptr, out_direct_stride, + reinterpret_cast<int8_t *>(in1_it.ptr()), in0_stride, reinterpret_cast<int8_t *>(in2_it.ptr()), + in1_stride, bn_mul_buffer, bn_add_buffer, minval, maxval, out_offset, out_scale, out_direct_offset, + out_direct_scale, in1_offset, in1_scale, in2_offset, in2_scale, width, height); + }, + in1_it, in2_it, out_it); } } } // namespace cpu diff --git a/src/cpu/kernels/addmuladd/list.h b/src/cpu/kernels/addmuladd/list.h index a7c22c06d8..568003a916 100644 --- a/src/cpu/kernels/addmuladd/list.h +++ b/src/cpu/kernels/addmuladd/list.h @@ -32,9 +32,10 @@ namespace arm_compute { namespace cpu { -#define DECLARE_ADD_MUL_ADD_KERNEL(func_name) \ +#define DECLARE_ADD_MUL_ADD_KERNEL(func_name) \ void func_name(const ITensor *input1, const ITensor *input2, const ITensor *bn_mul, const ITensor *bn_add, \ - ITensor *add_output, ITensor *final_output, ConvertPolicy policy, const ActivationLayerInfo &act_info, const Window &window) + ITensor *add_output, ITensor *final_output, ConvertPolicy policy, \ + const ActivationLayerInfo &act_info, const Window &window) DECLARE_ADD_MUL_ADD_KERNEL(add_mul_add_fp32_neon); DECLARE_ADD_MUL_ADD_KERNEL(add_mul_add_fp16_neon); diff --git a/src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h b/src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h index 10bf8e4ff7..6e8f32ef47 
100644 --- a/src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h +++ b/src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h @@ -26,6 +26,7 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" + #include "src/core/NEON/INEKernel.h" #include "src/cpu/kernels/assembly/arm_gemm_compute_iface.hpp" @@ -57,13 +58,12 @@ class CpuGemmAssemblyWrapperKernel final : public INEKernel public: /** Constructor */ - CpuGemmAssemblyWrapperKernel() - : _kernel(nullptr), _name("CpuGemmAssemblyWrapperKernel") + CpuGemmAssemblyWrapperKernel() : _kernel(nullptr), _name("CpuGemmAssemblyWrapperKernel") { } - CpuGemmAssemblyWrapperKernel(CpuGemmAssemblyWrapperKernel &) = delete; - CpuGemmAssemblyWrapperKernel(CpuGemmAssemblyWrapperKernel &&) = default; + CpuGemmAssemblyWrapperKernel(CpuGemmAssemblyWrapperKernel &) = delete; + CpuGemmAssemblyWrapperKernel(CpuGemmAssemblyWrapperKernel &&) = default; CpuGemmAssemblyWrapperKernel &operator=(CpuGemmAssemblyWrapperKernel &) = delete; const char *name() const override @@ -110,7 +110,7 @@ public: INEKernel::configure(win); - if(!kernel_name_tag.empty()) + if (!kernel_name_tag.empty()) { _name += "/" + kernel_name_tag; } @@ -132,7 +132,7 @@ public: private: arm_gemm::GemmCommon<TypeInput, TypeOutput> *_kernel; - std::string _name; + std::string _name; }; } // namespace kernel } // namespace cpu diff --git a/src/cpu/kernels/assembly/arm_gemm.hpp b/src/cpu/kernels/assembly/arm_gemm.hpp index 4c127b4ec3..9a913c5c58 100644 --- a/src/cpu/kernels/assembly/arm_gemm.hpp +++ b/src/cpu/kernels/assembly/arm_gemm.hpp @@ -23,13 +23,12 @@ */ #pragma once +#include "arm_gemm_local.hpp" +#include "gemm_common.hpp" #include <cstring> #include <memory> #include <vector> -#include "arm_gemm_local.hpp" -#include "gemm_common.hpp" - namespace arm_gemm { enum class GemmMethod @@ -111,8 +110,7 @@ struct GemmConfig unsigned int outer_block_size = 0; WeightFormat weight_format = WeightFormat::ANY; - GemmConfig(GemmMethod method) - : method(method) + GemmConfig(GemmMethod method) : method(method) { } GemmConfig() @@ -133,8 +131,7 @@ struct Activation float param1; float param2; - Activation(Type type = Type::None, float p1 = 0.0f, float p2 = 0.0f) - : type(type), param1(p1), param2(p2) + Activation(Type type = Type::None, float p1 = 0.0f, float p2 = 0.0f) : type(type), param1(p1), param2(p2) { } }; @@ -156,12 +153,32 @@ public: bool _fast_mode; const GemmConfig *_cfg; - GemmArgs(const CPUInfo *ci, unsigned int M, unsigned int N, - unsigned int K, unsigned int Ksections, unsigned int nbatches, - unsigned int nmulti, bool indirect_input, Activation act, const int maxthreads, - bool fixed_format = false, bool fast_mode = false, const GemmConfig *cfg = nullptr) - : _ci(ci), _Msize(M), _Nsize(N), _Ksize(K), _Ksections(Ksections), _nbatches(nbatches), _nmulti(nmulti), _indirect_input(indirect_input), _act(act), _maxthreads(maxthreads), - _fixed_format(fixed_format), _fast_mode(fast_mode), _cfg(cfg) + GemmArgs(const CPUInfo *ci, + unsigned int M, + unsigned int N, + unsigned int K, + unsigned int Ksections, + unsigned int nbatches, + unsigned int nmulti, + bool indirect_input, + Activation act, + const int maxthreads, + bool fixed_format = false, + bool fast_mode = false, + const GemmConfig *cfg = nullptr) + : _ci(ci), + _Msize(M), + _Nsize(N), + _Ksize(K), + _Ksections(Ksections), + _nbatches(nbatches), + _nmulti(nmulti), + _indirect_input(indirect_input), + _act(act), + _maxthreads(maxthreads), + _fixed_format(fixed_format), + _fast_mode(fast_mode), + _cfg(cfg) { } }; @@ 
-187,23 +204,51 @@ public: Requantize32() = default; // Constructor for per-tensor quantization - Requantize32(const int32_t *bias, size_t bias_multi_stride, - int32_t a_offset, int32_t b_offset, int32_t c_offset, - int32_t requant_shift, int32_t requant_mul, int32_t minv, int32_t maxv) - : bias(bias), bias_multi_stride(bias_multi_stride), a_offset(a_offset), b_offset(b_offset), c_offset(c_offset), per_channel_requant(false), per_layer_left_shift(std::max<int32_t>(requant_shift, 0)), - per_layer_right_shift(std::min<int32_t>(requant_shift, 0)), per_layer_mul(requant_mul), minval(minv), maxval(maxv) + Requantize32(const int32_t *bias, + size_t bias_multi_stride, + int32_t a_offset, + int32_t b_offset, + int32_t c_offset, + int32_t requant_shift, + int32_t requant_mul, + int32_t minv, + int32_t maxv) + : bias(bias), + bias_multi_stride(bias_multi_stride), + a_offset(a_offset), + b_offset(b_offset), + c_offset(c_offset), + per_channel_requant(false), + per_layer_left_shift(std::max<int32_t>(requant_shift, 0)), + per_layer_right_shift(std::min<int32_t>(requant_shift, 0)), + per_layer_mul(requant_mul), + minval(minv), + maxval(maxv) { } // Constructor for per-channel quantization - Requantize32(const int32_t *bias, size_t bias_multi_stride, - int32_t a_offset, int32_t b_offset, int32_t c_offset, + Requantize32(const int32_t *bias, + size_t bias_multi_stride, + int32_t a_offset, + int32_t b_offset, + int32_t c_offset, const int32_t *requant_left_shifts, const int32_t *requant_right_shifts, const int32_t *requant_muls, - int32_t minv, int32_t maxv) - : bias(bias), bias_multi_stride(bias_multi_stride), a_offset(a_offset), b_offset(b_offset), c_offset(c_offset), per_channel_requant(true), per_channel_left_shifts(requant_left_shifts), - per_channel_right_shifts(requant_right_shifts), per_channel_muls(requant_muls), minval(minv), maxval(maxv) + int32_t minv, + int32_t maxv) + : bias(bias), + bias_multi_stride(bias_multi_stride), + a_offset(a_offset), + b_offset(b_offset), + c_offset(c_offset), + per_channel_requant(true), + per_channel_left_shifts(requant_left_shifts), + per_channel_right_shifts(requant_right_shifts), + per_channel_muls(requant_muls), + minval(minv), + maxval(maxv) { } }; diff --git a/src/cpu/kernels/assembly/arm_gemm_compute_iface.hpp b/src/cpu/kernels/assembly/arm_gemm_compute_iface.hpp index 718fcd1fb4..0672e899b6 100644 --- a/src/cpu/kernels/assembly/arm_gemm_compute_iface.hpp +++ b/src/cpu/kernels/assembly/arm_gemm_compute_iface.hpp @@ -27,7 +27,6 @@ #include "arm_compute/core/Window.h" #include "ndrange.hpp" - #include <cassert> /* This file contains mapping between integral types used in arm_compute and arm_gemm @@ -38,8 +37,7 @@ namespace arm_gemm { //we want to unify the maximum number of dimensions used beween arm_gemm and arm compute library -constexpr std::size_t ndrange_max = - arm_compute::Dimensions<unsigned int>::num_max_dimensions; +constexpr std::size_t ndrange_max = arm_compute::Dimensions<unsigned int>::num_max_dimensions; using ndrange_t = NDRange<ndrange_max>; using ndcoord_t = NDCoordinate<ndrange_max>; @@ -56,7 +54,7 @@ inline arm_compute::Window to_window(const ndrange_t &ndr) { arm_compute::Window win; - for(unsigned int i = 0; i != ndrange_max; ++i) + for (unsigned int i = 0; i != ndrange_max; ++i) { //populate the window with the dimensions of the NDRange win.set(i, arm_compute::Window::Dimension(0, ndr.get_size(i))); @@ -75,7 +73,7 @@ inline arm_compute::Window to_window(const ndcoord_t &ndc) { arm_compute::Window win; - for(unsigned int i = 0; i != 
ndrange_max; ++i) + for (unsigned int i = 0; i != ndrange_max; ++i) { const auto start = ndc.get_position(i); const auto size = ndc.get_size(i); @@ -98,15 +96,12 @@ inline arm_compute::Window to_window(const ndcoord_t &ndc) */ inline ndrange_t to_ndrange(const arm_compute::Window &win) { - return - { - static_cast<unsigned int>(win[0].end() - win[0].start()), - static_cast<unsigned int>(win[1].end() - win[1].start()), - static_cast<unsigned int>(win[2].end() - win[2].start()), - static_cast<unsigned int>(win[3].end() - win[3].start()), - static_cast<unsigned int>(win[4].end() - win[4].start()), - static_cast<unsigned int>(win[5].end() - win[5].start()) - }; + return {static_cast<unsigned int>(win[0].end() - win[0].start()), + static_cast<unsigned int>(win[1].end() - win[1].start()), + static_cast<unsigned int>(win[2].end() - win[2].start()), + static_cast<unsigned int>(win[3].end() - win[3].start()), + static_cast<unsigned int>(win[4].end() - win[4].start()), + static_cast<unsigned int>(win[5].end() - win[5].start())}; } /** Convert an `arm_compute::Window` to an `arm_gemm::NDCoord` of the same max dimensions @@ -116,15 +111,12 @@ inline ndrange_t to_ndrange(const arm_compute::Window &win) */ inline ndcoord_t to_ndcoord(const arm_compute::Window &win) { - return - { - { static_cast<unsigned int>(win[0].start()), static_cast<unsigned int>(win[0].end() - win[0].start()) }, - { static_cast<unsigned int>(win[1].start()), static_cast<unsigned int>(win[1].end() - win[1].start()) }, - { static_cast<unsigned int>(win[2].start()), static_cast<unsigned int>(win[2].end() - win[2].start()) }, - { static_cast<unsigned int>(win[3].start()), static_cast<unsigned int>(win[3].end() - win[3].start()) }, - { static_cast<unsigned int>(win[4].start()), static_cast<unsigned int>(win[4].end() - win[4].start()) }, - { static_cast<unsigned int>(win[5].start()), static_cast<unsigned int>(win[5].end() - win[5].start()) } - }; + return {{static_cast<unsigned int>(win[0].start()), static_cast<unsigned int>(win[0].end() - win[0].start())}, + {static_cast<unsigned int>(win[1].start()), static_cast<unsigned int>(win[1].end() - win[1].start())}, + {static_cast<unsigned int>(win[2].start()), static_cast<unsigned int>(win[2].end() - win[2].start())}, + {static_cast<unsigned int>(win[3].start()), static_cast<unsigned int>(win[3].end() - win[3].start())}, + {static_cast<unsigned int>(win[4].start()), static_cast<unsigned int>(win[4].end() - win[4].start())}, + {static_cast<unsigned int>(win[5].start()), static_cast<unsigned int>(win[5].end() - win[5].start())}}; } } //namespace arm_gemm diff --git a/src/cpu/kernels/assembly/gemm_common.hpp b/src/cpu/kernels/assembly/gemm_common.hpp index 834cd1061e..6fe9f13f02 100644 --- a/src/cpu/kernels/assembly/gemm_common.hpp +++ b/src/cpu/kernels/assembly/gemm_common.hpp @@ -25,7 +25,6 @@ #include "convolution_parameters.hpp" #include "ndrange.hpp" - #include <cstddef> namespace arm_gemm @@ -51,10 +50,19 @@ public: * appropriately typed pointers. If B is pretransposed (see below) then * the settings for B here are ignored. 
*/ - virtual void set_arrays_generic(const void *A, const int lda, const int A_batch_stride, const int A_multi_stride, - const void *B, const int ldb, /* batches share B */ const int B_multi_stride, - void *C, const int ldc, const int C_batch_stride, const int C_multi_stride, - const void *bias, /* no row or batch stride needed */ const int bias_multi_stride) = 0; + virtual void set_arrays_generic(const void *A, + const int lda, + const int A_batch_stride, + const int A_multi_stride, + const void *B, + const int ldb, + /* batches share B */ const int B_multi_stride, + void *C, + const int ldc, + const int C_batch_stride, + const int C_multi_stride, + const void *bias, + /* no row or batch stride needed */ const int bias_multi_stride) = 0; /** @returns an ndrange containing ranges of the compute space which can be * broken up and parallelised over @@ -73,7 +81,7 @@ public: * This has an empty default implementation, as GEMMs which don't care * about thread count can safely ignore this. */ - virtual void set_nthreads(int) {}; + virtual void set_nthreads(int){}; /* Whether this GEMM can be dynamically scheduled or not. */ virtual bool supports_dynamic_scheduling() const @@ -95,7 +103,7 @@ public: return 0; } /* Provide working space buffer - the void * passed in must remain allocated for the duration of any execute calls. */ - virtual void set_working_space(void *) {}; + virtual void set_working_space(void *){}; /*** "Pretransposed" interface (optional) ***/ /* Is this object set up for pretranspose? If so, pretranspose_array() needs to be called before execute(); */ @@ -122,7 +130,8 @@ public: /* The "real" version of this depends on the templated operand type (see below). */ virtual void pretranspose_B_array_generic(void *, const void *, const int, const int) = 0; /* Threaded version with window start/end parameters */ - virtual void pretranspose_B_array_part_generic(void *, const void *, const int, const int, const size_t, const size_t) = 0; + virtual void + pretranspose_B_array_part_generic(void *, const void *, const int, const int, const size_t, const size_t) = 0; /* Set pretransposed data - the void * passed in must previously have been passed to pretranspose_B_array() for the same or a similar GEMM. */ virtual void set_pretransposed_B_data(void *) @@ -186,10 +195,19 @@ protected: public: /* Pass in the pointers to the arrays to be operated on and their * strides (templated version with appropriate types). */ - virtual void set_arrays(const To *A, const int lda, const int A_batch_stride, const int A_multi_stride, - const To *B, const int ldb, /* batches share B */ const int B_multi_stride, - Tr *C, const int ldc, const int C_batch_stride, const int C_multi_stride, - const Tr *bias, /* no row or batch stride needed */ const int bias_multi_stride) + virtual void set_arrays(const To *A, + const int lda, + const int A_batch_stride, + const int A_multi_stride, + const To *B, + const int ldb, + /* batches share B */ const int B_multi_stride, + Tr *C, + const int ldc, + const int C_batch_stride, + const int C_multi_stride, + const Tr *bias, + /* no row or batch stride needed */ const int bias_multi_stride) { _Aptr = A; _lda = lda; @@ -207,25 +225,33 @@ public: } /* Implementation of the void * overload which casts its arguments to the appropriate type. 
*/ - void set_arrays_generic(const void *A, const int lda, const int A_batch_stride, const int A_multi_stride, - const void *B, const int ldb, /* batches share B */ const int B_multi_stride, - void *C, const int ldc, const int C_batch_stride, const int C_multi_stride, - const void *bias, /* no row or batch stride needed */ const int bias_multi_stride) override + void set_arrays_generic(const void *A, + const int lda, + const int A_batch_stride, + const int A_multi_stride, + const void *B, + const int ldb, + /* batches share B */ const int B_multi_stride, + void *C, + const int ldc, + const int C_batch_stride, + const int C_multi_stride, + const void *bias, + /* no row or batch stride needed */ const int bias_multi_stride) override { - set_arrays(static_cast<const To *>(A), lda, A_batch_stride, A_multi_stride, - static_cast<const To *>(B), ldb, B_multi_stride, - static_cast<Tr *>(C), ldc, C_batch_stride, C_multi_stride, + set_arrays(static_cast<const To *>(A), lda, A_batch_stride, A_multi_stride, static_cast<const To *>(B), ldb, + B_multi_stride, static_cast<Tr *>(C), ldc, C_batch_stride, C_multi_stride, static_cast<const Tr *>(bias), bias_multi_stride); } /*** "Pretransposed" interface ***/ /* Compute col sums over all columns */ - virtual void requantize_bias(void *, const To *, const int, const int) {}; + virtual void requantize_bias(void *, const To *, const int, const int){}; /* Perform pretranspose - the void * passed in must remain allocated for the duration of any execute calls. */ /* Arguments are: output buffer pointer, source pointer, source row stride, source multi stride */ - virtual void pretranspose_B_array(void *, const To *, const int, const int) {}; + virtual void pretranspose_B_array(void *, const To *, const int, const int){}; /* Implementation of the void * overload which casts its arguments to the appropriate type. */ void pretranspose_B_array_generic(void *out, const void *in, const int row_stride, const int multi_stride) override @@ -237,12 +263,14 @@ public: * The fallback/backwards compatible version of the threaded interface exposes a window size of 1 and * just calls the non-threaded functions to do the work. This is valid as with window size of 1 the only * legal values for start and end are 0 and 1 respectively. 
*/ - virtual void pretranspose_B_array_part(void *out, const To *in, const int row_stride, const int multi_stride, size_t, size_t) + virtual void + pretranspose_B_array_part(void *out, const To *in, const int row_stride, const int multi_stride, size_t, size_t) { pretranspose_B_array(out, in, row_stride, multi_stride); }; - void pretranspose_B_array_part_generic(void *out, const void *in, const int row_stride, const int multi_stride, size_t start, size_t end) override + void pretranspose_B_array_part_generic( + void *out, const void *in, const int row_stride, const int multi_stride, size_t start, size_t end) override { pretranspose_B_array_part(out, static_cast<const To *>(in), row_stride, multi_stride, start, end); } diff --git a/src/cpu/kernels/assembly/ndrange.hpp b/src/cpu/kernels/assembly/ndrange.hpp index 1c8261aef7..baccdc0d88 100644 --- a/src/cpu/kernels/assembly/ndrange.hpp +++ b/src/cpu/kernels/assembly/ndrange.hpp @@ -45,8 +45,7 @@ private: unsigned int m_end = 0; public: - NDRangeIterator(const NDRange &p, unsigned int s, unsigned int e) - : m_parent(p), m_pos(s), m_end(e) + NDRangeIterator(const NDRange &p, unsigned int s, unsigned int e) : m_parent(p), m_pos(s), m_end(e) { } @@ -59,12 +58,12 @@ private: { unsigned int r = m_pos; - if(d < (D - 1)) + if (d < (D - 1)) { r %= m_parent.m_totalsizes[d]; } - if(d > 0) + if (d > 0) { r /= m_parent.m_totalsizes[d - 1]; } @@ -98,9 +97,9 @@ private: { unsigned int t = 1; - for(unsigned int i = 0; i < D; i++) + for (unsigned int i = 0; i < D; i++) { - if(m_sizes[i] == 0) + if (m_sizes[i] == 0) { m_sizes[i] = 1; } @@ -116,14 +115,12 @@ public: NDRange(const NDRange &rhs) = default; template <typename... T> - NDRange(T... ts) - : m_sizes{ ts... } + NDRange(T... ts) : m_sizes{ts...} { set_totalsizes(); } - NDRange(const std::array<unsigned int, D> &n) - : m_sizes(n) + NDRange(const std::array<unsigned int, D> &n) : m_sizes(n) { set_totalsizes(); } @@ -163,7 +160,7 @@ public: std::array<int_t, N> sizes{}; std::size_t i = 0; - for(auto &p : list) + for (auto &p : list) { m_positions[i] = p.first; sizes[i++] = p.second; diff --git a/src/cpu/kernels/boundingboxtransform/generic/neon/fp16.cpp b/src/cpu/kernels/boundingboxtransform/generic/neon/fp16.cpp index 5661479059..dbdec5fb50 100644 --- a/src/cpu/kernels/boundingboxtransform/generic/neon/fp16.cpp +++ b/src/cpu/kernels/boundingboxtransform/generic/neon/fp16.cpp @@ -29,7 +29,11 @@ namespace arm_compute { namespace cpu { -void neon_fp16_boundingboxtransform(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window) +void neon_fp16_boundingboxtransform(const ITensor *boxes, + ITensor *pred_boxes, + const ITensor *deltas, + BoundingBoxTransformInfo bbinfo, + const Window &window) { return bounding_box_transform<float16_t>(boxes, pred_boxes, deltas, bbinfo, window); } diff --git a/src/cpu/kernels/boundingboxtransform/generic/neon/fp32.cpp b/src/cpu/kernels/boundingboxtransform/generic/neon/fp32.cpp index 34ff9224d5..0224b3406a 100644 --- a/src/cpu/kernels/boundingboxtransform/generic/neon/fp32.cpp +++ b/src/cpu/kernels/boundingboxtransform/generic/neon/fp32.cpp @@ -26,7 +26,11 @@ namespace arm_compute { namespace cpu { -void neon_fp32_boundingboxtransform(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window) +void neon_fp32_boundingboxtransform(const ITensor *boxes, + ITensor *pred_boxes, + const ITensor *deltas, + BoundingBoxTransformInfo bbinfo, + const Window &window) 
{ return bounding_box_transform<float>(boxes, pred_boxes, deltas, bbinfo, window); } diff --git a/src/cpu/kernels/boundingboxtransform/generic/neon/impl.cpp b/src/cpu/kernels/boundingboxtransform/generic/neon/impl.cpp index b3ffd0a676..5a2939b587 100644 --- a/src/cpu/kernels/boundingboxtransform/generic/neon/impl.cpp +++ b/src/cpu/kernels/boundingboxtransform/generic/neon/impl.cpp @@ -29,7 +29,11 @@ namespace arm_compute { namespace cpu { -void bounding_box_transform_qsymm16(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window) +void bounding_box_transform_qsymm16(const ITensor *boxes, + ITensor *pred_boxes, + const ITensor *deltas, + BoundingBoxTransformInfo bbinfo, + const Window &window) { const size_t num_classes = deltas->info()->tensor_shape()[0] >> 2; @@ -41,7 +45,8 @@ void bounding_box_transform_qsymm16(const ITensor *boxes, ITensor *pred_boxes, c const auto scale_before = bbinfo.scale(); const auto offset = (bbinfo.correct_transform_coords() ? 1.f : 0.f); - auto pred_ptr = reinterpret_cast<uint16_t *>(pred_boxes->buffer() + pred_boxes->info()->offset_first_element_in_bytes()); + auto pred_ptr = + reinterpret_cast<uint16_t *>(pred_boxes->buffer() + pred_boxes->info()->offset_first_element_in_bytes()); auto delta_ptr = reinterpret_cast<uint8_t *>(deltas->buffer() + deltas->info()->offset_first_element_in_bytes()); const auto boxes_qinfo = boxes->info()->quantization_info().uniform(); @@ -49,41 +54,49 @@ void bounding_box_transform_qsymm16(const ITensor *boxes, ITensor *pred_boxes, c const auto pred_qinfo = pred_boxes->info()->quantization_info().uniform(); Iterator box_it(boxes, window); - execute_window_loop(window, [&](const Coordinates & id) - { - const auto ptr = reinterpret_cast<uint16_t *>(box_it.ptr()); - const auto b0 = dequantize_qasymm16(*ptr, boxes_qinfo); - const auto b1 = dequantize_qasymm16(*(ptr + 1), boxes_qinfo); - const auto b2 = dequantize_qasymm16(*(ptr + 2), boxes_qinfo); - const auto b3 = dequantize_qasymm16(*(ptr + 3), boxes_qinfo); - const float width = (b2 / scale_before) - (b0 / scale_before) + 1.f; - const float height = (b3 / scale_before) - (b1 / scale_before) + 1.f; - const float ctr_x = (b0 / scale_before) + 0.5f * width; - const float ctr_y = (b1 / scale_before) + 0.5f * height; - for(size_t j = 0; j < num_classes; ++j) + execute_window_loop( + window, + [&](const Coordinates &id) { - // Extract deltas - const size_t delta_id = id.y() * deltas_width + 4u * j; - const float dx = dequantize_qasymm8(delta_ptr[delta_id], deltas_qinfo) / bbinfo.weights()[0]; - const float dy = dequantize_qasymm8(delta_ptr[delta_id + 1], deltas_qinfo) / bbinfo.weights()[1]; - float dw = dequantize_qasymm8(delta_ptr[delta_id + 2], deltas_qinfo) / bbinfo.weights()[2]; - float dh = dequantize_qasymm8(delta_ptr[delta_id + 3], deltas_qinfo) / bbinfo.weights()[3]; - // Clip dw and dh - dw = std::min(dw, bbinfo.bbox_xform_clip()); - dh = std::min(dh, bbinfo.bbox_xform_clip()); - // Determine the predictions - const float pred_ctr_x = dx * width + ctr_x; - const float pred_ctr_y = dy * height + ctr_y; - const float pred_w = std::exp(dw) * width; - const float pred_h = std::exp(dh) * height; - // Store the prediction into the output tensor - pred_ptr[delta_id] = quantize_qasymm16(scale_after * utility::clamp<float>(pred_ctr_x - 0.5f * pred_w, 0.f, img_w - 1.f), pred_qinfo); - pred_ptr[delta_id + 1] = quantize_qasymm16(scale_after * utility::clamp<float>(pred_ctr_y - 0.5f * pred_h, 0.f, img_h - 1.f), pred_qinfo); - 
pred_ptr[delta_id + 2] = quantize_qasymm16(scale_after * utility::clamp<float>(pred_ctr_x + 0.5f * pred_w - offset, 0.f, img_w - 1.f), pred_qinfo); - pred_ptr[delta_id + 3] = quantize_qasymm16(scale_after * utility::clamp<float>(pred_ctr_y + 0.5f * pred_h - offset, 0.f, img_h - 1.f), pred_qinfo); - } - }, - box_it); + const auto ptr = reinterpret_cast<uint16_t *>(box_it.ptr()); + const auto b0 = dequantize_qasymm16(*ptr, boxes_qinfo); + const auto b1 = dequantize_qasymm16(*(ptr + 1), boxes_qinfo); + const auto b2 = dequantize_qasymm16(*(ptr + 2), boxes_qinfo); + const auto b3 = dequantize_qasymm16(*(ptr + 3), boxes_qinfo); + const float width = (b2 / scale_before) - (b0 / scale_before) + 1.f; + const float height = (b3 / scale_before) - (b1 / scale_before) + 1.f; + const float ctr_x = (b0 / scale_before) + 0.5f * width; + const float ctr_y = (b1 / scale_before) + 0.5f * height; + for (size_t j = 0; j < num_classes; ++j) + { + // Extract deltas + const size_t delta_id = id.y() * deltas_width + 4u * j; + const float dx = dequantize_qasymm8(delta_ptr[delta_id], deltas_qinfo) / bbinfo.weights()[0]; + const float dy = dequantize_qasymm8(delta_ptr[delta_id + 1], deltas_qinfo) / bbinfo.weights()[1]; + float dw = dequantize_qasymm8(delta_ptr[delta_id + 2], deltas_qinfo) / bbinfo.weights()[2]; + float dh = dequantize_qasymm8(delta_ptr[delta_id + 3], deltas_qinfo) / bbinfo.weights()[3]; + // Clip dw and dh + dw = std::min(dw, bbinfo.bbox_xform_clip()); + dh = std::min(dh, bbinfo.bbox_xform_clip()); + // Determine the predictions + const float pred_ctr_x = dx * width + ctr_x; + const float pred_ctr_y = dy * height + ctr_y; + const float pred_w = std::exp(dw) * width; + const float pred_h = std::exp(dh) * height; + // Store the prediction into the output tensor + pred_ptr[delta_id] = quantize_qasymm16( + scale_after * utility::clamp<float>(pred_ctr_x - 0.5f * pred_w, 0.f, img_w - 1.f), pred_qinfo); + pred_ptr[delta_id + 1] = quantize_qasymm16( + scale_after * utility::clamp<float>(pred_ctr_y - 0.5f * pred_h, 0.f, img_h - 1.f), pred_qinfo); + pred_ptr[delta_id + 2] = quantize_qasymm16( + scale_after * utility::clamp<float>(pred_ctr_x + 0.5f * pred_w - offset, 0.f, img_w - 1.f), + pred_qinfo); + pred_ptr[delta_id + 3] = quantize_qasymm16( + scale_after * utility::clamp<float>(pred_ctr_y + 0.5f * pred_h - offset, 0.f, img_h - 1.f), + pred_qinfo); + } + }, + box_it); } } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/boundingboxtransform/generic/neon/impl.h b/src/cpu/kernels/boundingboxtransform/generic/neon/impl.h index 7f990396df..d8013c6227 100644 --- a/src/cpu/kernels/boundingboxtransform/generic/neon/impl.h +++ b/src/cpu/kernels/boundingboxtransform/generic/neon/impl.h @@ -30,7 +30,11 @@ namespace arm_compute namespace cpu { template <typename T> -void bounding_box_transform(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window) +void bounding_box_transform(const ITensor *boxes, + ITensor *pred_boxes, + const ITensor *deltas, + BoundingBoxTransformInfo bbinfo, + const Window &window) { const size_t num_classes = deltas->info()->tensor_shape()[0] >> 2; const size_t deltas_width = deltas->info()->tensor_shape()[0]; @@ -46,44 +50,53 @@ void bounding_box_transform(const ITensor *boxes, ITensor *pred_boxes, const ITe auto delta_ptr = reinterpret_cast<T *>(deltas->buffer() + deltas->info()->offset_first_element_in_bytes()); Iterator box_it(boxes, window); - execute_window_loop(window, [&](const Coordinates & id) - { 
- const auto ptr = reinterpret_cast<T *>(box_it.ptr()); - const auto b0 = *ptr; - const auto b1 = *(ptr + 1); - const auto b2 = *(ptr + 2); - const auto b3 = *(ptr + 3); - const T width = (b2 / scale_before) - (b0 / scale_before) + T(1.f); - const T height = (b3 / scale_before) - (b1 / scale_before) + T(1.f); - const T ctr_x = (b0 / scale_before) + T(0.5f) * width; - const T ctr_y = (b1 / scale_before) + T(0.5f) * height; - for(size_t j = 0; j < num_classes; ++j) + execute_window_loop( + window, + [&](const Coordinates &id) { - // Extract deltas - const size_t delta_id = id.y() * deltas_width + 4u * j; - const T dx = delta_ptr[delta_id] / T(bbinfo.weights()[0]); - const T dy = delta_ptr[delta_id + 1] / T(bbinfo.weights()[1]); - T dw = delta_ptr[delta_id + 2] / T(bbinfo.weights()[2]); - T dh = delta_ptr[delta_id + 3] / T(bbinfo.weights()[3]); - // Clip dw and dh - dw = std::min(dw, T(bbinfo.bbox_xform_clip())); - dh = std::min(dh, T(bbinfo.bbox_xform_clip())); - // Determine the predictions - const T pred_ctr_x = dx * width + ctr_x; - const T pred_ctr_y = dy * height + ctr_y; - const T pred_w = std::exp(dw) * width; - const T pred_h = std::exp(dh) * height; - // Store the prediction into the output tensor - pred_ptr[delta_id] = scale_after * utility::clamp<T>(pred_ctr_x - T(0.5f) * pred_w, T(0), T(img_w - 1)); - pred_ptr[delta_id + 1] = scale_after * utility::clamp<T>(pred_ctr_y - T(0.5f) * pred_h, T(0), T(img_h - 1)); - pred_ptr[delta_id + 2] = scale_after * utility::clamp<T>(pred_ctr_x + T(0.5f) * pred_w - offset, T(0), T(img_w - 1)); - pred_ptr[delta_id + 3] = scale_after * utility::clamp<T>(pred_ctr_y + T(0.5f) * pred_h - offset, T(0), T(img_h - 1)); - } - }, - box_it); + const auto ptr = reinterpret_cast<T *>(box_it.ptr()); + const auto b0 = *ptr; + const auto b1 = *(ptr + 1); + const auto b2 = *(ptr + 2); + const auto b3 = *(ptr + 3); + const T width = (b2 / scale_before) - (b0 / scale_before) + T(1.f); + const T height = (b3 / scale_before) - (b1 / scale_before) + T(1.f); + const T ctr_x = (b0 / scale_before) + T(0.5f) * width; + const T ctr_y = (b1 / scale_before) + T(0.5f) * height; + for (size_t j = 0; j < num_classes; ++j) + { + // Extract deltas + const size_t delta_id = id.y() * deltas_width + 4u * j; + const T dx = delta_ptr[delta_id] / T(bbinfo.weights()[0]); + const T dy = delta_ptr[delta_id + 1] / T(bbinfo.weights()[1]); + T dw = delta_ptr[delta_id + 2] / T(bbinfo.weights()[2]); + T dh = delta_ptr[delta_id + 3] / T(bbinfo.weights()[3]); + // Clip dw and dh + dw = std::min(dw, T(bbinfo.bbox_xform_clip())); + dh = std::min(dh, T(bbinfo.bbox_xform_clip())); + // Determine the predictions + const T pred_ctr_x = dx * width + ctr_x; + const T pred_ctr_y = dy * height + ctr_y; + const T pred_w = std::exp(dw) * width; + const T pred_h = std::exp(dh) * height; + // Store the prediction into the output tensor + pred_ptr[delta_id] = scale_after * utility::clamp<T>(pred_ctr_x - T(0.5f) * pred_w, T(0), T(img_w - 1)); + pred_ptr[delta_id + 1] = + scale_after * utility::clamp<T>(pred_ctr_y - T(0.5f) * pred_h, T(0), T(img_h - 1)); + pred_ptr[delta_id + 2] = + scale_after * utility::clamp<T>(pred_ctr_x + T(0.5f) * pred_w - offset, T(0), T(img_w - 1)); + pred_ptr[delta_id + 3] = + scale_after * utility::clamp<T>(pred_ctr_y + T(0.5f) * pred_h - offset, T(0), T(img_h - 1)); + } + }, + box_it); } -void bounding_box_transform_qsymm16(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window); +void bounding_box_transform_qsymm16(const 
ITensor *boxes, + ITensor *pred_boxes, + const ITensor *deltas, + BoundingBoxTransformInfo bbinfo, + const Window &window); } // namespace cpu } // namespace arm_compute #endif //define SRC_CORE_SVE_KERNELS_BOUNDINGBOXTRANFORM_IMPL_H diff --git a/src/cpu/kernels/boundingboxtransform/generic/neon/qsymm16.cpp b/src/cpu/kernels/boundingboxtransform/generic/neon/qsymm16.cpp index b27c187df3..64ef815195 100644 --- a/src/cpu/kernels/boundingboxtransform/generic/neon/qsymm16.cpp +++ b/src/cpu/kernels/boundingboxtransform/generic/neon/qsymm16.cpp @@ -26,7 +26,11 @@ namespace arm_compute { namespace cpu { -void neon_qu16_boundingboxtransform(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window) +void neon_qu16_boundingboxtransform(const ITensor *boxes, + ITensor *pred_boxes, + const ITensor *deltas, + BoundingBoxTransformInfo bbinfo, + const Window &window) { return bounding_box_transform_qsymm16(boxes, pred_boxes, deltas, bbinfo, window); } diff --git a/src/cpu/kernels/boundingboxtransform/list.h b/src/cpu/kernels/boundingboxtransform/list.h index 8f06acc8a6..4da725a257 100644 --- a/src/cpu/kernels/boundingboxtransform/list.h +++ b/src/cpu/kernels/boundingboxtransform/list.h @@ -27,8 +27,9 @@ namespace arm_compute { namespace cpu { -#define DECLARE_BOUNDINGBOXTRANFORM_KERNEL(func_name) \ - void func_name(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window) +#define DECLARE_BOUNDINGBOXTRANFORM_KERNEL(func_name) \ + void func_name(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, \ + const Window &window) DECLARE_BOUNDINGBOXTRANFORM_KERNEL(neon_fp32_boundingboxtransform); DECLARE_BOUNDINGBOXTRANFORM_KERNEL(neon_fp16_boundingboxtransform); DECLARE_BOUNDINGBOXTRANFORM_KERNEL(neon_qu16_boundingboxtransform); diff --git a/src/cpu/kernels/cast/generic/neon/fp16.cpp b/src/cpu/kernels/cast/generic/neon/fp16.cpp index 6cd0c8500b..2897f4b242 100644 --- a/src/cpu/kernels/cast/generic/neon/fp16.cpp +++ b/src/cpu/kernels/cast/generic/neon/fp16.cpp @@ -25,8 +25,9 @@ #include "arm_compute/core/CPP/CPPTypes.h" #include "arm_compute/core/TensorInfo.h" -#include "src/cpu/kernels/CpuCastKernel.h" + #include "src/cpu/kernels/cast/list.h" +#include "src/cpu/kernels/CpuCastKernel.h" #include "support/SaturateCast.h" #include "arm_neon.h" @@ -35,7 +36,8 @@ namespace arm_compute { namespace cpu { -void neon_qasymm8_signed_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window) +void neon_qasymm8_signed_to_fp16_cast( + const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_UNUSED(_policy); @@ -49,42 +51,39 @@ void neon_qasymm8_signed_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); - Window win{ window }; + Window win{window}; win.set(Window::DimX, Window::Dimension(0, 1, 1)); Iterator src(_src, win); Iterator dst(_dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr()); - int x = window_start_x; - - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const int8x16_t texels_s8 = vld1q_s8(src_ptr + x); + const auto src_ptr = 
reinterpret_cast<const int8_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr()); + int x = window_start_x; - const int16x8x2_t texels = + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vmovl_s8(vget_low_s8(texels_s8)), - vmovl_s8(vget_high_s8(texels_s8)) - } - }; - vst1q_f16(dst_ptr + x, vcvtq_f16_s16(texels.val[0])); - vst1q_f16(dst_ptr + x + 8, vcvtq_f16_s16(texels.val[1])); - } + const int8x16_t texels_s8 = vld1q_s8(src_ptr + x); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x)); - } - }, - src, dst); + const int16x8x2_t texels = {{vmovl_s8(vget_low_s8(texels_s8)), vmovl_s8(vget_high_s8(texels_s8))}}; + vst1q_f16(dst_ptr + x, vcvtq_f16_s16(texels.val[0])); + vst1q_f16(dst_ptr + x + 8, vcvtq_f16_s16(texels.val[1])); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x)); + } + }, + src, dst); } -void neon_s32_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window) +void neon_s32_to_fp16_cast( + const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_UNUSED(_policy); @@ -98,44 +97,41 @@ void neon_s32_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); - Window win{ window }; + Window win{window}; win.set(Window::DimX, Window::Dimension(0, 1, 1)); Iterator src(_src, win); Iterator dst(_dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const float32x4x4_t texels = + const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vcvtq_f32_s32(vld1q_s32(src_ptr + x)), - vcvtq_f32_s32(vld1q_s32(src_ptr + x + 4)), - vcvtq_f32_s32(vld1q_s32(src_ptr + x + 8)), - vcvtq_f32_s32(vld1q_s32(src_ptr + x + 12)) - } - }; - - vst1q_f16(dst_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1]))); - vst1q_f16(dst_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3]))); - } + const float32x4x4_t texels = { + {vcvtq_f32_s32(vld1q_s32(src_ptr + x)), vcvtq_f32_s32(vld1q_s32(src_ptr + x + 4)), + vcvtq_f32_s32(vld1q_s32(src_ptr + x + 8)), vcvtq_f32_s32(vld1q_s32(src_ptr + x + 12))}}; - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x)); - } - }, - src, dst); + vst1q_f16(dst_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1]))); + vst1q_f16(dst_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3]))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x)); + } + }, + src, dst); } -void neon_fp32_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window) +void neon_fp32_to_fp16_cast( + const ITensor *_src, ITensor *_dst, const 
ThreadInfo &info, ConvertPolicy _policy, const Window &window) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_UNUSED(_policy); @@ -149,44 +145,40 @@ void neon_fp32_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); - Window win{ window }; + Window win{window}; win.set(Window::DimX, Window::Dimension(0, 1, 1)); Iterator src(_src, win); Iterator dst(_dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const float *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const float32x4x4_t texels = + const auto src_ptr = reinterpret_cast<const float *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_f32(src_ptr + x), - vld1q_f32(src_ptr + x + 4), - vld1q_f32(src_ptr + x + 8), - vld1q_f32(src_ptr + x + 12) - } - }; - - vst1q_f16(dst_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1]))); - vst1q_f16(dst_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3]))); - } + const float32x4x4_t texels = {{vld1q_f32(src_ptr + x), vld1q_f32(src_ptr + x + 4), + vld1q_f32(src_ptr + x + 8), vld1q_f32(src_ptr + x + 12)}}; - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x)); - } - }, - src, dst); + vst1q_f16(dst_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1]))); + vst1q_f16(dst_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3]))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x)); + } + }, + src, dst); } -void neon_fp16_to_other_dt_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window) +void neon_fp16_to_other_dt_cast( + const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_UNUSED(_policy); @@ -200,142 +192,133 @@ void neon_fp16_to_other_dt_cast(const ITensor *_src, ITensor *_dst, const Thread ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); - Window win{ window }; + Window win{window}; win.set(Window::DimX, Window::Dimension(0, 1, 1)); Iterator src(_src, win); Iterator dst(_dst, win); - switch(_dst->info()->data_type()) + switch (_dst->info()->data_type()) { case DataType::QASYMM8_SIGNED: { /* Down-conversion F16 -> QASYMM8_SIGNED (Always saturating) */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const float16x8x2_t texels = + const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { + const float16x8x2_t texels = {{ vld1q_f16(src_ptr + x), vld1q_f16(src_ptr + x + 8), - } - }; + }}; - vst1q_s8(dst_ptr + x, 
vcombine_s8(vqmovn_s16(vcvtq_s16_f16(texels.val[0])), vqmovn_s16(vcvtq_s16_f16(texels.val[1])))); - } + vst1q_s8(dst_ptr + x, vcombine_s8(vqmovn_s16(vcvtq_s16_f16(texels.val[0])), + vqmovn_s16(vcvtq_s16_f16(texels.val[1])))); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x)); - } - }, - src, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x)); + } + }, + src, dst); break; } case DataType::QASYMM8: case DataType::U8: { /* Down-conversion F16 -> QASYMM8/U8 (Always saturating) */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const float16x8x2_t texels = + const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { + const float16x8x2_t texels = {{ vld1q_f16(src_ptr + x), vld1q_f16(src_ptr + x + 8), - } - }; + }}; - vst1q_u8(dst_ptr + x, vcombine_u8(vqmovun_s16(vcvtq_s16_f16(texels.val[0])), vqmovun_s16(vcvtq_s16_f16(texels.val[1])))); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x)); - } + vst1q_u8(dst_ptr + x, vcombine_u8(vqmovun_s16(vcvtq_s16_f16(texels.val[0])), + vqmovun_s16(vcvtq_s16_f16(texels.val[1])))); + } - }, - src, dst); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x)); + } + }, + src, dst); break; } case DataType::F32: { /* Up-conversion F16 -> F32 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<float *>(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const float16x8x2_t texels = + const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<float *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_f16(src_ptr + x), - vld1q_f16(src_ptr + x + 8) - } - }; - vst1q_f32(dst_ptr + x, vcvt_f32_f16(vget_low_f16(texels.val[0]))); - vst1q_f32(dst_ptr + x + 4, vcvt_f32_f16(vget_high_f16(texels.val[0]))); - vst1q_f32(dst_ptr + x + 8, vcvt_f32_f16(vget_low_f16(texels.val[1]))); - vst1q_f32(dst_ptr + x + 12, vcvt_f32_f16(vget_high_f16(texels.val[1]))); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast<float>(*(src_ptr + x)); - } - }, - src, dst); + const float16x8x2_t texels = {{vld1q_f16(src_ptr + x), vld1q_f16(src_ptr + x + 8)}}; + vst1q_f32(dst_ptr + x, vcvt_f32_f16(vget_low_f16(texels.val[0]))); + vst1q_f32(dst_ptr + x + 4, vcvt_f32_f16(vget_high_f16(texels.val[0]))); + vst1q_f32(dst_ptr + x + 8, vcvt_f32_f16(vget_low_f16(texels.val[1]))); + vst1q_f32(dst_ptr + x + 12, vcvt_f32_f16(vget_high_f16(texels.val[1]))); + } + + // Compute left-over elements + for (; x < 
window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<float>(*(src_ptr + x)); + } + }, + src, dst); break; } case DataType::S32: { /* Up-conversion F16 -> S32 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const float16x8x2_t texels = + const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr()); + + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vld1q_f16(src_ptr + x), - vld1q_f16(src_ptr + x + 8) - } - }; - - vst1q_s32(dst_ptr + x, vcvtq_s32_f32(vcvt_f32_f16(vget_low_f16(texels.val[0])))); - vst1q_s32(dst_ptr + x + 4, vcvtq_s32_f32(vcvt_f32_f16(vget_high_f16(texels.val[0])))); - vst1q_s32(dst_ptr + x + 8, vcvtq_s32_f32(vcvt_f32_f16(vget_low_f16(texels.val[1])))); - vst1q_s32(dst_ptr + x + 12, vcvtq_s32_f32(vcvt_f32_f16(vget_high_f16(texels.val[1])))); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x)); - } - }, - src, dst); + const float16x8x2_t texels = {{vld1q_f16(src_ptr + x), vld1q_f16(src_ptr + x + 8)}}; + + vst1q_s32(dst_ptr + x, vcvtq_s32_f32(vcvt_f32_f16(vget_low_f16(texels.val[0])))); + vst1q_s32(dst_ptr + x + 4, vcvtq_s32_f32(vcvt_f32_f16(vget_high_f16(texels.val[0])))); + vst1q_s32(dst_ptr + x + 8, vcvtq_s32_f32(vcvt_f32_f16(vget_low_f16(texels.val[1])))); + vst1q_s32(dst_ptr + x + 12, vcvtq_s32_f32(vcvt_f32_f16(vget_high_f16(texels.val[1])))); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x)); + } + }, + src, dst); break; } default: @@ -343,7 +326,8 @@ void neon_fp16_to_other_dt_cast(const ITensor *_src, ITensor *_dst, const Thread } } -void neon_u8_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window) +void neon_u8_to_fp16_cast( + const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_UNUSED(_policy); @@ -357,40 +341,37 @@ void neon_u8_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo & ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); - Window win{ window }; + Window win{window}; win.set(Window::DimX, Window::Dimension(0, 1, 1)); Iterator src(_src, win); Iterator dst(_dst, win); /* Up-conversion U8 -> F16 */ - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr()); - const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr()); - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x); + const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr()); - const int16x8x2_t texels = + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { - vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), - vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))) - } - }; - vst1q_f16(dst_ptr + x, vcvtq_f16_s16(texels.val[0])); - 
vst1q_f16(dst_ptr + x + 8, vcvtq_f16_s16(texels.val[1])); - } + const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x)); - } - }, - src, dst); + const int16x8x2_t texels = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))}}; + vst1q_f16(dst_ptr + x, vcvtq_f16_s16(texels.val[0])); + vst1q_f16(dst_ptr + x + 8, vcvtq_f16_s16(texels.val[1])); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x)); + } + }, + src, dst); return; } diff --git a/src/cpu/kernels/cast/list.h b/src/cpu/kernels/cast/list.h index ffd82d5bf3..5e634fc170 100644 --- a/src/cpu/kernels/cast/list.h +++ b/src/cpu/kernels/cast/list.h @@ -27,8 +27,9 @@ namespace arm_compute { namespace cpu { -#define DECLARE_CAST_KERNEL(func_name) \ - void func_name(const ITensor *_src, ITensor *_dst, const ThreadInfo &tensor, ConvertPolicy _policy, const Window &window) +#define DECLARE_CAST_KERNEL(func_name) \ + void func_name(const ITensor *_src, ITensor *_dst, const ThreadInfo &tensor, ConvertPolicy _policy, \ + const Window &window) DECLARE_CAST_KERNEL(neon_fp32_to_fp16_cast); DECLARE_CAST_KERNEL(neon_u8_to_fp16_cast); @@ -41,4 +42,4 @@ DECLARE_CAST_KERNEL(neon_bfloat16_to_fp32_cast); #undef DECLARE_CAST_KERNEL } // namespace cpu } // namespace arm_compute -#endif //SRC_CORE_NEON_KERNELS_CAST_LIST_H
\ No newline at end of file +#endif //SRC_CORE_NEON_KERNELS_CAST_LIST_H diff --git a/src/cpu/kernels/conv3d/neon/list.h b/src/cpu/kernels/conv3d/neon/list.h index 3bfa124dc3..082c60be29 100644 --- a/src/cpu/kernels/conv3d/neon/list.h +++ b/src/cpu/kernels/conv3d/neon/list.h @@ -27,8 +27,9 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/Traits.h" #include "arm_compute/runtime/FunctionDescriptors.h" -#include "src/core/NEON/wrapper/wrapper.h" + #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/wrapper.h" #include "src/cpu/kernels/conv3d/neon/quantized.h" namespace arm_compute @@ -36,7 +37,12 @@ namespace arm_compute namespace cpu { template <typename T> -void directconv3d_float_neon_ndhwc(const ITensor *src0, const ITensor *src1, const ITensor *src2, ITensor *dst, const Conv3dInfo &conv_info, const Window &window) +void directconv3d_float_neon_ndhwc(const ITensor *src0, + const ITensor *src1, + const ITensor *src2, + ITensor *dst, + const Conv3dInfo &conv_info, + const Window &window) { const ITensor *src = src0; const ITensor *weights = src1; @@ -88,91 +94,104 @@ void directconv3d_float_neon_ndhwc(const ITensor *src0, const ITensor *src1, con Iterator wei(weights, window_w); const T *biases_ptr = nullptr; - if(biases != nullptr) + if (biases != nullptr) { biases_ptr = reinterpret_cast<T *>(biases->buffer() + biases->info()->offset_first_element_in_bytes()); } - execute_window_loop(window_out, [&](const Coordinates & id) - { - // We are computing the theoretical input starting points - const int in_w_start_t = static_cast<int>(id.y()) * conv_stride_w - conv_pad_left; - const int in_h_start_t = static_cast<int>(id.z()) * conv_stride_h - conv_pad_top; - const int in_d_start_t = static_cast<int>(id[3]) * conv_stride_d - conv_pad_front; - const int in_w_end_t = in_w_start_t + kernel_dim_w; - const int in_h_end_t = in_h_start_t + kernel_dim_h; - const int in_d_end_t = in_d_start_t + kernel_dim_d; - - // We are computing the valid initial and ending input points by checking the borders - const int in_w_start = std::max(in_w_start_t, 0); - const int in_h_start = std::max(in_h_start_t, 0); - const int in_d_start = std::max(in_d_start_t, 0); - const int in_w_end = std::min(in_w_end_t, input_dim_w); - const int in_h_end = std::min(in_h_end_t, input_dim_h); - const int in_d_end = std::min(in_d_end_t, input_dim_d); - - // We use the input points to select the valid weight points to use - const int wei_w_start = in_w_start - in_w_start_t; - const int wei_h_start = in_h_start - in_h_start_t; - const int wei_d_start = in_d_start - in_d_start_t; - const int wei_w_end = kernel_dim_w - (in_w_end_t - in_w_end); - const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end); - const int wei_d_end = kernel_dim_d - (in_d_end_t - in_d_end); - - const int index_c_out_end = weights->info()->dimension(0); - const int index_c_in_end = weights->info()->dimension(1); - const T *const in_ptr_start = reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) + id[4] * input_stride_n; - - execute_window_loop(window_w, [&](const Coordinates & id_w) + execute_window_loop( + window_out, + [&](const Coordinates &id) { - /* + // We are computing the theoretical input starting points + const int in_w_start_t = static_cast<int>(id.y()) * conv_stride_w - conv_pad_left; + const int in_h_start_t = static_cast<int>(id.z()) * conv_stride_h - conv_pad_top; + const int in_d_start_t = static_cast<int>(id[3]) * conv_stride_d - conv_pad_front; + const 
int in_w_end_t = in_w_start_t + kernel_dim_w; + const int in_h_end_t = in_h_start_t + kernel_dim_h; + const int in_d_end_t = in_d_start_t + kernel_dim_d; + + // We are computing the valid initial and ending input points by checking the borders + const int in_w_start = std::max(in_w_start_t, 0); + const int in_h_start = std::max(in_h_start_t, 0); + const int in_d_start = std::max(in_d_start_t, 0); + const int in_w_end = std::min(in_w_end_t, input_dim_w); + const int in_h_end = std::min(in_h_end_t, input_dim_h); + const int in_d_end = std::min(in_d_end_t, input_dim_d); + + // We use the input points to select the valid weight points to use + const int wei_w_start = in_w_start - in_w_start_t; + const int wei_h_start = in_h_start - in_h_start_t; + const int wei_d_start = in_d_start - in_d_start_t; + const int wei_w_end = kernel_dim_w - (in_w_end_t - in_w_end); + const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end); + const int wei_d_end = kernel_dim_d - (in_d_end_t - in_d_end); + + const int index_c_out_end = weights->info()->dimension(0); + const int index_c_in_end = weights->info()->dimension(1); + const T *const in_ptr_start = + reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) + + id[4] * input_stride_n; + + execute_window_loop( + window_w, + [&](const Coordinates &id_w) + { + /* * This is the loop in the weights, and it goes along OFM (output feature map) */ - const auto weights_ptr_start = reinterpret_cast<const T *>(wei.ptr()); - T out_temp = static_cast<T>(0); - T *out_ptr = reinterpret_cast<T *>(out.ptr()); - for(int index_wei_d = wei_d_start, index_in_d = in_d_start; index_wei_d < wei_d_end; ++index_wei_d, ++index_in_d) - { - const auto in_ptr_d = in_ptr_start + index_in_d * input_stride_d; - const auto weights_ptr_d = weights_ptr_start + index_wei_d * kernel_stride_d; - for(int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; ++index_wei_h, ++index_in_h) - { - const T *const in_ptr_row = in_ptr_d + index_in_h * input_stride_h; - const T *const weights_ptr_row = weights_ptr_d + index_wei_h * kernel_stride_h; - for(int index_wei_w = wei_w_start, index_in_w = in_w_start; index_wei_w < wei_w_end; ++index_wei_w, ++index_in_w) + const auto weights_ptr_start = reinterpret_cast<const T *>(wei.ptr()); + T out_temp = static_cast<T>(0); + T *out_ptr = reinterpret_cast<T *>(out.ptr()); + for (int index_wei_d = wei_d_start, index_in_d = in_d_start; index_wei_d < wei_d_end; + ++index_wei_d, ++index_in_d) { - const T *in_ptr_mover = in_ptr_row + index_in_w * input_stride_w; - const T *weights_ptr_mover = weights_ptr_row + index_wei_w * kernel_stride_w; - int index_c_in = 0; - vector_type out_temp_vec = wrapper::vdup_n(static_cast<T>(0), tag_type()); - vector_type w_vec = wrapper::vdup_n(static_cast<T>(0), tag_type()); - for(; index_c_in <= index_c_in_end - num_elems_read_per_iteration; - index_c_in += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration) + const auto in_ptr_d = in_ptr_start + index_in_d * input_stride_d; + const auto weights_ptr_d = weights_ptr_start + index_wei_d * kernel_stride_d; + for (int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; + ++index_wei_h, ++index_in_h) { - const auto src_vec = wrapper::vloadq(in_ptr_mover); - //Load Cin weights - for(int k = 0; k < num_elems_read_per_iteration; ++k, weights_ptr_mover += index_c_out_end) + const T *const in_ptr_row = in_ptr_d + index_in_h * input_stride_h; + const T *const weights_ptr_row = weights_ptr_d 
+ index_wei_h * kernel_stride_h; + for (int index_wei_w = wei_w_start, index_in_w = in_w_start; index_wei_w < wei_w_end; + ++index_wei_w, ++index_in_w) { - w_vec = wrapper::vsetlane(*weights_ptr_mover, w_vec, k); + const T *in_ptr_mover = in_ptr_row + index_in_w * input_stride_w; + const T *weights_ptr_mover = weights_ptr_row + index_wei_w * kernel_stride_w; + int index_c_in = 0; + vector_type out_temp_vec = wrapper::vdup_n(static_cast<T>(0), tag_type()); + vector_type w_vec = wrapper::vdup_n(static_cast<T>(0), tag_type()); + for (; index_c_in <= index_c_in_end - num_elems_read_per_iteration; + index_c_in += num_elems_read_per_iteration, + in_ptr_mover += num_elems_read_per_iteration) + { + const auto src_vec = wrapper::vloadq(in_ptr_mover); + //Load Cin weights + for (int k = 0; k < num_elems_read_per_iteration; + ++k, weights_ptr_mover += index_c_out_end) + { + w_vec = wrapper::vsetlane(*weights_ptr_mover, w_vec, k); + } + out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec); + } + out_temp += vreduce(out_temp_vec); + for (; index_c_in < index_c_in_end; + ++index_c_in, ++in_ptr_mover, weights_ptr_mover += index_c_out_end) + { + const auto src_val = *(in_ptr_mover); + const auto w_val = *(weights_ptr_mover); + out_temp += src_val * w_val; + } } - out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec); - } - out_temp += vreduce(out_temp_vec); - for(; index_c_in < index_c_in_end; ++index_c_in, ++in_ptr_mover, weights_ptr_mover += index_c_out_end) - { - const auto src_val = *(in_ptr_mover); - const auto w_val = *(weights_ptr_mover); - out_temp += src_val * w_val; } } - } - } - *(reinterpret_cast<T *>(out_ptr + id_w[0])) = (biases_ptr != nullptr) ? out_temp + biases_ptr[id_w[0]] : out_temp; + *(reinterpret_cast<T *>(out_ptr + id_w[0])) = + (biases_ptr != nullptr) ? out_temp + biases_ptr[id_w[0]] : out_temp; + }, + wei); }, - wei); - }, - out); + out); } } // namespace cpu } // namespace arm_compute -#endif // SRC_CORE_NEON_KERNELS_CONV3D_LIST_H
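Editorial note: the heart of directconv3d_float_neon_ndhwc above is the accumulation over the input-channel dimension for each output channel. The standalone sketch below (hypothetical helper name, not part of the patch) reproduces that inner loop: four channels per iteration, weights gathered lane by lane because consecutive Cin weights sit cout_stride elements apart in the NDHWC weight layout, a horizontal reduce, then a scalar tail.

    #include <arm_neon.h>

    // Dot product over Cin with contiguous inputs and Cout-strided weights.
    static float conv3d_dot_over_cin_example(const float *in, const float *wei, int cin, int cout_stride)
    {
        float32x4_t acc_v = vdupq_n_f32(0.f);
        int         c     = 0;
        for (; c <= cin - 4; c += 4, in += 4)
        {
            // Gather four strided weight values into one vector lane by lane.
            float32x4_t w_v = vdupq_n_f32(0.f);
            w_v = vsetq_lane_f32(wei[0 * cout_stride], w_v, 0);
            w_v = vsetq_lane_f32(wei[1 * cout_stride], w_v, 1);
            w_v = vsetq_lane_f32(wei[2 * cout_stride], w_v, 2);
            w_v = vsetq_lane_f32(wei[3 * cout_stride], w_v, 3);
            wei += 4 * cout_stride;
            acc_v = vmlaq_f32(acc_v, w_v, vld1q_f32(in));
        }
    #if defined(__aarch64__)
        float acc = vaddvq_f32(acc_v); // horizontal reduce
    #else  // __aarch64__
        float32x2_t sum2 = vpadd_f32(vget_low_f32(acc_v), vget_high_f32(acc_v));
        sum2             = vpadd_f32(sum2, sum2);
        float acc        = vget_lane_f32(sum2, 0);
    #endif // __aarch64__
        for (; c < cin; ++c, ++in, wei += cout_stride) // scalar tail
        {
            acc += (*in) * (*wei);
        }
        return acc;
    }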
\ No newline at end of file +#endif // SRC_CORE_NEON_KERNELS_CONV3D_LIST_H diff --git a/src/cpu/kernels/conv3d/neon/quantized.h b/src/cpu/kernels/conv3d/neon/quantized.h index a8165b4944..f0fc9b5a71 100644 --- a/src/cpu/kernels/conv3d/neon/quantized.h +++ b/src/cpu/kernels/conv3d/neon/quantized.h @@ -28,16 +28,22 @@ #include "arm_compute/core/utils/misc/Traits.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/FunctionDescriptors.h" + +#include "src/core/helpers/WindowHelpers.h" #include "src/core/NEON/NEAsymm.h" #include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/WindowHelpers.h" namespace arm_compute { namespace cpu { template <typename T> -void directconv3d_quantized_neon_ndhwc(const ITensor *src0, const ITensor *src1, const ITensor *src2, ITensor *dst, const Conv3dInfo &conv_info, const Window &window) +void directconv3d_quantized_neon_ndhwc(const ITensor *src0, + const ITensor *src1, + const ITensor *src2, + ITensor *dst, + const Conv3dInfo &conv_info, + const Window &window) { const ITensor *src = src0; const ITensor *weights = src1; @@ -104,153 +110,166 @@ void directconv3d_quantized_neon_ndhwc(const ITensor *src0, const ITensor *src1, Iterator wei(weights, window_w); const int32_t *biases_ptr = nullptr; - if(biases != nullptr) + if (biases != nullptr) { biases_ptr = reinterpret_cast<int32_t *>(biases->buffer() + biases->info()->offset_first_element_in_bytes()); } - execute_window_loop(window_out, [&](const Coordinates & id) - { - // We are computing the theoretical input starting points - const int in_w_start_t = static_cast<int>(id.y()) * conv_stride_w - conv_pad_left; - const int in_h_start_t = static_cast<int>(id.z()) * conv_stride_h - conv_pad_top; - const int in_d_start_t = static_cast<int>(id[3]) * conv_stride_d - conv_pad_front; - const int in_w_end_t = in_w_start_t + kernel_dim_w; - const int in_h_end_t = in_h_start_t + kernel_dim_h; - const int in_d_end_t = in_d_start_t + kernel_dim_d; + execute_window_loop( + window_out, + [&](const Coordinates &id) + { + // We are computing the theoretical input starting points + const int in_w_start_t = static_cast<int>(id.y()) * conv_stride_w - conv_pad_left; + const int in_h_start_t = static_cast<int>(id.z()) * conv_stride_h - conv_pad_top; + const int in_d_start_t = static_cast<int>(id[3]) * conv_stride_d - conv_pad_front; + const int in_w_end_t = in_w_start_t + kernel_dim_w; + const int in_h_end_t = in_h_start_t + kernel_dim_h; + const int in_d_end_t = in_d_start_t + kernel_dim_d; - // We are computing the valid initial and ending input points by checking the borders - const int in_w_start = std::max(in_w_start_t, 0); - const int in_h_start = std::max(in_h_start_t, 0); - const int in_d_start = std::max(in_d_start_t, 0); - const int in_w_end = std::min(in_w_end_t, input_dim_w); - const int in_h_end = std::min(in_h_end_t, input_dim_h); - const int in_d_end = std::min(in_d_end_t, input_dim_d); + // We are computing the valid initial and ending input points by checking the borders + const int in_w_start = std::max(in_w_start_t, 0); + const int in_h_start = std::max(in_h_start_t, 0); + const int in_d_start = std::max(in_d_start_t, 0); + const int in_w_end = std::min(in_w_end_t, input_dim_w); + const int in_h_end = std::min(in_h_end_t, input_dim_h); + const int in_d_end = std::min(in_d_end_t, input_dim_d); - // We use the input points to select the valid weight points to use - const int wei_w_start = in_w_start - in_w_start_t; - const int wei_h_start = in_h_start - 
in_h_start_t; - const int wei_d_start = in_d_start - in_d_start_t; - const int wei_w_end = kernel_dim_w - (in_w_end_t - in_w_end); - const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end); - const int wei_d_end = kernel_dim_d - (in_d_end_t - in_d_end); + // We use the input points to select the valid weight points to use + const int wei_w_start = in_w_start - in_w_start_t; + const int wei_h_start = in_h_start - in_h_start_t; + const int wei_d_start = in_d_start - in_d_start_t; + const int wei_w_end = kernel_dim_w - (in_w_end_t - in_w_end); + const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end); + const int wei_d_end = kernel_dim_d - (in_d_end_t - in_d_end); - const int index_c_out_end = weights->info()->dimension(0); - const int index_c_in_end = weights->info()->dimension(1); - const T *const in_ptr_start = reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) + id[4] * input_stride_n; + const int index_c_out_end = weights->info()->dimension(0); + const int index_c_in_end = weights->info()->dimension(1); + const T *const in_ptr_start = + reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) + + id[4] * input_stride_n; - execute_window_loop(window_w, [&](const Coordinates & id_w) - { - /* + execute_window_loop( + window_w, + [&](const Coordinates &id_w) + { + /* * This is the loop in the weights, and it goes along OFM (output feature map) */ - const auto weights_ptr_start = reinterpret_cast<const T *>(wei.ptr()); - int32_t acc = static_cast<int32_t>(0); - T *out_ptr = reinterpret_cast<T *>(out.ptr()); - for(int index_wei_d = wei_d_start, index_in_d = in_d_start; index_wei_d < wei_d_end; ++index_wei_d, ++index_in_d) - { - const auto in_ptr_d = in_ptr_start + index_in_d * input_stride_d; - const auto weights_ptr_d = weights_ptr_start + index_wei_d * kernel_stride_d; - for(int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; ++index_wei_h, ++index_in_h) - { - const T *const in_ptr_row = in_ptr_d + index_in_h * input_stride_h; - const T *const weights_ptr_row = weights_ptr_d + index_wei_h * kernel_stride_h; - for(int index_wei_w = wei_w_start, index_in_w = in_w_start; index_wei_w < wei_w_end; ++index_wei_w, ++index_in_w) + const auto weights_ptr_start = reinterpret_cast<const T *>(wei.ptr()); + int32_t acc = static_cast<int32_t>(0); + T *out_ptr = reinterpret_cast<T *>(out.ptr()); + for (int index_wei_d = wei_d_start, index_in_d = in_d_start; index_wei_d < wei_d_end; + ++index_wei_d, ++index_in_d) { - const T *in_ptr_mover = in_ptr_row + index_in_w * input_stride_w; - const T *weights_ptr_mover = weights_ptr_row + index_wei_w * kernel_stride_w; - int index_c_in = 0; - vector_type w_vec = wrapper::vdup_n(static_cast<T>(0), tag_type()); - - q32x4_t acc_q32_0 = wrapper::vdup_n(static_cast<q32_t>(0), tag_type()); - q32x4_t acc_q32_1 = wrapper::vdup_n(static_cast<q32_t>(0), tag_type()); - q32x4_t acc_q32_2 = wrapper::vdup_n(static_cast<q32_t>(0), tag_type()); - q32x4_t acc_q32_3 = wrapper::vdup_n(static_cast<q32_t>(0), tag_type()); - - for(; index_c_in <= index_c_in_end - num_elems_read_per_iteration; - index_c_in += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration) + const auto in_ptr_d = in_ptr_start + index_in_d * input_stride_d; + const auto weights_ptr_d = weights_ptr_start + index_wei_d * kernel_stride_d; + for (int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; + ++index_wei_h, ++index_in_h) { - const auto src_vec = 
wrapper::vloadq(in_ptr_mover); - //Load Cin weights - for(int k = 0; k < num_elems_read_per_iteration; ++k, weights_ptr_mover += index_c_out_end) + const T *const in_ptr_row = in_ptr_d + index_in_h * input_stride_h; + const T *const weights_ptr_row = weights_ptr_d + index_wei_h * kernel_stride_h; + for (int index_wei_w = wei_w_start, index_in_w = in_w_start; index_wei_w < wei_w_end; + ++index_wei_w, ++index_in_w) { - w_vec = wrapper::vsetlane(*weights_ptr_mover, w_vec, k); - } - q32x4_t src_q32_0 = wrapper::vdup_n(static_cast<q32_t>(input_offset), tag_type()); - q32x4_t src_q32_1 = wrapper::vdup_n(static_cast<q32_t>(input_offset), tag_type()); - q32x4_t src_q32_2 = wrapper::vdup_n(static_cast<q32_t>(input_offset), tag_type()); - q32x4_t src_q32_3 = wrapper::vdup_n(static_cast<q32_t>(input_offset), tag_type()); + const T *in_ptr_mover = in_ptr_row + index_in_w * input_stride_w; + const T *weights_ptr_mover = weights_ptr_row + index_wei_w * kernel_stride_w; + int index_c_in = 0; + vector_type w_vec = wrapper::vdup_n(static_cast<T>(0), tag_type()); - q32x4_t wei_q32_0 = wrapper::vdup_n(static_cast<q32_t>(weights_offset), tag_type()); - q32x4_t wei_q32_1 = wrapper::vdup_n(static_cast<q32_t>(weights_offset), tag_type()); - q32x4_t wei_q32_2 = wrapper::vdup_n(static_cast<q32_t>(weights_offset), tag_type()); - q32x4_t wei_q32_3 = wrapper::vdup_n(static_cast<q32_t>(weights_offset), tag_type()); + q32x4_t acc_q32_0 = wrapper::vdup_n(static_cast<q32_t>(0), tag_type()); + q32x4_t acc_q32_1 = wrapper::vdup_n(static_cast<q32_t>(0), tag_type()); + q32x4_t acc_q32_2 = wrapper::vdup_n(static_cast<q32_t>(0), tag_type()); + q32x4_t acc_q32_3 = wrapper::vdup_n(static_cast<q32_t>(0), tag_type()); - const auto src_q16_0 = wrapper::vmovl(wrapper::vgetlow(src_vec)); - const auto src_q16_1 = wrapper::vmovl(wrapper::vgethigh(src_vec)); - const auto wei_q16_0 = wrapper::vmovl(wrapper::vgetlow(w_vec)); - const auto wei_q16_1 = wrapper::vmovl(wrapper::vgethigh(w_vec)); + for (; index_c_in <= index_c_in_end - num_elems_read_per_iteration; + index_c_in += num_elems_read_per_iteration, + in_ptr_mover += num_elems_read_per_iteration) + { + const auto src_vec = wrapper::vloadq(in_ptr_mover); + //Load Cin weights + for (int k = 0; k < num_elems_read_per_iteration; + ++k, weights_ptr_mover += index_c_out_end) + { + w_vec = wrapper::vsetlane(*weights_ptr_mover, w_vec, k); + } + q32x4_t src_q32_0 = wrapper::vdup_n(static_cast<q32_t>(input_offset), tag_type()); + q32x4_t src_q32_1 = wrapper::vdup_n(static_cast<q32_t>(input_offset), tag_type()); + q32x4_t src_q32_2 = wrapper::vdup_n(static_cast<q32_t>(input_offset), tag_type()); + q32x4_t src_q32_3 = wrapper::vdup_n(static_cast<q32_t>(input_offset), tag_type()); - src_q32_0 = wrapper::vadd(src_q32_0, wrapper::vmovl(wrapper::vgetlow(src_q16_0))); - src_q32_1 = wrapper::vadd(src_q32_1, wrapper::vmovl(wrapper::vgethigh(src_q16_0))); - src_q32_2 = wrapper::vadd(src_q32_2, wrapper::vmovl(wrapper::vgetlow(src_q16_1))); - src_q32_3 = wrapper::vadd(src_q32_3, wrapper::vmovl(wrapper::vgethigh(src_q16_1))); + q32x4_t wei_q32_0 = wrapper::vdup_n(static_cast<q32_t>(weights_offset), tag_type()); + q32x4_t wei_q32_1 = wrapper::vdup_n(static_cast<q32_t>(weights_offset), tag_type()); + q32x4_t wei_q32_2 = wrapper::vdup_n(static_cast<q32_t>(weights_offset), tag_type()); + q32x4_t wei_q32_3 = wrapper::vdup_n(static_cast<q32_t>(weights_offset), tag_type()); - wei_q32_0 = wrapper::vadd(wei_q32_0, wrapper::vmovl(wrapper::vgetlow(wei_q16_0))); - wei_q32_1 = wrapper::vadd(wei_q32_1, 
wrapper::vmovl(wrapper::vgethigh(wei_q16_0))); - wei_q32_2 = wrapper::vadd(wei_q32_2, wrapper::vmovl(wrapper::vgetlow(wei_q16_1))); - wei_q32_3 = wrapper::vadd(wei_q32_3, wrapper::vmovl(wrapper::vgethigh(wei_q16_1))); + const auto src_q16_0 = wrapper::vmovl(wrapper::vgetlow(src_vec)); + const auto src_q16_1 = wrapper::vmovl(wrapper::vgethigh(src_vec)); + const auto wei_q16_0 = wrapper::vmovl(wrapper::vgetlow(w_vec)); + const auto wei_q16_1 = wrapper::vmovl(wrapper::vgethigh(w_vec)); - acc_q32_0 = wrapper::vmla(acc_q32_0, wei_q32_0, src_q32_0); - acc_q32_1 = wrapper::vmla(acc_q32_1, wei_q32_1, src_q32_1); - acc_q32_2 = wrapper::vmla(acc_q32_2, wei_q32_2, src_q32_2); - acc_q32_3 = wrapper::vmla(acc_q32_3, wei_q32_3, src_q32_3); - } + src_q32_0 = wrapper::vadd(src_q32_0, wrapper::vmovl(wrapper::vgetlow(src_q16_0))); + src_q32_1 = wrapper::vadd(src_q32_1, wrapper::vmovl(wrapper::vgethigh(src_q16_0))); + src_q32_2 = wrapper::vadd(src_q32_2, wrapper::vmovl(wrapper::vgetlow(src_q16_1))); + src_q32_3 = wrapper::vadd(src_q32_3, wrapper::vmovl(wrapper::vgethigh(src_q16_1))); + + wei_q32_0 = wrapper::vadd(wei_q32_0, wrapper::vmovl(wrapper::vgetlow(wei_q16_0))); + wei_q32_1 = wrapper::vadd(wei_q32_1, wrapper::vmovl(wrapper::vgethigh(wei_q16_0))); + wei_q32_2 = wrapper::vadd(wei_q32_2, wrapper::vmovl(wrapper::vgetlow(wei_q16_1))); + wei_q32_3 = wrapper::vadd(wei_q32_3, wrapper::vmovl(wrapper::vgethigh(wei_q16_1))); + + acc_q32_0 = wrapper::vmla(acc_q32_0, wei_q32_0, src_q32_0); + acc_q32_1 = wrapper::vmla(acc_q32_1, wei_q32_1, src_q32_1); + acc_q32_2 = wrapper::vmla(acc_q32_2, wei_q32_2, src_q32_2); + acc_q32_3 = wrapper::vmla(acc_q32_3, wei_q32_3, src_q32_3); + } #if defined(__aarch64__) - acc += wrapper::vaddv(acc_q32_0); - acc += wrapper::vaddv(acc_q32_1); - acc += wrapper::vaddv(acc_q32_2); - acc += wrapper::vaddv(acc_q32_3); + acc += wrapper::vaddv(acc_q32_0); + acc += wrapper::vaddv(acc_q32_1); + acc += wrapper::vaddv(acc_q32_2); + acc += wrapper::vaddv(acc_q32_3); #else // __aarch64__ - auto temp = wrapper::vpadd(wrapper::vgethigh(acc_q32_0), wrapper::vgetlow(acc_q32_0)); - temp = wrapper::vpadd(temp, temp); - acc += wrapper::vgetlane(temp, 0); + auto temp = wrapper::vpadd(wrapper::vgethigh(acc_q32_0), wrapper::vgetlow(acc_q32_0)); + temp = wrapper::vpadd(temp, temp); + acc += wrapper::vgetlane(temp, 0); - temp = wrapper::vpadd(wrapper::vgethigh(acc_q32_1), wrapper::vgetlow(acc_q32_1)); - temp = wrapper::vpadd(temp, temp); - acc += wrapper::vgetlane(temp, 0); + temp = wrapper::vpadd(wrapper::vgethigh(acc_q32_1), wrapper::vgetlow(acc_q32_1)); + temp = wrapper::vpadd(temp, temp); + acc += wrapper::vgetlane(temp, 0); - temp = wrapper::vpadd(wrapper::vgethigh(acc_q32_2), wrapper::vgetlow(acc_q32_2)); - temp = wrapper::vpadd(temp, temp); - acc += wrapper::vgetlane(temp, 0); + temp = wrapper::vpadd(wrapper::vgethigh(acc_q32_2), wrapper::vgetlow(acc_q32_2)); + temp = wrapper::vpadd(temp, temp); + acc += wrapper::vgetlane(temp, 0); - temp = wrapper::vpadd(wrapper::vgethigh(acc_q32_3), wrapper::vgetlow(acc_q32_3)); - temp = wrapper::vpadd(temp, temp); - acc += wrapper::vgetlane(temp, 0); + temp = wrapper::vpadd(wrapper::vgethigh(acc_q32_3), wrapper::vgetlow(acc_q32_3)); + temp = wrapper::vpadd(temp, temp); + acc += wrapper::vgetlane(temp, 0); #endif // __aarch64__ - for(; index_c_in < index_c_in_end; ++index_c_in, ++in_ptr_mover, weights_ptr_mover += index_c_out_end) - { - const auto src_val = *(in_ptr_mover) + input_offset; - const auto w_val = *(weights_ptr_mover) + weights_offset; - acc += src_val * 
w_val; + for (; index_c_in < index_c_in_end; + ++index_c_in, ++in_ptr_mover, weights_ptr_mover += index_c_out_end) + { + const auto src_val = *(in_ptr_mover) + input_offset; + const auto w_val = *(weights_ptr_mover) + weights_offset; + acc += src_val * w_val; + } + } } } - } - } - if(biases) - { - acc += *reinterpret_cast<const int32_t *>(biases_ptr + id_w[0]); - } + if (biases) + { + acc += *reinterpret_cast<const int32_t *>(biases_ptr + id_w[0]); + } - T out_val = finalize_quantization(acc, output_multiplier, output_shift, output_offset, T(0), T(0), false); - *(reinterpret_cast<T *>(out_ptr + id_w[0])) = out_val; + T out_val = + finalize_quantization(acc, output_multiplier, output_shift, output_offset, T(0), T(0), false); + *(reinterpret_cast<T *>(out_ptr + id_w[0])) = out_val; + }, + wei); }, - wei); - }, - out); + out); } } // namespace cpu } // namespace arm_compute -#endif // SRC_CORE_NEON_KERNELS_CONV3D_QUANTIZED_H
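Editorial note: the quantized conv3d kernel above accumulates in int32 with the input and weight offset terms applied to each raw value, then requantizes the accumulator through finalize_quantization(). The scalar sketch below is illustrative only; the real kernel uses a fixed-point output multiplier and shift rather than a float scale, and its offset values are derived from the tensors' quantization info earlier in the file.

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // One output element of an asymmetric-quantized convolution, scalar form.
    static uint8_t conv3d_q8_example(const uint8_t *src, const uint8_t *wei, int n,
                                     int32_t input_offset, int32_t weights_offset,
                                     float requant_scale, int32_t output_offset,
                                     const int32_t *bias /* may be nullptr */)
    {
        int32_t acc = 0;
        for (int i = 0; i < n; ++i)
        {
            // Matches src_val = *in + input_offset and w_val = *wei + weights_offset above.
            acc += (static_cast<int32_t>(src[i]) + input_offset) *
                   (static_cast<int32_t>(wei[i]) + weights_offset);
        }
        if (bias != nullptr)
        {
            acc += *bias;
        }
        // Stand-in for finalize_quantization(): rescale, add the output offset, clamp to u8.
        const int32_t out = static_cast<int32_t>(std::lround(acc * requant_scale)) + output_offset;
        return static_cast<uint8_t>(std::min(255, std::max(0, out)));
    }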
\ No newline at end of file +#endif // SRC_CORE_NEON_KERNELS_CONV3D_QUANTIZED_H diff --git a/src/cpu/kernels/crop/generic/neon/crop_helper.h b/src/cpu/kernels/crop/generic/neon/crop_helper.h index 1fe8e11e98..8fb7ad2087 100644 --- a/src/cpu/kernels/crop/generic/neon/crop_helper.h +++ b/src/cpu/kernels/crop/generic/neon/crop_helper.h @@ -80,7 +80,7 @@ inline float32x4_t load_as_f32(uint8_t *ptr) { return vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(wrapper::vload(ptr))))); } -} +} // namespace cpu } // namespace arm_compute -#endif //SRC_CORE_NEON_KERNELS_CROP_CROP_HELPER_H
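Editorial note: crop_helper.h's load_as_f32 overloads, one of which is visible in the hunk above, all follow the same pattern: load the narrow source type and widen its first four elements to float32x4_t. Two representative chains, written as a standalone sketch with illustrative function names (the int16_t variant is an assumption about the analogous overload, not a quote of the header):

    #include <arm_neon.h>

    // u8 -> u16 -> u32 -> f32, as in the uint8_t overload shown above.
    static float32x4_t load4_u8_as_f32(const uint8_t *ptr)
    {
        return vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vld1_u8(ptr)))));
    }

    // s16 -> s32 -> f32, one widening step fewer.
    static float32x4_t load4_s16_as_f32(const int16_t *ptr)
    {
        return vcvtq_f32_s32(vmovl_s16(vld1_s16(ptr)));
    }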
\ No newline at end of file +#endif //SRC_CORE_NEON_KERNELS_CROP_CROP_HELPER_H diff --git a/src/cpu/kernels/crop/generic/neon/fp16.cpp b/src/cpu/kernels/crop/generic/neon/fp16.cpp index 218ebba191..3739c9d4e0 100644 --- a/src/cpu/kernels/crop/generic/neon/fp16.cpp +++ b/src/cpu/kernels/crop/generic/neon/fp16.cpp @@ -29,12 +29,19 @@ namespace arm_compute { namespace cpu { -void fp16_in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset, - int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped) +void fp16_in_bounds_crop_window(const ITensor *input, + const ITensor *output, + float *output_ptr, + Coordinates input_offset, + int32_t window_step_x, + int32_t output_width_start, + int32_t output_width_limit, + bool input_has_single_channel, + bool is_width_flipped) { - return in_bounds_crop_window<float16_t>(input, output, output_ptr, input_offset, - window_step_x, output_width_start, output_width_limit, input_has_single_channel, is_width_flipped); -} + return in_bounds_crop_window<float16_t>(input, output, output_ptr, input_offset, window_step_x, output_width_start, + output_width_limit, input_has_single_channel, is_width_flipped); } +} // namespace cpu } // namespace arm_compute #endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/crop/generic/neon/fp32.cpp b/src/cpu/kernels/crop/generic/neon/fp32.cpp index 16d0218fce..f665c3652c 100644 --- a/src/cpu/kernels/crop/generic/neon/fp32.cpp +++ b/src/cpu/kernels/crop/generic/neon/fp32.cpp @@ -28,11 +28,18 @@ namespace arm_compute { namespace cpu { -void fp32_in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset, - int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped) +void fp32_in_bounds_crop_window(const ITensor *input, + const ITensor *output, + float *output_ptr, + Coordinates input_offset, + int32_t window_step_x, + int32_t output_width_start, + int32_t output_width_limit, + bool input_has_single_channel, + bool is_width_flipped) { - return in_bounds_crop_window<float32_t>(input, output, output_ptr, input_offset, - window_step_x, output_width_start, output_width_limit, input_has_single_channel, is_width_flipped); -} + return in_bounds_crop_window<float32_t>(input, output, output_ptr, input_offset, window_step_x, output_width_start, + output_width_limit, input_has_single_channel, is_width_flipped); } +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/crop/generic/neon/impl.h b/src/cpu/kernels/crop/generic/neon/impl.h index a59588be45..b90ba9ddbf 100644 --- a/src/cpu/kernels/crop/generic/neon/impl.h +++ b/src/cpu/kernels/crop/generic/neon/impl.h @@ -26,8 +26,9 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "src/core/NEON/wrapper/wrapper.h" + #include "src/core/common/Registrars.h" +#include "src/core/NEON/wrapper/wrapper.h" #include "src/cpu/kernels/crop/generic/neon/crop_helper.h" namespace arm_compute @@ -35,19 +36,26 @@ namespace arm_compute namespace cpu { template <typename T> -void in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset, - int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped) +void 
in_bounds_crop_window(const ITensor *input, + const ITensor *output, + float *output_ptr, + Coordinates input_offset, + int32_t window_step_x, + int32_t output_width_start, + int32_t output_width_limit, + bool input_has_single_channel, + bool is_width_flipped) { // Reverse elements if width flipped. - if(is_width_flipped) + if (is_width_flipped) { // Collapse first dimension if possible. - if(input_has_single_channel) + if (input_has_single_channel) { int32_t x = output_width_start; Coordinates negative_offset(input_offset); negative_offset.set(1, negative_offset[1] - window_step_x + 1); - for(; x <= output_width_limit - window_step_x; x += window_step_x, negative_offset[1] -= window_step_x) + for (; x <= output_width_limit - window_step_x; x += window_step_x, negative_offset[1] -= window_step_x) { auto in = load_as_f32(reinterpret_cast<T *>(input->ptr_to_element(negative_offset))); @@ -57,25 +65,27 @@ void in_bounds_crop_window(const ITensor *input, const ITensor *output, float *o wrapper::vstore(output_ptr + x, in); } input_offset[1] = negative_offset[1] + window_step_x - 1; - for(; x < output_width_limit; ++x, --input_offset[1]) + for (; x < output_width_limit; ++x, --input_offset[1]) { *(output_ptr + x) = static_cast<float>(*reinterpret_cast<T *>(input->ptr_to_element(input_offset))); } } else { - for(int32_t x = output_width_start; x < output_width_limit; ++x, --input_offset[1]) + for (int32_t x = output_width_start; x < output_width_limit; ++x, --input_offset[1]) { input_offset.set(0, 0); int32_t c = 0; - for(; c <= static_cast<int32_t>(input->info()->dimension(0)) - window_step_x; c += window_step_x, input_offset[0] += window_step_x) + for (; c <= static_cast<int32_t>(input->info()->dimension(0)) - window_step_x; + c += window_step_x, input_offset[0] += window_step_x) { auto in = load_as_f32(reinterpret_cast<T *>(input->ptr_to_element(input_offset))); wrapper::vstore(output_ptr + x * output->info()->dimension(0) + c, in); } - for(; c < static_cast<int32_t>(input->info()->dimension(0)); ++c, ++input_offset[0]) + for (; c < static_cast<int32_t>(input->info()->dimension(0)); ++c, ++input_offset[0]) { - *(output_ptr + x * output->info()->dimension(0) + c) = static_cast<float>(*reinterpret_cast<T *>(input->ptr_to_element(input_offset))); + *(output_ptr + x * output->info()->dimension(0) + c) = + static_cast<float>(*reinterpret_cast<T *>(input->ptr_to_element(input_offset))); } } } @@ -83,25 +93,28 @@ void in_bounds_crop_window(const ITensor *input, const ITensor *output, float *o else { // Use memcpy if the elements don't need converting to float. 
- if(std::is_same<T, float>::value) + if (std::is_same<T, float>::value) { memcpy(static_cast<void *>(output_ptr + output_width_start * output->info()->dimension(0)), reinterpret_cast<const void *>(input->ptr_to_element(input_offset)), - (output_width_limit - output_width_start) * output->info()->dimension(0) * output->info()->element_size()); + (output_width_limit - output_width_start) * output->info()->dimension(0) * + output->info()->element_size()); } else { - int32_t x = 0; - int32_t limit = (output_width_limit - output_width_start) * static_cast<int32_t>(output->info()->dimension(0)); + int32_t x = 0; + int32_t limit = + (output_width_limit - output_width_start) * static_cast<int32_t>(output->info()->dimension(0)); float *output_start_ptr = output_ptr + output_width_start * output->info()->dimension(0); - for(; x <= limit - window_step_x; x += window_step_x, input_offset[0] += window_step_x) + for (; x <= limit - window_step_x; x += window_step_x, input_offset[0] += window_step_x) { auto in = load_as_f32(reinterpret_cast<T *>(input->ptr_to_element(input_offset))); wrapper::vstore(output_start_ptr + x, in); } - for(; x < limit; ++x, ++input_offset[0]) + for (; x < limit; ++x, ++input_offset[0]) { - *(output_start_ptr + x) = static_cast<float>(*reinterpret_cast<T *>(input->ptr_to_element(input_offset))); + *(output_start_ptr + x) = + static_cast<float>(*reinterpret_cast<T *>(input->ptr_to_element(input_offset))); } } } diff --git a/src/cpu/kernels/crop/generic/neon/integer.cpp b/src/cpu/kernels/crop/generic/neon/integer.cpp index ebf2c1fbd3..602434f54f 100644 --- a/src/cpu/kernels/crop/generic/neon/integer.cpp +++ b/src/cpu/kernels/crop/generic/neon/integer.cpp @@ -29,46 +29,88 @@ namespace arm_compute { namespace cpu { -void u8_in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset, - int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped) +void u8_in_bounds_crop_window(const ITensor *input, + const ITensor *output, + float *output_ptr, + Coordinates input_offset, + int32_t window_step_x, + int32_t output_width_start, + int32_t output_width_limit, + bool input_has_single_channel, + bool is_width_flipped) { - return in_bounds_crop_window<uint8_t>(input, output, output_ptr, input_offset, - window_step_x, output_width_start, output_width_limit, input_has_single_channel, is_width_flipped); + return in_bounds_crop_window<uint8_t>(input, output, output_ptr, input_offset, window_step_x, output_width_start, + output_width_limit, input_has_single_channel, is_width_flipped); } -void u16_in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset, - int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped) +void u16_in_bounds_crop_window(const ITensor *input, + const ITensor *output, + float *output_ptr, + Coordinates input_offset, + int32_t window_step_x, + int32_t output_width_start, + int32_t output_width_limit, + bool input_has_single_channel, + bool is_width_flipped) { - return in_bounds_crop_window<uint16_t>(input, output, output_ptr, input_offset, - window_step_x, output_width_start, output_width_limit, input_has_single_channel, is_width_flipped); + return in_bounds_crop_window<uint16_t>(input, output, output_ptr, input_offset, window_step_x, output_width_start, + output_width_limit, input_has_single_channel, is_width_flipped); } 
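Editorial note: the typed entry points in this file all forward to the in_bounds_crop_window<T> template reformatted above. The standalone sketch below (not part of the patch) summarizes the non-flipped fast path that template takes: float sources are copied with memcpy, narrower types are converted to float four elements at a time, and a scalar loop handles the remainder.

    #include <arm_neon.h>

    #include <cstdint>
    #include <cstring>
    #include <type_traits>

    // Copy-and-convert one cropped row of n elements into a float destination.
    template <typename T>
    void crop_row_to_f32_example(const T *in, float *out, int32_t n)
    {
        if (std::is_same<T, float>::value)
        {
            std::memcpy(out, in, n * sizeof(float)); // no conversion needed
            return;
        }
        int32_t x = 0;
        for (; x <= n - 4; x += 4)
        {
            // The real kernel uses the load_as_f32() overloads from crop_helper.h here.
            const float tmp[4] = {static_cast<float>(in[x]), static_cast<float>(in[x + 1]),
                                  static_cast<float>(in[x + 2]), static_cast<float>(in[x + 3])};
            vst1q_f32(out + x, vld1q_f32(tmp));
        }
        for (; x < n; ++x) // scalar tail
        {
            out[x] = static_cast<float>(in[x]);
        }
    }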
-void u32_in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset, - int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped) +void u32_in_bounds_crop_window(const ITensor *input, + const ITensor *output, + float *output_ptr, + Coordinates input_offset, + int32_t window_step_x, + int32_t output_width_start, + int32_t output_width_limit, + bool input_has_single_channel, + bool is_width_flipped) { - return in_bounds_crop_window<uint32_t>(input, output, output_ptr, input_offset, - window_step_x, output_width_start, output_width_limit, input_has_single_channel, is_width_flipped); + return in_bounds_crop_window<uint32_t>(input, output, output_ptr, input_offset, window_step_x, output_width_start, + output_width_limit, input_has_single_channel, is_width_flipped); } -void s8_in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset, - int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped) +void s8_in_bounds_crop_window(const ITensor *input, + const ITensor *output, + float *output_ptr, + Coordinates input_offset, + int32_t window_step_x, + int32_t output_width_start, + int32_t output_width_limit, + bool input_has_single_channel, + bool is_width_flipped) { - return in_bounds_crop_window<int8_t>(input, output, output_ptr, input_offset, - window_step_x, output_width_start, output_width_limit, input_has_single_channel, is_width_flipped); + return in_bounds_crop_window<int8_t>(input, output, output_ptr, input_offset, window_step_x, output_width_start, + output_width_limit, input_has_single_channel, is_width_flipped); } -void s16_in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset, - int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped) +void s16_in_bounds_crop_window(const ITensor *input, + const ITensor *output, + float *output_ptr, + Coordinates input_offset, + int32_t window_step_x, + int32_t output_width_start, + int32_t output_width_limit, + bool input_has_single_channel, + bool is_width_flipped) { - return in_bounds_crop_window<int16_t>(input, output, output_ptr, input_offset, - window_step_x, output_width_start, output_width_limit, input_has_single_channel, is_width_flipped); + return in_bounds_crop_window<int16_t>(input, output, output_ptr, input_offset, window_step_x, output_width_start, + output_width_limit, input_has_single_channel, is_width_flipped); } -void s32_in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset, - int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped) +void s32_in_bounds_crop_window(const ITensor *input, + const ITensor *output, + float *output_ptr, + Coordinates input_offset, + int32_t window_step_x, + int32_t output_width_start, + int32_t output_width_limit, + bool input_has_single_channel, + bool is_width_flipped) { - return in_bounds_crop_window<int32_t>(input, output, output_ptr, input_offset, - window_step_x, output_width_start, output_width_limit, input_has_single_channel, is_width_flipped); -} + return in_bounds_crop_window<int32_t>(input, output, output_ptr, input_offset, window_step_x, output_width_start, + output_width_limit, 
input_has_single_channel, is_width_flipped); } +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/crop/list.h b/src/cpu/kernels/crop/list.h index a6b83215ae..9cb7726203 100644 --- a/src/cpu/kernels/crop/list.h +++ b/src/cpu/kernels/crop/list.h @@ -26,8 +26,9 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "src/core/NEON/wrapper/wrapper.h" + #include "src/core/common/Registrars.h" +#include "src/core/NEON/wrapper/wrapper.h" #include "src/cpu/kernels/crop/generic/neon/impl.h" namespace arm_compute @@ -36,7 +37,8 @@ namespace cpu { #define DECLARE_CROP_KERNEL(func_name) \ void func_name(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset, \ - int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped) + int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, \ + bool input_has_single_channel, bool is_width_flipped) DECLARE_CROP_KERNEL(fp16_in_bounds_crop_window); DECLARE_CROP_KERNEL(fp32_in_bounds_crop_window); diff --git a/src/cpu/kernels/depthwiseconv2d/generic/neon/fp16.cpp b/src/cpu/kernels/depthwiseconv2d/generic/neon/fp16.cpp index e85a1664ea..293e606d81 100644 --- a/src/cpu/kernels/depthwiseconv2d/generic/neon/fp16.cpp +++ b/src/cpu/kernels/depthwiseconv2d/generic/neon/fp16.cpp @@ -29,11 +29,16 @@ namespace arm_compute { namespace cpu { -void neon_fp16_deptwiseconv2dnative(const ITensor *src, const ITensor *weights, const ITensor *bias, - ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info) +void neon_fp16_deptwiseconv2dnative(const ITensor *src, + const ITensor *weights, + const ITensor *bias, + ITensor *dst, + const Window &window, + bool has_biases, + const ConvolutionInfo &info) { return run_depthwise_float<float16_t, float16_t>(src, weights, bias, dst, window, has_biases, info); } -} +} // namespace cpu } // namespace arm_compute #endif //__ARM_FEATURE_FP16_VECTOR_ARITHMETIC diff --git a/src/cpu/kernels/depthwiseconv2d/generic/neon/fp32.cpp b/src/cpu/kernels/depthwiseconv2d/generic/neon/fp32.cpp index b2333a3334..c6fa4790b7 100644 --- a/src/cpu/kernels/depthwiseconv2d/generic/neon/fp32.cpp +++ b/src/cpu/kernels/depthwiseconv2d/generic/neon/fp32.cpp @@ -26,10 +26,15 @@ namespace arm_compute { namespace cpu { -void neon_fp32_deptwiseconv2dnative(const ITensor *src, const ITensor *weights, const ITensor *bias, - ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info) +void neon_fp32_deptwiseconv2dnative(const ITensor *src, + const ITensor *weights, + const ITensor *bias, + ITensor *dst, + const Window &window, + bool has_biases, + const ConvolutionInfo &info) { return run_depthwise_float<float, float>(src, weights, bias, dst, window, has_biases, info); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp b/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp index a2ae5564e6..d08e973968 100644 --- a/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp +++ b/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp @@ -22,8 +22,10 @@ * SOFTWARE. 
*/ #include "src/cpu/kernels/depthwiseconv2d/generic/neon/impl.h" + #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/function_info/ConvolutionInfo.h" + #include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute @@ -65,8 +67,16 @@ inline int32_t rounding_divide_by_exp2(const int32_t &x, const int exponent) namespace { template <typename T, typename TW> -void depthwise_loop_multiplier1_quantized(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info, - const Size2D &dilation, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases) // NOLINT +void depthwise_loop_multiplier1_quantized(const ITensor *src, + const ITensor *weights, + const ITensor *biases, + ITensor *dst, + const PadStrideInfo &conv_info, + const Size2D &dilation, + std::vector<int> output_multiplier, + std::vector<int> output_shift, + const Window &window, + bool has_biases) // NOLINT { ARM_COMPUTE_UNUSED(output_multiplier, output_shift); constexpr auto element_per_vector = vector_size / sizeof(T); @@ -75,7 +85,8 @@ void depthwise_loop_multiplier1_quantized(const ITensor *src, const ITensor *wei using AccType = int32_t; using AccArrayType = std::array<AccType, element_per_vector>; - const auto out_of_bound_value = PixelValue(static_cast<uint64_t>(0), src->info()->data_type(), src->info()->quantization_info()).get<T>(); + const auto out_of_bound_value = + PixelValue(static_cast<uint64_t>(0), src->info()->data_type(), src->info()->quantization_info()).get<T>(); const auto out_of_bound_vector = wrapper::vdup_n(static_cast<T>(out_of_bound_value), TagType{}); const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window); @@ -104,152 +115,175 @@ void depthwise_loop_multiplier1_quantized(const ITensor *src, const ITensor *wei Iterator output_it(dst, win_output); Iterator biases_it{}; - if(has_biases) + if (has_biases) { biases_it = Iterator(biases, win_weights); } - execute_window_loop(execution_window, [&](const Coordinates & id) - { - const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; - const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; - const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; - auto const base_weights_ptr = weights_it.ptr(); - size_t x = run_info.x_start; - - for(; x < run_info.x_leftover_start; x += run_info.x_step) + execute_window_loop( + execution_window, + [&](const Coordinates &id) { - AccArrayType acc{}; - AccArrayType in_sum{}; - AccArrayType we_sum{}; + const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; + const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; + const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; + auto const base_weights_ptr = weights_it.ptr(); + size_t x = run_info.x_start; - auto weights_ptr = base_weights_ptr; - auto input_offset = base_input_offset; - - for(size_t h = 0; h < run_info.weights_height; ++h) + for (; x < run_info.x_leftover_start; x += run_info.x_step) { - int64_t offs = input_offset + x * sizeof(T); - for(size_t w = 0; w < run_info.weights_width; ++w) - { - const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); - const auto input_vals = is_valid_region ? 
- wrapper::vload(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) : - out_of_bound_vector; - const auto weights_vals = wrapper::vload(reinterpret_cast<TW *>(weights_ptr + w * run_info.weights_stride_y) + x); + AccArrayType acc{}; + AccArrayType in_sum{}; + AccArrayType we_sum{}; + + auto weights_ptr = base_weights_ptr; + auto input_offset = base_input_offset; - for(size_t i = 0; i < element_per_vector; ++i) + for (size_t h = 0; h < run_info.weights_height; ++h) + { + int64_t offs = input_offset + x * sizeof(T); + for (size_t w = 0; w < run_info.weights_width; ++w) { - acc.at(i) += input_vals[i] * weights_vals[i]; - in_sum.at(i) += input_vals[i]; - we_sum.at(i) += weights_vals[i]; + const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); + const auto input_vals = + is_valid_region + ? wrapper::vload(reinterpret_cast<T *>( + input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) + : out_of_bound_vector; + const auto weights_vals = + wrapper::vload(reinterpret_cast<TW *>(weights_ptr + w * run_info.weights_stride_y) + x); + + for (size_t i = 0; i < element_per_vector; ++i) + { + acc.at(i) += input_vals[i] * weights_vals[i]; + in_sum.at(i) += input_vals[i]; + we_sum.at(i) += weights_vals[i]; + } + + offs += dilation.x() * run_info.input_stride_y; } - offs += dilation.x() * run_info.input_stride_y; + weights_ptr += run_info.weights_stride_z; + input_offset += dilation.y() * run_info.input_stride_z; } - weights_ptr += run_info.weights_stride_z; - input_offset += dilation.y() * run_info.input_stride_z; - } + VectorType out_vals = wrapper::vdup_n(static_cast<T>(0), TagType{}); + for (size_t i = 0; i < element_per_vector; ++i) + { + acc.at(i) -= in_sum.at(i) * weights_qoffset; + acc.at(i) -= we_sum.at(i) * input_qoffset; + acc.at(i) += k_offset; - VectorType out_vals = wrapper::vdup_n(static_cast<T>(0), TagType{}); - for(size_t i = 0; i < element_per_vector; ++i) - { - acc.at(i) -= in_sum.at(i) * weights_qoffset; - acc.at(i) -= we_sum.at(i) * input_qoffset; - acc.at(i) += k_offset; + if (has_biases) + { + acc.at(i) += *(reinterpret_cast<int32_t *>(biases_it.ptr() + i * sizeof(int32_t)) + x); + } - if(has_biases) - { - acc.at(i) += *(reinterpret_cast<int32_t *>(biases_it.ptr() + i * sizeof(int32_t)) + x); + const int32_t out_mul = output_multiplier.at(x + i); + const int32_t out_shift = output_shift.at(x + i); + if (out_shift < 0) + { + acc.at(i) = + saturating_doubling_high_mul(acc.at(i) * (1 << (-out_shift)), out_mul) + output_qoffset; + } + else + { + acc.at(i) = + rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(i), out_mul), out_shift) + + output_qoffset; + } + out_vals[i] = static_cast<T>(utility::clamp<AccType, T>(acc.at(i))); } - const int32_t out_mul = output_multiplier.at(x + i); - const int32_t out_shift = output_shift.at(x + i); - if(out_shift < 0) - { - acc.at(i) = saturating_doubling_high_mul(acc.at(i) * (1 << (-out_shift)), out_mul) + output_qoffset; - } - else - { - acc.at(i) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(i), out_mul), out_shift) + output_qoffset; - } - out_vals[i] = static_cast<T>(utility::clamp<AccType, T>(acc.at(i))); + wrapper::vstore(reinterpret_cast<T *>(output_it.ptr()) + x, out_vals); } - wrapper::vstore(reinterpret_cast<T *>(output_it.ptr()) + x, out_vals); - } - - // left-over - for(; x < run_info.x_end; ++x) - { - AccType acc = 0; - AccType in_sum = 0; - AccType we_sum = 0; + // left-over + for (; x < 
run_info.x_end; ++x) + { + AccType acc = 0; + AccType in_sum = 0; + AccType we_sum = 0; - auto weights_ptr = base_weights_ptr; - auto input_offset = base_input_offset; + auto weights_ptr = base_weights_ptr; + auto input_offset = base_input_offset; - for(size_t h = 0; h < run_info.weights_height; ++h) - { - int64_t offs = input_offset + x * sizeof(T); - for(size_t w = 0; w < run_info.weights_width; ++w) + for (size_t h = 0; h < run_info.weights_height; ++h) { - const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); - const auto input_val = is_valid_region ? - *reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset)) : - out_of_bound_value; - const auto weights_val = *(reinterpret_cast<TW *>(weights_ptr + w * run_info.weights_stride_y) + x); - - acc += input_val * weights_val; - in_sum += input_val; - we_sum += weights_val; + int64_t offs = input_offset + x * sizeof(T); + for (size_t w = 0; w < run_info.weights_width; ++w) + { + const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); + const auto input_val = + is_valid_region + ? *reinterpret_cast<T *>(input_it.ptr() + + std::min(static_cast<size_t>(offs), run_info.input_max_offset)) + : out_of_bound_value; + const auto weights_val = + *(reinterpret_cast<TW *>(weights_ptr + w * run_info.weights_stride_y) + x); + + acc += input_val * weights_val; + in_sum += input_val; + we_sum += weights_val; + + offs += dilation.x() * run_info.input_stride_y; + } - offs += dilation.x() * run_info.input_stride_y; + weights_ptr += run_info.weights_stride_z; + input_offset += dilation.y() * run_info.input_stride_z; } - weights_ptr += run_info.weights_stride_z; - input_offset += dilation.y() * run_info.input_stride_z; - } + T out_vals{0}; - T out_vals{ 0 }; + acc -= in_sum * weights_qoffset; + acc -= we_sum * input_qoffset; + acc += k_offset; - acc -= in_sum * weights_qoffset; - acc -= we_sum * input_qoffset; - acc += k_offset; + if (has_biases) + { + acc += *(reinterpret_cast<int32_t *>(biases_it.ptr()) + x); + } - if(has_biases) - { - acc += *(reinterpret_cast<int32_t *>(biases_it.ptr()) + x); - } + const int32_t out_mul = output_multiplier.at(x); + const int32_t out_shift = output_shift.at(x); - const int32_t out_mul = output_multiplier.at(x); - const int32_t out_shift = output_shift.at(x); + if (out_shift < 0) + { + acc = saturating_doubling_high_mul(acc * (1 << (-out_shift)), out_mul) + output_qoffset; + } + else + { + acc = + rounding_divide_by_exp2(saturating_doubling_high_mul(acc, out_mul), out_shift) + output_qoffset; + } - if(out_shift < 0) - { - acc = saturating_doubling_high_mul(acc * (1 << (-out_shift)), out_mul) + output_qoffset; - } - else - { - acc = rounding_divide_by_exp2(saturating_doubling_high_mul(acc, out_mul), out_shift) + output_qoffset; + out_vals = static_cast<T>(utility::clamp<AccType, T>(acc)); + *(reinterpret_cast<T *>(output_it.ptr()) + x) = out_vals; } - - out_vals = static_cast<T>(utility::clamp<AccType, T>(acc)); - *(reinterpret_cast<T *>(output_it.ptr()) + x) = out_vals; - } - }, - input_it, weights_it, biases_it, output_it); + }, + input_it, weights_it, biases_it, output_it); } template <typename T, typename TW> -void depthwise_loop_generic_quantized(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info, - const Size2D &dilation, unsigned int depth_multiplier, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window 
&window, bool has_biases) // NOLINT +void depthwise_loop_generic_quantized(const ITensor *src, + const ITensor *weights, + const ITensor *biases, + ITensor *dst, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier, + std::vector<int> output_multiplier, + std::vector<int> output_shift, + const Window &window, + bool has_biases) // NOLINT { using AccType = int32_t; - const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier); + const auto run_info = + DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier); - const auto out_of_bound_value = PixelValue(static_cast<uint64_t>(0), src->info()->data_type(), src->info()->quantization_info()).get<T>(); + const auto out_of_bound_value = + PixelValue(static_cast<uint64_t>(0), src->info()->data_type(), src->info()->quantization_info()).get<T>(); const int32_t input_qoffset = src->info()->quantization_info().uniform().offset; const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset; @@ -277,76 +311,93 @@ void depthwise_loop_generic_quantized(const ITensor *src, const ITensor *weights Iterator output_it(dst, win_output); Iterator biases_it{}; - if(has_biases) + if (has_biases) { biases_it = Iterator(biases, win_weights); } - execute_window_loop(execution_window, [&](const Coordinates & id) - { - std::vector<AccType> acc(depth_multiplier, 0); - std::vector<AccType> we_sum(depth_multiplier, 0); - AccType in_sum = 0; + execute_window_loop( + execution_window, + [&](const Coordinates &id) + { + std::vector<AccType> acc(depth_multiplier, 0); + std::vector<AccType> we_sum(depth_multiplier, 0); + AccType in_sum = 0; - const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; - const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; - int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; + const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; + const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; + int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; - auto weights_ptr = weights_it.ptr(); - for(size_t h = 0; h < run_info.weights_height; ++h) - { - int offs = input_offset; - for(size_t w = 0; w < run_info.weights_width; ++w) + auto weights_ptr = weights_it.ptr(); + for (size_t h = 0; h < run_info.weights_height; ++h) { - const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); - const auto input_val = is_valid_region ? *(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) : out_of_bound_value; - - for(size_t m = 0; m < depth_multiplier; ++m) + int offs = input_offset; + for (size_t w = 0; w < run_info.weights_width; ++w) { - const auto weights_val = *(reinterpret_cast<TW *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y)); - acc.at(m) += input_val * weights_val; - - we_sum.at(m) += weights_val; - } + const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); + const auto input_val = + is_valid_region ? 
*(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), + run_info.input_max_offset))) + : out_of_bound_value; - offs += dilation.x() * run_info.input_stride_y; - in_sum += input_val; - } + for (size_t m = 0; m < depth_multiplier; ++m) + { + const auto weights_val = + *(reinterpret_cast<TW *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y)); + acc.at(m) += input_val * weights_val; - weights_ptr += run_info.weights_stride_z; - input_offset += dilation.y() * run_info.input_stride_z; - } + we_sum.at(m) += weights_val; + } - for(size_t m = 0; m < depth_multiplier; ++m) - { - acc.at(m) -= in_sum * weights_qoffset; - acc.at(m) -= we_sum.at(m) * input_qoffset; - acc.at(m) += k_offset; + offs += dilation.x() * run_info.input_stride_y; + in_sum += input_val; + } - if(has_biases) - { - acc.at(m) += *(reinterpret_cast<int32_t *>(biases_it.ptr() + m * sizeof(int32_t))); + weights_ptr += run_info.weights_stride_z; + input_offset += dilation.y() * run_info.input_stride_z; } - const int32_t out_mul = output_multiplier.at(id.x() * depth_multiplier + m); - const int32_t out_shift = output_shift.at(id.x() * depth_multiplier + m); - if(out_shift < 0) - { - acc.at(m) = saturating_doubling_high_mul(acc.at(m) * (1 << (-out_shift)), out_mul) + output_qoffset; - } - else + for (size_t m = 0; m < depth_multiplier; ++m) { - acc.at(m) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(m), out_mul), out_shift) + output_qoffset; + acc.at(m) -= in_sum * weights_qoffset; + acc.at(m) -= we_sum.at(m) * input_qoffset; + acc.at(m) += k_offset; + + if (has_biases) + { + acc.at(m) += *(reinterpret_cast<int32_t *>(biases_it.ptr() + m * sizeof(int32_t))); + } + + const int32_t out_mul = output_multiplier.at(id.x() * depth_multiplier + m); + const int32_t out_shift = output_shift.at(id.x() * depth_multiplier + m); + if (out_shift < 0) + { + acc.at(m) = saturating_doubling_high_mul(acc.at(m) * (1 << (-out_shift)), out_mul) + output_qoffset; + } + else + { + acc.at(m) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(m), out_mul), out_shift) + + output_qoffset; + } + *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = + static_cast<T>(utility::clamp<AccType, T>(acc.at(m))); } - *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = static_cast<T>(utility::clamp<AccType, T>(acc.at(m))); - } - }, - input_it, weights_it, biases_it, output_it); + }, + input_it, weights_it, biases_it, output_it); } template <typename T, typename TW> -void depthwise_loop_pow2_quantized_per_tensor(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info, - const Size2D &dilation, unsigned int depth_multiplier, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases) // NOLINT +void depthwise_loop_pow2_quantized_per_tensor(const ITensor *src, + const ITensor *weights, + const ITensor *biases, + ITensor *dst, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier, + std::vector<int> output_multiplier, + std::vector<int> output_shift, + const Window &window, + bool has_biases) // NOLINT { constexpr int half_vec = vector_size / 2; @@ -355,11 +406,15 @@ void depthwise_loop_pow2_quantized_per_tensor(const ITensor *src, const ITensor using AccVectorTagType = typename wrapper::traits::neon_vector<AccType, half_vec>::tag_type; using TagType = typename wrapper::traits::neon_vector<T, vector_size>::tag_type; - const auto run_info = 
DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier); + const auto run_info = + DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier); - const auto input_qoffset_vec = wrapper::vreinterpret(wrapper::vmovl(wrapper::vdup_n(static_cast<T>(src->info()->quantization_info().uniform().offset), TagType{}))); - const auto weights_qoffset_vec = wrapper::vreinterpret(wrapper::vmovl(wrapper::vdup_n(static_cast<TW>(weights->info()->quantization_info().uniform().offset), TagType{}))); - const auto output_qoffset_vec = wrapper::vdup_n(dst->info()->quantization_info().uniform().offset, arm_compute::wrapper::traits::vector_128_tag{}); + const auto input_qoffset_vec = wrapper::vreinterpret( + wrapper::vmovl(wrapper::vdup_n(static_cast<T>(src->info()->quantization_info().uniform().offset), TagType{}))); + const auto weights_qoffset_vec = wrapper::vreinterpret(wrapper::vmovl( + wrapper::vdup_n(static_cast<TW>(weights->info()->quantization_info().uniform().offset), TagType{}))); + const auto output_qoffset_vec = wrapper::vdup_n(dst->info()->quantization_info().uniform().offset, + arm_compute::wrapper::traits::vector_128_tag{}); const auto lower = wrapper::vdup_n(static_cast<AccType>(std::numeric_limits<T>::lowest()), AccVectorTagType{}); const auto upper = wrapper::vdup_n(static_cast<AccType>(std::numeric_limits<T>::max()), AccVectorTagType{}); @@ -389,7 +444,7 @@ void depthwise_loop_pow2_quantized_per_tensor(const ITensor *src, const ITensor Iterator output_it(dst, win_output); Iterator biases_it{}; - if(has_biases) + if (has_biases) { biases_it = Iterator(biases, win_weights); } @@ -397,95 +452,117 @@ void depthwise_loop_pow2_quantized_per_tensor(const ITensor *src, const ITensor std::vector<AccVectorType> acc0(depth_multiplier / vector_size); std::vector<AccVectorType> acc1(depth_multiplier / vector_size); - execute_window_loop(execution_window, [&](const Coordinates & id) - { - std::fill(begin(acc0), end(acc0), zero); - std::fill(begin(acc1), end(acc1), zero); + execute_window_loop( + execution_window, + [&](const Coordinates &id) + { + std::fill(begin(acc0), end(acc0), zero); + std::fill(begin(acc1), end(acc1), zero); - const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; - const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; - int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; + const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; + const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; + int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; - auto weights_ptr = weights_it.ptr(); - for(size_t h = 0; h < run_info.weights_height; ++h) - { - const int32_t current_h = input_z + h * dilation.y(); - if(current_h >= 0 && current_h < static_cast<int32_t>(run_info.input_height)) + auto weights_ptr = weights_it.ptr(); + for (size_t h = 0; h < run_info.weights_height; ++h) { - int offs = input_offset; - for(size_t w = 0; w < run_info.weights_width; ++w) + const int32_t current_h = input_z + h * dilation.y(); + if (current_h >= 0 && current_h < static_cast<int32_t>(run_info.input_height)) { - const int32_t current_w = input_y + w * dilation.x(); - if(current_w >= 0 && current_w < static_cast<int32_t>(run_info.input_width)) + int offs = input_offset; + for (size_t w = 0; w < run_info.weights_width; ++w) { - const auto input_8x8 = 
wrapper::vdup_n(*(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))), TagType{}); - const auto input_s16x8 = wrapper::vreinterpret(wrapper::vmovl(input_8x8)); - const auto input_no_offs = wrapper::vsub(input_s16x8, input_qoffset_vec); - - for(size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i) + const int32_t current_w = input_y + w * dilation.x(); + if (current_w >= 0 && current_w < static_cast<int32_t>(run_info.input_width)) { - const auto weights_8x8 = wrapper::vload(reinterpret_cast<TW *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y)); - const auto weights_s16x8 = wrapper::vreinterpret(wrapper::vmovl(weights_8x8)); - const auto weights_no_offs = wrapper::vsub(weights_s16x8, weights_qoffset_vec); - - acc0.at(i) = wrapper::vmlal(acc0.at(i), wrapper::vgetlow(input_no_offs), wrapper::vgetlow(weights_no_offs)); - acc1.at(i) = wrapper::vmlal(acc1.at(i), wrapper::vgethigh(input_no_offs), wrapper::vgethigh(weights_no_offs)); + const auto input_8x8 = wrapper::vdup_n( + *(reinterpret_cast<T *>( + input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))), + TagType{}); + const auto input_s16x8 = wrapper::vreinterpret(wrapper::vmovl(input_8x8)); + const auto input_no_offs = wrapper::vsub(input_s16x8, input_qoffset_vec); + + for (size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i) + { + const auto weights_8x8 = wrapper::vload(reinterpret_cast<TW *>( + weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y)); + const auto weights_s16x8 = wrapper::vreinterpret(wrapper::vmovl(weights_8x8)); + const auto weights_no_offs = wrapper::vsub(weights_s16x8, weights_qoffset_vec); + + acc0.at(i) = wrapper::vmlal(acc0.at(i), wrapper::vgetlow(input_no_offs), + wrapper::vgetlow(weights_no_offs)); + acc1.at(i) = wrapper::vmlal(acc1.at(i), wrapper::vgethigh(input_no_offs), + wrapper::vgethigh(weights_no_offs)); + } } - } - offs += dilation.x() * run_info.input_stride_y; + offs += dilation.x() * run_info.input_stride_y; + } } - } - weights_ptr += run_info.weights_stride_z; - input_offset += dilation.y() * run_info.input_stride_z; - } + weights_ptr += run_info.weights_stride_z; + input_offset += dilation.y() * run_info.input_stride_z; + } - for(size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i) - { - if(has_biases) + for (size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i) { - const auto bias_val0 = wrapper::vloadq(reinterpret_cast<int32_t *>(biases_it.ptr() + m * sizeof(int32_t))); - const auto bias_val1 = wrapper::vloadq(reinterpret_cast<int32_t *>(biases_it.ptr() + (m + half_vec) * sizeof(int32_t))); + if (has_biases) + { + const auto bias_val0 = + wrapper::vloadq(reinterpret_cast<int32_t *>(biases_it.ptr() + m * sizeof(int32_t))); + const auto bias_val1 = wrapper::vloadq( + reinterpret_cast<int32_t *>(biases_it.ptr() + (m + half_vec) * sizeof(int32_t))); - acc0.at(i) = wrapper::vadd(acc0.at(i), bias_val0); - acc1.at(i) = wrapper::vadd(acc1.at(i), bias_val1); - } + acc0.at(i) = wrapper::vadd(acc0.at(i), bias_val0); + acc1.at(i) = wrapper::vadd(acc1.at(i), bias_val1); + } - if(out_shift < 0) - { - acc0.at(i) = wrapper::vadd(saturating_doubling_high_mul(acc0.at(i) * (1 << (-out_shift)), out_mul), output_qoffset_vec); - acc1.at(i) = wrapper::vadd(saturating_doubling_high_mul(acc1.at(i) * (1 << (-out_shift)), out_mul), output_qoffset_vec); - } - else - { - acc0.at(i) = wrapper::vadd(rounding_divide_by_exp2(saturating_doubling_high_mul(acc0.at(i), out_mul), out_shift), 
output_qoffset_vec); - acc1.at(i) = wrapper::vadd(rounding_divide_by_exp2(saturating_doubling_high_mul(acc1.at(i), out_mul), out_shift), output_qoffset_vec); - } + if (out_shift < 0) + { + acc0.at(i) = wrapper::vadd(saturating_doubling_high_mul(acc0.at(i) * (1 << (-out_shift)), out_mul), + output_qoffset_vec); + acc1.at(i) = wrapper::vadd(saturating_doubling_high_mul(acc1.at(i) * (1 << (-out_shift)), out_mul), + output_qoffset_vec); + } + else + { + acc0.at(i) = wrapper::vadd( + rounding_divide_by_exp2(saturating_doubling_high_mul(acc0.at(i), out_mul), out_shift), + output_qoffset_vec); + acc1.at(i) = wrapper::vadd( + rounding_divide_by_exp2(saturating_doubling_high_mul(acc1.at(i), out_mul), out_shift), + output_qoffset_vec); + } - acc0.at(i) = wrapper::vmin(wrapper::vmax(acc0.at(i), lower), upper); - acc1.at(i) = wrapper::vmin(wrapper::vmax(acc1.at(i), lower), upper); + acc0.at(i) = wrapper::vmin(wrapper::vmax(acc0.at(i), lower), upper); + acc1.at(i) = wrapper::vmin(wrapper::vmax(acc1.at(i), lower), upper); - const auto out_val = wrapper::vcombine(wrapper::vmovn(acc0.at(i)), - wrapper::vmovn(acc1.at(i))); + const auto out_val = wrapper::vcombine(wrapper::vmovn(acc0.at(i)), wrapper::vmovn(acc1.at(i))); - if(std::is_same<T, uint8_t>::value) - { - wrapper::vstore(reinterpret_cast<uint8_t *>(output_it.ptr() + m * sizeof(uint8_t)), wrapper::vqmovn(vreinterpretq_u16_s16(out_val))); - } - else - { - wrapper::vstore(reinterpret_cast<int8_t *>(output_it.ptr() + m * sizeof(int8_t)), wrapper::vqmovn(out_val)); + if (std::is_same<T, uint8_t>::value) + { + wrapper::vstore(reinterpret_cast<uint8_t *>(output_it.ptr() + m * sizeof(uint8_t)), + wrapper::vqmovn(vreinterpretq_u16_s16(out_val))); + } + else + { + wrapper::vstore(reinterpret_cast<int8_t *>(output_it.ptr() + m * sizeof(int8_t)), + wrapper::vqmovn(out_val)); + } } - } - }, - input_it, weights_it, biases_it, output_it); + }, + input_it, weights_it, biases_it, output_it); } } // namespace template <typename T, typename TW> -void run_depthwise_quanitized8bit(const ITensor *src, const ITensor *weights, const ITensor *biases, - ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info) +void run_depthwise_quanitized8bit(const ITensor *src, + const ITensor *weights, + const ITensor *biases, + ITensor *dst, + const Window &window, + bool has_biases, + const ConvolutionInfo &info) { PadStrideInfo conv_info = info.pad_stride_info; unsigned int depth_multiplier = info.depth_multiplier; @@ -497,15 +574,15 @@ void run_depthwise_quanitized8bit(const ITensor *src, const ITensor *weights, co const auto output_scale = dst->info()->quantization_info().uniform().scale; auto weights_scale = weights->info()->quantization_info().scale(); - if(!is_data_type_quantized_per_channel(weights->info()->data_type())) + if (!is_data_type_quantized_per_channel(weights->info()->data_type())) { - for(size_t i = 1; i < weights->info()->dimension(channel_idx); ++i) + for (size_t i = 1; i < weights->info()->dimension(channel_idx); ++i) { weights_scale.push_back(weights_scale.front()); } } - for(const auto &s : weights_scale) + for (const auto &s : weights_scale) { int32_t out_mult = 0; int32_t out_shift = 0; @@ -516,30 +593,49 @@ void run_depthwise_quanitized8bit(const ITensor *src, const ITensor *weights, co output_shift.push_back(out_shift); } - if(depth_multiplier == 1) + if (depth_multiplier == 1) { - depthwise_loop_multiplier1_quantized<T, TW>(src, weights, biases, dst, conv_info, dilation, output_multiplier, output_shift, window, has_biases); + 
depthwise_loop_multiplier1_quantized<T, TW>(src, weights, biases, dst, conv_info, dilation, output_multiplier, + output_shift, window, has_biases); } else { const bool is_pow2 = ((depth_multiplier & (depth_multiplier - 1)) == 0); const bool is_quantized_per_tensor = !(is_data_type_quantized_per_channel(weights->info()->data_type())); - if(is_pow2 && is_quantized_per_tensor && depth_multiplier >= 8) + if (is_pow2 && is_quantized_per_tensor && depth_multiplier >= 8) { - depthwise_loop_pow2_quantized_per_tensor<T, TW>(src, weights, biases, dst, conv_info, dilation, depth_multiplier, output_multiplier, output_shift, window, has_biases); + depthwise_loop_pow2_quantized_per_tensor<T, TW>(src, weights, biases, dst, conv_info, dilation, + depth_multiplier, output_multiplier, output_shift, window, + has_biases); } else { - depthwise_loop_generic_quantized<T, TW>(src, weights, biases, dst, conv_info, dilation, depth_multiplier, output_multiplier, output_shift, window, has_biases); + depthwise_loop_generic_quantized<T, TW>(src, weights, biases, dst, conv_info, dilation, depth_multiplier, + output_multiplier, output_shift, window, has_biases); } } } -template void run_depthwise_quanitized8bit<uint8_t, uint8_t>(const ITensor *src, const ITensor *weights, const ITensor *biases, - ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info); -template void run_depthwise_quanitized8bit<int8_t, int8_t>(const ITensor *src, const ITensor *weights, const ITensor *biases, - ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info); -template void run_depthwise_quanitized8bit<uint8_t, int8_t>(const ITensor *src, const ITensor *weights, const ITensor *biases, - ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info); +template void run_depthwise_quanitized8bit<uint8_t, uint8_t>(const ITensor *src, + const ITensor *weights, + const ITensor *biases, + ITensor *dst, + const Window &window, + bool has_biases, + const ConvolutionInfo &info); +template void run_depthwise_quanitized8bit<int8_t, int8_t>(const ITensor *src, + const ITensor *weights, + const ITensor *biases, + ITensor *dst, + const Window &window, + bool has_biases, + const ConvolutionInfo &info); +template void run_depthwise_quanitized8bit<uint8_t, int8_t>(const ITensor *src, + const ITensor *weights, + const ITensor *biases, + ITensor *dst, + const Window &window, + bool has_biases, + const ConvolutionInfo &info); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.h b/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.h index 8410cdbf16..3fa5c58c3c 100644 --- a/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.h +++ b/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.h @@ -24,6 +24,7 @@ #ifndef SRC_CORE_KERNELS_DEPTWISECONV2DNATIVE_IMPL_H #define SRC_CORE_KERNELS_DEPTWISECONV2DNATIVE_IMPL_H #include "arm_compute/core/Helpers.h" + #include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute @@ -63,15 +64,21 @@ struct DepthwiseConvolutionRunInfo const size_t input_width; const size_t input_depth; - DepthwiseConvolutionRunInfo(const ITensorInfo &input, const ITensorInfo &weights, const PadStrideInfo &conv_info, const Window &w, uint32_t depth_multiplier = 1) // NOLINT - : num_read_elements_per_iteration((depth_multiplier == 1 ? 
(vector_size / element_size_from_data_type(input.data_type())) : 1)), + DepthwiseConvolutionRunInfo(const ITensorInfo &input, + const ITensorInfo &weights, + const PadStrideInfo &conv_info, + const Window &w, + uint32_t depth_multiplier = 1) // NOLINT + : num_read_elements_per_iteration( + (depth_multiplier == 1 ? (vector_size / element_size_from_data_type(input.data_type())) : 1)), x_start(w.x().start()), x_end(w.x().end()), x_step(static_cast<uint32_t>(num_read_elements_per_iteration * depth_multiplier)), x_leftover_start(std::max(static_cast<int32_t>(w.x().end() + 1) - static_cast<int32_t>(x_step), int32_t(0))), input_stride_y(input.strides_in_bytes().y()), input_stride_z(input.strides_in_bytes().z()), - input_max_offset(input.strides_in_bytes().z() * input.dimension(height_idx) - (input.padding().bottom + input.padding().top) * input.strides_in_bytes().y()), + input_max_offset(input.strides_in_bytes().z() * input.dimension(height_idx) - + (input.padding().bottom + input.padding().top) * input.strides_in_bytes().y()), weights_width(weights.dimension(width_idx)), weights_height(weights.dimension(height_idx)), weights_stride_y(weights.strides_in_bytes().y()), @@ -87,7 +94,12 @@ struct DepthwiseConvolutionRunInfo } }; -inline bool is_valid_input_region(int32_t base_w, uint32_t base_h, uint32_t w, uint32_t h, const DepthwiseConvolutionRunInfo &run_info, const Size2D &dilation) +inline bool is_valid_input_region(int32_t base_w, + uint32_t base_h, + uint32_t w, + uint32_t h, + const DepthwiseConvolutionRunInfo &run_info, + const Size2D &dilation) { const int32_t current_h = base_h + h * dilation.y(); const bool is_valid_h = current_h >= 0 && current_h < static_cast<int32_t>(run_info.input_height); @@ -99,8 +111,14 @@ inline bool is_valid_input_region(int32_t base_w, uint32_t base_h, uint32_t w, u } template <typename T> -void depthwise_loop_multiplier1_fp(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info, - const Size2D &dilation, const Window &window, bool has_biases) +void depthwise_loop_multiplier1_fp(const ITensor *src, + const ITensor *weights, + const ITensor *biases, + ITensor *dst, + const PadStrideInfo &conv_info, + const Size2D &dilation, + const Window &window, + bool has_biases) { constexpr auto element_per_vector = vector_size / sizeof(T); using VectorType = typename wrapper::traits::neon_vector<T, element_per_vector>::type; @@ -129,94 +147,112 @@ void depthwise_loop_multiplier1_fp(const ITensor *src, const ITensor *weights, c Iterator output_it(dst, win_output); Iterator biases_it{}; - if(has_biases) + if (has_biases) { biases_it = Iterator(biases, win_weights); } - execute_window_loop(execution_window, [&](const Coordinates & id) - { - const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; - const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; - const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; - - auto const base_weights_ptr = weights_it.ptr(); - uint32_t x = run_info.x_start; - - for(; x < run_info.x_leftover_start; x += run_info.x_step) + execute_window_loop( + execution_window, + [&](const Coordinates &id) { - VectorType acc = zero_vector; - auto weights_ptr = base_weights_ptr; - int64_t input_offset = base_input_offset; + const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; + const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; + const int64_t 
base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; + + auto const base_weights_ptr = weights_it.ptr(); + uint32_t x = run_info.x_start; - for(uint32_t h = 0; h < run_info.weights_height; ++h) + for (; x < run_info.x_leftover_start; x += run_info.x_step) { - int64_t offs = input_offset + x * sizeof(T); - for(uint32_t w = 0; w < run_info.weights_width; ++w) + VectorType acc = zero_vector; + auto weights_ptr = base_weights_ptr; + int64_t input_offset = base_input_offset; + + for (uint32_t h = 0; h < run_info.weights_height; ++h) { - const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); - const auto input_vals = is_valid_region ? - wrapper::vload(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) : - zero_vector; - const auto weights_vals = wrapper::vload(reinterpret_cast<T *>(weights_ptr + w * run_info.weights_stride_y) + x); - acc = wrapper::vmla(acc, weights_vals, input_vals); + int64_t offs = input_offset + x * sizeof(T); + for (uint32_t w = 0; w < run_info.weights_width; ++w) + { + const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); + const auto input_vals = + is_valid_region + ? wrapper::vload(reinterpret_cast<T *>( + input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) + : zero_vector; + const auto weights_vals = + wrapper::vload(reinterpret_cast<T *>(weights_ptr + w * run_info.weights_stride_y) + x); + acc = wrapper::vmla(acc, weights_vals, input_vals); + + offs += dilation.x() * run_info.input_stride_y; + } + + weights_ptr += run_info.weights_stride_z; + input_offset += dilation.y() * run_info.input_stride_z; + } - offs += dilation.x() * run_info.input_stride_y; + if (has_biases) + { + const auto biases_vals = wrapper::vload(reinterpret_cast<T *>(biases_it.ptr()) + x); + acc = wrapper::vadd(acc, biases_vals); } - weights_ptr += run_info.weights_stride_z; - input_offset += dilation.y() * run_info.input_stride_z; + wrapper::vstore(reinterpret_cast<T *>(output_it.ptr()) + x, acc); } - if(has_biases) + for (; x < run_info.x_end; ++x) { - const auto biases_vals = wrapper::vload(reinterpret_cast<T *>(biases_it.ptr()) + x); - acc = wrapper::vadd(acc, biases_vals); - } - - wrapper::vstore(reinterpret_cast<T *>(output_it.ptr()) + x, acc); - } + auto acc_scalar = T{0}; + auto weights_ptr = base_weights_ptr; + int64_t input_offset = base_input_offset; - for(; x < run_info.x_end; ++x) - { - auto acc_scalar = T{ 0 }; - auto weights_ptr = base_weights_ptr; - int64_t input_offset = base_input_offset; - - for(size_t h = 0; h < run_info.weights_height; ++h) - { - int64_t offs = input_offset + x * sizeof(T); - for(size_t w = 0; w < run_info.weights_width; ++w) + for (size_t h = 0; h < run_info.weights_height; ++h) { - const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); - const auto input_vals = is_valid_region ? 
*reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset)) : 0; - const auto weights_vals = *(reinterpret_cast<T *>(weights_ptr + w * run_info.weights_stride_y) + x); - - acc_scalar += (input_vals * weights_vals); - - offs += dilation.x() * run_info.input_stride_y; + int64_t offs = input_offset + x * sizeof(T); + for (size_t w = 0; w < run_info.weights_width; ++w) + { + const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); + const auto input_vals = + is_valid_region + ? *reinterpret_cast<T *>(input_it.ptr() + + std::min(static_cast<size_t>(offs), run_info.input_max_offset)) + : 0; + const auto weights_vals = + *(reinterpret_cast<T *>(weights_ptr + w * run_info.weights_stride_y) + x); + + acc_scalar += (input_vals * weights_vals); + + offs += dilation.x() * run_info.input_stride_y; + } + + weights_ptr += run_info.weights_stride_z; + input_offset += dilation.y() * run_info.input_stride_z; } - weights_ptr += run_info.weights_stride_z; - input_offset += dilation.y() * run_info.input_stride_z; - } - - if(has_biases) - { - const auto biases_vals = *(reinterpret_cast<T *>(biases_it.ptr()) + x); - acc_scalar += biases_vals; + if (has_biases) + { + const auto biases_vals = *(reinterpret_cast<T *>(biases_it.ptr()) + x); + acc_scalar += biases_vals; + } + *(reinterpret_cast<T *>(output_it.ptr()) + x) = acc_scalar; } - *(reinterpret_cast<T *>(output_it.ptr()) + x) = acc_scalar; - } - }, - input_it, weights_it, biases_it, output_it); + }, + input_it, weights_it, biases_it, output_it); } template <typename T> -void depthwise_loop_generic_fp(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info, - const Size2D &dilation, unsigned int depth_multiplier, const Window &window, bool has_biases) +void depthwise_loop_generic_fp(const ITensor *src, + const ITensor *weights, + const ITensor *biases, + ITensor *dst, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier, + const Window &window, + bool has_biases) { - const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier); + const auto run_info = + DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier); Window execution_window = window; execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1)); @@ -240,81 +276,98 @@ void depthwise_loop_generic_fp(const ITensor *src, const ITensor *weights, const Iterator output_it(dst, win_output); Iterator biases_it{}; - if(has_biases) + if (has_biases) { biases_it = Iterator(biases, win_weights); } - execute_window_loop(execution_window, [&](const Coordinates & id) - { - std::vector<T> acc(depth_multiplier, static_cast<T>(0)); + execute_window_loop( + execution_window, + [&](const Coordinates &id) + { + std::vector<T> acc(depth_multiplier, static_cast<T>(0)); - const int input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; - const int input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; - int input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; + const int input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; + const int input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; + int input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; - auto weights_ptr = weights_it.ptr(); - for(size_t h = 0; h < 
run_info.weights_height; ++h) - { - int offs = input_offset; - for(size_t w = 0; w < run_info.weights_width; ++w) + auto weights_ptr = weights_it.ptr(); + for (size_t h = 0; h < run_info.weights_height; ++h) { - const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); - const auto input_val = is_valid_region ? *(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) : T(0); - - for(size_t m = 0; m < depth_multiplier; ++m) + int offs = input_offset; + for (size_t w = 0; w < run_info.weights_width; ++w) { - const auto weights_val = *(reinterpret_cast<T *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y)); - acc.at(m) = support::cpp11::fma(weights_val, input_val, acc.at(m)); + const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); + const auto input_val = + is_valid_region ? *(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), + run_info.input_max_offset))) + : T(0); + + for (size_t m = 0; m < depth_multiplier; ++m) + { + const auto weights_val = + *(reinterpret_cast<T *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y)); + acc.at(m) = support::cpp11::fma(weights_val, input_val, acc.at(m)); + } + + offs += dilation.x() * run_info.input_stride_y; } - offs += dilation.x() * run_info.input_stride_y; + weights_ptr += run_info.weights_stride_z; + input_offset += dilation.y() * run_info.input_stride_z; } - weights_ptr += run_info.weights_stride_z; - input_offset += dilation.y() * run_info.input_stride_z; - } - - if(has_biases) - { - for(size_t m = 0; m < depth_multiplier; ++m) + if (has_biases) { - const auto biases_val = *(reinterpret_cast<T *>(biases_it.ptr() + m * sizeof(T))); - *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = acc.at(m) + biases_val; + for (size_t m = 0; m < depth_multiplier; ++m) + { + const auto biases_val = *(reinterpret_cast<T *>(biases_it.ptr() + m * sizeof(T))); + *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = acc.at(m) + biases_val; + } } - } - else - { - for(size_t m = 0; m < depth_multiplier; ++m) + else { - *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = acc.at(m); + for (size_t m = 0; m < depth_multiplier; ++m) + { + *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = acc.at(m); + } } - } - }, - input_it, weights_it, biases_it, output_it); + }, + input_it, weights_it, biases_it, output_it); } template <typename T, typename TW> -void run_depthwise_float(const ITensor *src, const ITensor *weights, const ITensor *biases, - ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info) +void run_depthwise_float(const ITensor *src, + const ITensor *weights, + const ITensor *biases, + ITensor *dst, + const Window &window, + bool has_biases, + const ConvolutionInfo &info) { PadStrideInfo conv_info = info.pad_stride_info; unsigned int depth_multiplier = info.depth_multiplier; Size2D dilation = info.dilation; - if(depth_multiplier == 1) + if (depth_multiplier == 1) { depthwise_loop_multiplier1_fp<T>(src, weights, biases, dst, conv_info, dilation, window, has_biases); } else { - depthwise_loop_generic_fp<T>(src, weights, biases, dst, conv_info, dilation, depth_multiplier, window, has_biases); + depthwise_loop_generic_fp<T>(src, weights, biases, dst, conv_info, dilation, depth_multiplier, window, + has_biases); } } template <typename T, typename TW> -void run_depthwise_quanitized8bit(const ITensor *src, const ITensor *weights, const ITensor 
*biases, - ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info); +void run_depthwise_quanitized8bit(const ITensor *src, + const ITensor *weights, + const ITensor *biases, + ITensor *dst, + const Window &window, + bool has_biases, + const ConvolutionInfo &info); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8.cpp b/src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8.cpp index 1bf7ad7007..d32847c1e8 100644 --- a/src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8.cpp +++ b/src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8.cpp @@ -26,16 +26,26 @@ namespace arm_compute { namespace cpu { -void neon_qu8_deptwiseconv2dnative(const ITensor *src, const ITensor *weights, const ITensor *bias, - ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info) +void neon_qu8_deptwiseconv2dnative(const ITensor *src, + const ITensor *weights, + const ITensor *bias, + ITensor *dst, + const Window &window, + bool has_biases, + const ConvolutionInfo &info) { return run_depthwise_quanitized8bit<uint8_t, uint8_t>(src, weights, bias, dst, window, has_biases, info); } -void neon_qp8_qu8_deptwiseconv2dnative(const ITensor *src, const ITensor *weights, const ITensor *bias, - ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info) +void neon_qp8_qu8_deptwiseconv2dnative(const ITensor *src, + const ITensor *weights, + const ITensor *bias, + ITensor *dst, + const Window &window, + bool has_biases, + const ConvolutionInfo &info) { return run_depthwise_quanitized8bit<uint8_t, int8_t>(src, weights, bias, dst, window, has_biases, info); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8_signed.cpp index 58f7536064..682fad0bda 100644 --- a/src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8_signed.cpp +++ b/src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8_signed.cpp @@ -26,16 +26,26 @@ namespace arm_compute { namespace cpu { -void neon_qs8_deptwiseconv2dnative(const ITensor *src, const ITensor *weights, const ITensor *bias, - ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info) +void neon_qs8_deptwiseconv2dnative(const ITensor *src, + const ITensor *weights, + const ITensor *bias, + ITensor *dst, + const Window &window, + bool has_biases, + const ConvolutionInfo &info) { return run_depthwise_quanitized8bit<int8_t, int8_t>(src, weights, bias, dst, window, has_biases, info); } -void neon_qp8_qs8_deptwiseconv2dnative(const ITensor *src, const ITensor *weights, const ITensor *bias, - ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info) +void neon_qp8_qs8_deptwiseconv2dnative(const ITensor *src, + const ITensor *weights, + const ITensor *bias, + ITensor *dst, + const Window &window, + bool has_biases, + const ConvolutionInfo &info) { return run_depthwise_quanitized8bit<int8_t, int8_t>(src, weights, bias, dst, window, has_biases, info); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/depthwiseconv2d/list.h b/src/cpu/kernels/depthwiseconv2d/list.h index 44f055d6a9..cf80608f4f 100644 --- a/src/cpu/kernels/depthwiseconv2d/list.h +++ b/src/cpu/kernels/depthwiseconv2d/list.h @@ -27,9 +27,9 @@ namespace arm_compute { namespace cpu { -#define DECLARE_DEPTHWISECONV2D_KERNEL(func_name) \ - void func_name(const ITensor *src, const ITensor *weights, const ITensor *bias, \ - 
ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info) +#define DECLARE_DEPTHWISECONV2D_KERNEL(func_name) \ + void func_name(const ITensor *src, const ITensor *weights, const ITensor *bias, ITensor *dst, \ + const Window &window, bool has_biases, const ConvolutionInfo &info) DECLARE_DEPTHWISECONV2D_KERNEL(neon_qu8_deptwiseconv2dnative); DECLARE_DEPTHWISECONV2D_KERNEL(neon_qs8_deptwiseconv2dnative); DECLARE_DEPTHWISECONV2D_KERNEL(neon_fp16_deptwiseconv2dnative); diff --git a/src/cpu/kernels/directconv2d/list.h b/src/cpu/kernels/directconv2d/list.h index 9a0472643d..5cbf7a36c6 100644 --- a/src/cpu/kernels/directconv2d/list.h +++ b/src/cpu/kernels/directconv2d/list.h @@ -32,8 +32,9 @@ namespace cpu { namespace kernels { -#define DECLARE_DIRECT_CONV2D_KERNEL(func_name) \ - void func_name(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) +#define DECLARE_DIRECT_CONV2D_KERNEL(func_name) \ + void func_name(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, \ + const PadStrideInfo &conv_info) DECLARE_DIRECT_CONV2D_KERNEL(neon_fp32_nhwc_directconv2d); DECLARE_DIRECT_CONV2D_KERNEL(neon_fp16_nchw_directconv2d); diff --git a/src/cpu/kernels/directconv2d/nchw/all.cpp b/src/cpu/kernels/directconv2d/nchw/all.cpp index a719fa50d6..218a4b7ee4 100644 --- a/src/cpu/kernels/directconv2d/nchw/all.cpp +++ b/src/cpu/kernels/directconv2d/nchw/all.cpp @@ -22,18 +22,17 @@ * SOFTWARE. */ -#include "src/cpu/kernels/directconv2d/nhwc/neon/impl.h" - -#include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h" -#include "src/core/NEON/wrapper/wrapper.h" - #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/IAccessWindow.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" + #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/kernels/directconv2d/nhwc/neon/impl.h" #include <algorithm> @@ -44,22 +43,26 @@ namespace cpu namespace kernels { template <typename T> -void convolve_nchw(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info); +void convolve_nchw( + const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info); #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) -void neon_fp16_nchw_directconv2d(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) +void neon_fp16_nchw_directconv2d( + const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) { convolve_nchw<float16_t>(window, src, weights, dst, conv_info); } #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ -void neon_fp32_nchw_directconv2d(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) +void neon_fp32_nchw_directconv2d( + const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) { convolve_nchw<float>(window, src, weights, dst, conv_info); } template <typename T> -void convolve_nchw(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) +void convolve_nchw( + const Window &window, const ITensor 
*src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) { ARM_COMPUTE_UNUSED(conv_info); @@ -107,72 +110,81 @@ void convolve_nchw(const Window &window, const ITensor *src, const ITensor *weig constexpr int num_elems_read_per_iteration = 16 / sizeof(T); - execute_window_loop(window_out, [&](const Coordinates & id) - { - // We are computing the theoretical starting input starting points - const int in_w_start_t = static_cast<int>(id.x()) * conv_stride_w - conv_pad_left; - const int in_h_start_t = static_cast<int>(id.y()) * conv_stride_h - conv_pad_top; - const int in_w_end_t = in_w_start_t + kernel_dim_w; - const int in_h_end_t = in_h_start_t + kernel_dim_h; - - // We are computing the valid initial and ending input points by checking the borders - const int in_w_start = std::max(in_w_start_t, 0); - const int in_h_start = std::max(in_h_start_t, 0); - const int in_w_end = std::min(in_w_end_t, input_dim_w); - const int in_h_end = std::min(in_h_end_t, input_dim_h); - - // We use the input points to select the valid weight points to use - const int wei_w_start = in_w_start - in_w_start_t; - const int wei_h_start = in_h_start - in_h_start_t; - const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end); - - const int index_c_end = weights->info()->dimension(2); - const T *const in_ptr_start = reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) + id[3] * input_stride_n; - execute_window_loop(window_w, [&](const Coordinates & id_w) + execute_window_loop( + window_out, + [&](const Coordinates &id) { - const T *const weights_ptr_start = reinterpret_cast<const T *>(wei.ptr()); - uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c; - T out_temp = static_cast<T>(0); - - for(int index_wei_c = 0, index_in_c = 0; index_wei_c < index_c_end; ++index_wei_c, ++index_in_c) - { - const T *const in_ptr_row_0 = in_ptr_start + index_in_c * input_stride_c; - const T *const weights_ptr_row_0 = weights_ptr_start + index_wei_c * kernel_stride_c; - for(int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; ++index_wei_h, ++index_in_h) + // We are computing the theoretical starting input starting points + const int in_w_start_t = static_cast<int>(id.x()) * conv_stride_w - conv_pad_left; + const int in_h_start_t = static_cast<int>(id.y()) * conv_stride_h - conv_pad_top; + const int in_w_end_t = in_w_start_t + kernel_dim_w; + const int in_h_end_t = in_h_start_t + kernel_dim_h; + + // We are computing the valid initial and ending input points by checking the borders + const int in_w_start = std::max(in_w_start_t, 0); + const int in_h_start = std::max(in_h_start_t, 0); + const int in_w_end = std::min(in_w_end_t, input_dim_w); + const int in_h_end = std::min(in_h_end_t, input_dim_h); + + // We use the input points to select the valid weight points to use + const int wei_w_start = in_w_start - in_w_start_t; + const int wei_h_start = in_h_start - in_h_start_t; + const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end); + + const int index_c_end = weights->info()->dimension(2); + const T *const in_ptr_start = + reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) + + id[3] * input_stride_n; + execute_window_loop( + window_w, + [&](const Coordinates &id_w) { - const T *in_ptr_row = in_ptr_row_0 + index_in_h * input_stride_h; - const T *weights_ptr_row = weights_ptr_row_0 + index_wei_h * kernel_stride_h; - int index_w = in_w_start; - int index_wei_w = wei_w_start; - vector_type out_temp_vec = 
wrapper::vdup_n(static_cast<T>(0), tag_type()); - for(; index_w <= ((in_w_end - num_elems_read_per_iteration)); index_w += num_elems_read_per_iteration, index_wei_w += num_elems_read_per_iteration) - { - const auto src_vec = wrapper::vloadq(in_ptr_row + index_w * input_stride_w); - const auto w_vec = wrapper::vloadq(weights_ptr_row + index_wei_w * kernel_stride_w); - out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec); - } - out_temp += vreduce(out_temp_vec); - for(; index_w < in_w_end; ++index_w, ++index_wei_w) + const T *const weights_ptr_start = reinterpret_cast<const T *>(wei.ptr()); + uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c; + T out_temp = static_cast<T>(0); + + for (int index_wei_c = 0, index_in_c = 0; index_wei_c < index_c_end; ++index_wei_c, ++index_in_c) { - const auto src_val = *(in_ptr_row + index_w * input_stride_w); - const auto w_val = *(weights_ptr_row + index_wei_w * kernel_stride_w); - out_temp += src_val * w_val; + const T *const in_ptr_row_0 = in_ptr_start + index_in_c * input_stride_c; + const T *const weights_ptr_row_0 = weights_ptr_start + index_wei_c * kernel_stride_c; + for (int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; + ++index_wei_h, ++index_in_h) + { + const T *in_ptr_row = in_ptr_row_0 + index_in_h * input_stride_h; + const T *weights_ptr_row = weights_ptr_row_0 + index_wei_h * kernel_stride_h; + int index_w = in_w_start; + int index_wei_w = wei_w_start; + vector_type out_temp_vec = wrapper::vdup_n(static_cast<T>(0), tag_type()); + for (; index_w <= ((in_w_end - num_elems_read_per_iteration)); + index_w += num_elems_read_per_iteration, index_wei_w += num_elems_read_per_iteration) + { + const auto src_vec = wrapper::vloadq(in_ptr_row + index_w * input_stride_w); + const auto w_vec = wrapper::vloadq(weights_ptr_row + index_wei_w * kernel_stride_w); + out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec); + } + out_temp += vreduce(out_temp_vec); + for (; index_w < in_w_end; ++index_w, ++index_wei_w) + { + const auto src_val = *(in_ptr_row + index_w * input_stride_w); + const auto w_val = *(weights_ptr_row + index_wei_w * kernel_stride_w); + out_temp += src_val * w_val; + } + } } - } - } - *(reinterpret_cast<T *>(out_ptr)) = out_temp; - + *(reinterpret_cast<T *>(out_ptr)) = out_temp; + }, + wei); }, - wei); - }, - out); + out); } #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -template void convolve_nchw<float16_t>(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info); +template void convolve_nchw<float16_t>( + const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info); #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ -template void convolve_nchw<float>(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info); +template void convolve_nchw<float>( + const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info); } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/directconv2d/nhwc/neon/fp32.cpp b/src/cpu/kernels/directconv2d/nhwc/neon/fp32.cpp index 9982431de5..36a8e76f13 100644 --- a/src/cpu/kernels/directconv2d/nhwc/neon/fp32.cpp +++ b/src/cpu/kernels/directconv2d/nhwc/neon/fp32.cpp @@ -30,10 +30,11 @@ namespace cpu { namespace kernels { -void neon_fp32_nhwc_directconv2d(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo 
&conv_info) +void neon_fp32_nhwc_directconv2d( + const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) { convolve_nhwc<float>(window, src, weights, dst, conv_info); } } // namespace kernels } // namespace cpu -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/kernels/directconv2d/nhwc/neon/impl.cpp b/src/cpu/kernels/directconv2d/nhwc/neon/impl.cpp index 500ad1b420..f235167e28 100644 --- a/src/cpu/kernels/directconv2d/nhwc/neon/impl.cpp +++ b/src/cpu/kernels/directconv2d/nhwc/neon/impl.cpp @@ -24,16 +24,16 @@ #include "src/cpu/kernels/directconv2d/nhwc/neon/impl.h" -#include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h" -#include "src/core/NEON/wrapper/wrapper.h" - #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/IAccessWindow.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" + #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h" +#include "src/core/NEON/wrapper/wrapper.h" #include <algorithm> @@ -49,12 +49,14 @@ namespace { bool have_zero_x_internal_padding(ITensorInfo *src, const ITensorInfo *weights) { - return (src->padding().left == 0 && weights->padding().left == 0 && src->padding().right == 0 && weights->padding().right == 0); -} + return (src->padding().left == 0 && weights->padding().left == 0 && src->padding().right == 0 && + weights->padding().right == 0); } +} // namespace template <typename T> -void convolve_nhwc(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) +void convolve_nhwc( + const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) { // Declare useful types using vtype = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>; @@ -97,7 +99,7 @@ void convolve_nhwc(const Window &window, const ITensor *src, const ITensor *weig constexpr int num_elems_read_per_iteration = 16 / sizeof(T); // nhwc optimized - if(have_zero_x_internal_padding(src->info(), weights->info())) + if (have_zero_x_internal_padding(src->info(), weights->info())) { // This function assumes that input and weights have not padding in channel @@ -114,138 +116,154 @@ void convolve_nhwc(const Window &window, const ITensor *src, const ITensor *weig * multiplication works on the correct input/weight elements. */ execute_window_loop( - window_out, [&](const Coordinates & id) - { - /* + window_out, + [&](const Coordinates &id) + { + /* * In here we create theoretical indexes which then we validate for both * inputs and weights. * As a reminder, this loop take each output point in NHW, C is treated * in the weights loop. 
*/ - // We are computing the theoretical starting input starting points - const int in_w_start_t = static_cast<int>(id.y()) * conv_stride_w - conv_pad_left; - const int in_h_start_t = static_cast<int>(id.z()) * conv_stride_h - conv_pad_top; - const int in_w_end_t = in_w_start_t + kernel_dim_w; - const int in_h_end_t = in_h_start_t + kernel_dim_h; - - // We are computing the valid initial and ending input points by checking the borders - const int in_w_start = std::max(in_w_start_t, 0); - const int in_h_start = std::max(in_h_start_t, 0); - const int in_w_end = std::min(in_w_end_t, input_dim_w); - const int in_h_end = std::min(in_h_end_t, input_dim_h); - - // We use the input points to select the valid weight points to use - const int index_wc_start = (in_w_start - in_w_start_t) * kernel_stride_w; - const int index_h_start = in_h_start - in_h_start_t; - const int index_wc_end = (kernel_dim_w - (in_w_end_t - in_w_end)) * kernel_stride_w; - const int index_h_end = kernel_dim_h - (in_h_end_t - in_h_end); - - execute_window_loop( - window_w, [&](const Coordinates & id_w) - { - /* + // We are computing the theoretical starting input starting points + const int in_w_start_t = static_cast<int>(id.y()) * conv_stride_w - conv_pad_left; + const int in_h_start_t = static_cast<int>(id.z()) * conv_stride_h - conv_pad_top; + const int in_w_end_t = in_w_start_t + kernel_dim_w; + const int in_h_end_t = in_h_start_t + kernel_dim_h; + + // We are computing the valid initial and ending input points by checking the borders + const int in_w_start = std::max(in_w_start_t, 0); + const int in_h_start = std::max(in_h_start_t, 0); + const int in_w_end = std::min(in_w_end_t, input_dim_w); + const int in_h_end = std::min(in_h_end_t, input_dim_h); + + // We use the input points to select the valid weight points to use + const int index_wc_start = (in_w_start - in_w_start_t) * kernel_stride_w; + const int index_h_start = in_h_start - in_h_start_t; + const int index_wc_end = (kernel_dim_w - (in_w_end_t - in_w_end)) * kernel_stride_w; + const int index_h_end = kernel_dim_h - (in_h_end_t - in_h_end); + + execute_window_loop( + window_w, + [&](const Coordinates &id_w) + { + /* * This is the loop in the weights, and it goes along N (the batches) * As a reminder, the batches of the weights are translated into the * channels of the output */ - const T *in_ptr_row = reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) - + id[3] * input_stride_n + in_w_start * input_stride_w + in_h_start * input_stride_h; - const T *weights_ptr_row = reinterpret_cast<const T *>(wei.ptr()) + index_h_start * kernel_stride_h; - uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c; - - T out_temp = static_cast<T>(0); - for(int index_h = index_h_start; index_h < index_h_end; ++index_h, in_ptr_row += input_stride_h, weights_ptr_row += kernel_stride_h) - { - const T *in_ptr_mover = in_ptr_row; - int index_wc = index_wc_start; - vector_type out_temp_vec = wrapper::vdup_n(static_cast<T>(0), tag_type()); - for(; index_wc <= index_wc_end - num_elems_read_per_iteration; index_wc += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration) - { - const auto src_vec = wrapper::vloadq(in_ptr_mover); - const auto w_vec = wrapper::vloadq(weights_ptr_row + index_wc); - out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec); - } - out_temp += vreduce(out_temp_vec); - for(; index_wc < index_wc_end; ++index_wc, ++in_ptr_mover) - { - const auto src_val = *(in_ptr_mover); - const auto w_val = 
*(weights_ptr_row + index_wc); - out_temp += src_val * w_val; - } - } - *(reinterpret_cast<T *>(out_ptr)) = out_temp; + const T *in_ptr_row = + reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) + + id[3] * input_stride_n + in_w_start * input_stride_w + in_h_start * input_stride_h; + const T *weights_ptr_row = + reinterpret_cast<const T *>(wei.ptr()) + index_h_start * kernel_stride_h; + uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c; + + T out_temp = static_cast<T>(0); + for (int index_h = index_h_start; index_h < index_h_end; + ++index_h, in_ptr_row += input_stride_h, weights_ptr_row += kernel_stride_h) + { + const T *in_ptr_mover = in_ptr_row; + int index_wc = index_wc_start; + vector_type out_temp_vec = wrapper::vdup_n(static_cast<T>(0), tag_type()); + for (; index_wc <= index_wc_end - num_elems_read_per_iteration; + index_wc += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration) + { + const auto src_vec = wrapper::vloadq(in_ptr_mover); + const auto w_vec = wrapper::vloadq(weights_ptr_row + index_wc); + out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec); + } + out_temp += vreduce(out_temp_vec); + for (; index_wc < index_wc_end; ++index_wc, ++in_ptr_mover) + { + const auto src_val = *(in_ptr_mover); + const auto w_val = *(weights_ptr_row + index_wc); + out_temp += src_val * w_val; + } + } + *(reinterpret_cast<T *>(out_ptr)) = out_temp; + }, + wei); }, - wei); - }, - out); + out); } else // nhwc non optimized { execute_window_loop( - window_out, [&](const Coordinates & id) - { - // We are computing the theoretical starting input starting points - const int in_w_start_t = static_cast<int>(id.y()) * conv_stride_w - conv_pad_left; - const int in_h_start_t = static_cast<int>(id.z()) * conv_stride_h - conv_pad_top; - const int in_w_end_t = in_w_start_t + kernel_dim_w; - const int in_h_end_t = in_h_start_t + kernel_dim_h; - - // We are computing the valid initial and ending input points by checking the borders - const int in_w_start = std::max(in_w_start_t, 0); - const int in_h_start = std::max(in_h_start_t, 0); - const int in_w_end = std::min(in_w_end_t, input_dim_w); - const int in_h_end = std::min(in_h_end_t, input_dim_h); - - // We use the input points to select the valid weight points to use - const int wei_w_start = in_w_start - in_w_start_t; - const int wei_h_start = in_h_start - in_h_start_t; - const int wei_w_end = kernel_dim_w - (in_w_end_t - in_w_end); - const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end); - - const int index_c_end = weights->info()->dimension(0); - const T *const in_ptr_start = reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) + id[3] * input_stride_n; - - execute_window_loop( - window_w, [&](const Coordinates & id_w) + window_out, + [&](const Coordinates &id) { - const T *const weights_ptr_start = reinterpret_cast<const T *>(wei.ptr()); - uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c; - - T out_temp = static_cast<T>(0); - for(int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; ++index_wei_h, ++index_in_h) - { - const T *const in_ptr_row = in_ptr_start + index_in_h * input_stride_h; - const T *const weights_ptr_row = weights_ptr_start + index_wei_h * kernel_stride_h; - for(int index_wei_w = wei_w_start, index_in_w = in_w_start; index_wei_w < wei_w_end; ++index_wei_w, ++index_in_w) + // We are computing the theoretical starting input starting points + const int in_w_start_t = static_cast<int>(id.y()) 
* conv_stride_w - conv_pad_left; + const int in_h_start_t = static_cast<int>(id.z()) * conv_stride_h - conv_pad_top; + const int in_w_end_t = in_w_start_t + kernel_dim_w; + const int in_h_end_t = in_h_start_t + kernel_dim_h; + + // We are computing the valid initial and ending input points by checking the borders + const int in_w_start = std::max(in_w_start_t, 0); + const int in_h_start = std::max(in_h_start_t, 0); + const int in_w_end = std::min(in_w_end_t, input_dim_w); + const int in_h_end = std::min(in_h_end_t, input_dim_h); + + // We use the input points to select the valid weight points to use + const int wei_w_start = in_w_start - in_w_start_t; + const int wei_h_start = in_h_start - in_h_start_t; + const int wei_w_end = kernel_dim_w - (in_w_end_t - in_w_end); + const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end); + + const int index_c_end = weights->info()->dimension(0); + const T *const in_ptr_start = + reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) + + id[3] * input_stride_n; + + execute_window_loop( + window_w, + [&](const Coordinates &id_w) { - const T *in_ptr_mover = in_ptr_row + index_in_w * input_stride_w; - const T *weights_ptr_mover = weights_ptr_row + index_wei_w * kernel_stride_w; - int index_c = 0; - vector_type out_temp_vec = wrapper::vdup_n(static_cast<T>(0), tag_type()); - for(; index_c <= index_c_end - num_elems_read_per_iteration; index_c += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration, weights_ptr_mover += num_elems_read_per_iteration) - { - const auto src_vec = wrapper::vloadq(in_ptr_mover); - const auto w_vec = wrapper::vloadq(weights_ptr_mover); - out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec); - } - out_temp += vreduce(out_temp_vec); - for(; index_c < index_c_end; ++index_c, ++in_ptr_mover, ++weights_ptr_mover) + const T *const weights_ptr_start = reinterpret_cast<const T *>(wei.ptr()); + uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c; + + T out_temp = static_cast<T>(0); + for (int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; + ++index_wei_h, ++index_in_h) { - const auto src_val = *(in_ptr_mover); - const auto w_val = *(weights_ptr_mover); - out_temp += src_val * w_val; + const T *const in_ptr_row = in_ptr_start + index_in_h * input_stride_h; + const T *const weights_ptr_row = weights_ptr_start + index_wei_h * kernel_stride_h; + for (int index_wei_w = wei_w_start, index_in_w = in_w_start; index_wei_w < wei_w_end; + ++index_wei_w, ++index_in_w) + { + const T *in_ptr_mover = in_ptr_row + index_in_w * input_stride_w; + const T *weights_ptr_mover = weights_ptr_row + index_wei_w * kernel_stride_w; + int index_c = 0; + vector_type out_temp_vec = wrapper::vdup_n(static_cast<T>(0), tag_type()); + for (; index_c <= index_c_end - num_elems_read_per_iteration; + index_c += num_elems_read_per_iteration, + in_ptr_mover += num_elems_read_per_iteration, + weights_ptr_mover += num_elems_read_per_iteration) + { + const auto src_vec = wrapper::vloadq(in_ptr_mover); + const auto w_vec = wrapper::vloadq(weights_ptr_mover); + out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec); + } + out_temp += vreduce(out_temp_vec); + for (; index_c < index_c_end; ++index_c, ++in_ptr_mover, ++weights_ptr_mover) + { + const auto src_val = *(in_ptr_mover); + const auto w_val = *(weights_ptr_mover); + out_temp += src_val * w_val; + } + } } - } - } - *(reinterpret_cast<T *>(out_ptr)) = out_temp; + *(reinterpret_cast<T *>(out_ptr)) = out_temp; + }, + wei); }, - 
wei); - }, - out); + out); } } -template void convolve_nhwc<float>(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info); +template void convolve_nhwc<float>( + const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info); } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/directconv2d/nhwc/neon/impl.h b/src/cpu/kernels/directconv2d/nhwc/neon/impl.h index 3b26fcdf29..efb9ce8e2a 100644 --- a/src/cpu/kernels/directconv2d/nhwc/neon/impl.h +++ b/src/cpu/kernels/directconv2d/nhwc/neon/impl.h @@ -26,6 +26,7 @@ #define SRC_CORE_NEON_KERNELS_CONV2D_IMPL_H #include "arm_compute/core/ITensor.h" + #include "src/core/helpers/WindowHelpers.h" namespace arm_compute @@ -35,7 +36,8 @@ namespace cpu namespace kernels { template <typename T> -void convolve_nhwc(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info); +void convolve_nhwc( + const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info); } // namespace kernels } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/fp16.cpp b/src/cpu/kernels/elementwise_binary/generic/neon/fp16.cpp index 6091ef215e..9b4375f17c 100644 --- a/src/cpu/kernels/elementwise_binary/generic/neon/fp16.cpp +++ b/src/cpu/kernels/elementwise_binary/generic/neon/fp16.cpp @@ -23,6 +23,7 @@ */ #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/elementwise_binary/generic/neon/impl.h" namespace arm_compute @@ -35,14 +36,38 @@ void neon_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITenso return elementwise_arithm_op<op, typename wrapper::traits::neon_vector<float16_t, 8>>(in1, in2, out, window); } -template void neon_fp16_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp16_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp16_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp16_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp16_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp16_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp16_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp16_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_fp16_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp16_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp16_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, + const ITensor *in2, + ITensor 
*out, + const Window &window); +template void neon_fp16_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp16_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp16_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp16_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp16_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); template <ComparisonOperation op> void neon_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) @@ -50,12 +75,30 @@ void neon_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor * return elementwise_comp_op_16<op, float16_t, float16x8_t>(in1, in2, out, window); } -template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -} +template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +} // namespace cpu } // namespace arm_compute #endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/fp32.cpp b/src/cpu/kernels/elementwise_binary/generic/neon/fp32.cpp index 2d8fec91c5..53ccd89dcc 100644 --- a/src/cpu/kernels/elementwise_binary/generic/neon/fp32.cpp +++ 
b/src/cpu/kernels/elementwise_binary/generic/neon/fp32.cpp @@ -22,6 +22,7 @@ * SOFTWARE. */ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/elementwise_binary/generic/neon/impl.h" namespace arm_compute @@ -34,25 +35,67 @@ void neon_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITenso return elementwise_arithm_op<op, typename wrapper::traits::neon_vector<float, 4>>(in1, in2, out, window); } -template void neon_fp32_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp32_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp32_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp32_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp32_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp32_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp32_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp32_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_fp32_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp32_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp32_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp32_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp32_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp32_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp32_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp32_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); template <ComparisonOperation op> void neon_fp32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) { return elementwise_comp_op_32<op, float, float32x4_t>(in1, in2, out, window); } -template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, 
const Window &window); -template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -} +template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/impl.h b/src/cpu/kernels/elementwise_binary/generic/neon/impl.h index 98b154e8fd..98f7e8b949 100644 --- a/src/cpu/kernels/elementwise_binary/generic/neon/impl.h +++ b/src/cpu/kernels/elementwise_binary/generic/neon/impl.h @@ -39,7 +39,7 @@ typename VectorType::type elementwise_arithm_op(const typename VectorType::type vec_type res = wrapper::vdup_n(static_cast<scalar_type>(0), tag_type{}); - switch(op) + switch (op) { case ArithmeticOperation::MAX: res = wrapper::vmax(a, b); @@ -71,7 +71,9 @@ typename VectorType::type elementwise_arithm_op(const typename VectorType::type } template <ArithmeticOperation op, typename ScalarType, typename VectorType> -typename VectorType::type elementwise_arithm_op_broadcast(const typename VectorType::type &a, const ScalarType &broadcast_value, const bool reorder) +typename VectorType::type elementwise_arithm_op_broadcast(const typename VectorType::type &a, + const ScalarType &broadcast_value, + const bool reorder) { using tag_type = typename VectorType::tag_type; using vec_type = typename VectorType::type; @@ -81,10 +83,15 @@ typename VectorType::type elementwise_arithm_op_broadcast(const typename VectorT } template <typename InputScalarType, typename OutputScalarType, typename InputVectorType> -void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, - OutputScalarType (*scalar_func)(const InputScalarType &, const InputScalarType &), - int (*broadcast_func)(int, int, int, const InputScalarType *, const InputScalarType &, OutputScalarType *, const bool), - int (*neon_func)(int, int, int, const InputScalarType *, const InputScalarType *, OutputScalarType *)) +void elementwise_op( + const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window, + OutputScalarType (*scalar_func)(const InputScalarType &, const InputScalarType &), + int (*broadcast_func)( + int, int, int, const InputScalarType *, const 
InputScalarType &, OutputScalarType *, const bool), + int (*neon_func)(int, int, int, const InputScalarType *, const InputScalarType *, OutputScalarType *)) { // Create input windows Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); @@ -99,7 +106,7 @@ void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const const auto window_end_x = static_cast<int>(window.x().end()); const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); - if(is_broadcast_across_x) + if (is_broadcast_across_x) { const bool is_broadcast_input_2 = input2_win.x().step() == 0; Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; @@ -114,20 +121,26 @@ void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); Iterator output(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr()); - const auto non_broadcast_input_ptr = reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr()); - const InputScalarType broadcast_value = *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr()); - - int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_value, output_ptr, !is_broadcast_input_2); - for(; x < window_end_x; ++x) + execute_window_loop( + win, + [&](const Coordinates &) { - const auto a = *(non_broadcast_input_ptr + x); - *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? broadcast_value : a, !is_broadcast_input_2 ? a : broadcast_value); - } - }, - broadcast_input, non_broadcast_input, output); + auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr()); + const auto non_broadcast_input_ptr = + reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr()); + const InputScalarType broadcast_value = + *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr()); + + int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, + broadcast_value, output_ptr, !is_broadcast_input_2); + for (; x < window_end_x; ++x) + { + const auto a = *(non_broadcast_input_ptr + x); + *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? broadcast_value : a, + !is_broadcast_input_2 ? 
a : broadcast_value); + } + }, + broadcast_input, non_broadcast_input, output); } else { @@ -139,21 +152,23 @@ void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Iterator input2(in2, input2_win); Iterator output(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr()); - const auto input1_ptr = reinterpret_cast<const InputScalarType *>(input1.ptr()); - const auto input2_ptr = reinterpret_cast<const InputScalarType *>(input2.ptr()); - - int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr); - for(; x < window_end_x; ++x) + execute_window_loop( + win, + [&](const Coordinates &) { - const auto a = *(input1_ptr + x); - const auto b = *(input2_ptr + x); - *(output_ptr + x) = (*scalar_func)(a, b); - } - }, - input1, input2, output); + auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr()); + const auto input1_ptr = reinterpret_cast<const InputScalarType *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const InputScalarType *>(input2.ptr()); + + int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr); + for (; x < window_end_x; ++x) + { + const auto a = *(input1_ptr + x); + const auto b = *(input2_ptr + x); + *(output_ptr + x) = (*scalar_func)(a, b); + } + }, + input1, input2, output); } } @@ -162,7 +177,7 @@ inline ScalarType elementwise_arithm_op_scalar(const ScalarType &a, const Scalar { auto res = ScalarType(0); - switch(op) + switch (op) { case ArithmeticOperation::MAX: res = std::max(a, b); @@ -183,10 +198,10 @@ inline ScalarType elementwise_arithm_op_scalar(const ScalarType &a, const Scalar case ArithmeticOperation::DIV: { res = a / b; - if(std::is_integral<ScalarType>::value) + if (std::is_integral<ScalarType>::value) { res = (b == 0) ? 
0 : res; - if(static_cast<int32_t>(a) % static_cast<int32_t>(b) != 0 && ((a < 0) != (b < 0))) + if (static_cast<int32_t>(a) % static_cast<int32_t>(b) != 0 && ((a < 0) != (b < 0))) { --res; } @@ -205,43 +220,56 @@ inline ScalarType elementwise_arithm_op_scalar(const ScalarType &a, const Scalar } template <> -inline int32x4_t elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<int32_t, 4>>(const int32x4_t &a, const int32x4_t &b) +inline int32x4_t +elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<int32_t, 4>>(const int32x4_t &a, + const int32x4_t &b) { return vcvtq_s32_f32(vfloorq_f32(wrapper::vdiv(vcvtq_f32_s32(a), vcvtq_f32_s32(b)))); } template <> -inline float32x4_t elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<float, 4>>(const float32x4_t &a, const float32x4_t &b) +inline float32x4_t +elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<float, 4>>(const float32x4_t &a, + const float32x4_t &b) { return wrapper::vdiv(a, b); } template <> -inline float32x4_t elementwise_arithm_op<ArithmeticOperation::POWER, typename wrapper::traits::neon_vector<float, 4>>(const float32x4_t &a, const float32x4_t &b) +inline float32x4_t +elementwise_arithm_op<ArithmeticOperation::POWER, typename wrapper::traits::neon_vector<float, 4>>(const float32x4_t &a, + const float32x4_t &b) { return wrapper::vpow(a, b); } #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC template <> -inline float16x8_t elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<float16_t, 8>>(const float16x8_t &a, const float16x8_t &b) +inline float16x8_t elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<float16_t, 8>>( + const float16x8_t &a, const float16x8_t &b) { return wrapper::vdiv(a, b); } template <> -inline float16x8_t elementwise_arithm_op<ArithmeticOperation::POWER, typename wrapper::traits::neon_vector<float16_t, 8>>(const float16x8_t &a, const float16x8_t &b) +inline float16x8_t +elementwise_arithm_op<ArithmeticOperation::POWER, typename wrapper::traits::neon_vector<float16_t, 8>>( + const float16x8_t &a, const float16x8_t &b) { return wrapper::vpow(a, b); } #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC template <ArithmeticOperation op, typename ScalarType, typename VectorType> -inline int elementwise_arithm_op_loop(int window_start_x, int window_end_x, int window_step_x, - const ScalarType *input1_ptr, const ScalarType *input2_ptr, ScalarType *output_ptr) +inline int elementwise_arithm_op_loop(int window_start_x, + int window_end_x, + int window_step_x, + const ScalarType *input1_ptr, + const ScalarType *input2_ptr, + ScalarType *output_ptr) { int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + for (; x <= (window_end_x - window_step_x); x += window_step_x) { const auto a = wrapper::vloadq(input1_ptr + x); const auto b = wrapper::vloadq(input2_ptr + x); @@ -251,14 +279,20 @@ inline int elementwise_arithm_op_loop(int window_start_x, int window_end_x, int } template <ArithmeticOperation op, typename ScalarType, typename VectorType> -inline int elementwise_arithm_op_broadcast_loop(int window_start_x, int window_end_x, int window_step_x, - const ScalarType *non_broadcast_input_ptr, const ScalarType &broadcast_value, ScalarType *output_ptr, const bool reorder) +inline int elementwise_arithm_op_broadcast_loop(int window_start_x, + int window_end_x, + int window_step_x, + const ScalarType 
*non_broadcast_input_ptr, + const ScalarType &broadcast_value, + ScalarType *output_ptr, + const bool reorder) { int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + for (; x <= (window_end_x - window_step_x); x += window_step_x) { const auto a = wrapper::vloadq((non_broadcast_input_ptr + x)); - wrapper::vstore(output_ptr + x, elementwise_arithm_op_broadcast<op, ScalarType, VectorType>(a, broadcast_value, reorder)); + wrapper::vstore(output_ptr + x, + elementwise_arithm_op_broadcast<op, ScalarType, VectorType>(a, broadcast_value, reorder)); } return x; } @@ -268,10 +302,10 @@ void elementwise_arithm_op(const ITensor *in1, const ITensor *in2, ITensor *out, { using scalar_type = typename VectorType::scalar_type; - elementwise_op<scalar_type, scalar_type, VectorType>(in1, in2, out, window, - &elementwise_arithm_op_scalar<op, scalar_type>, - &elementwise_arithm_op_broadcast_loop<op, scalar_type, VectorType>, - &elementwise_arithm_op_loop<op, scalar_type, VectorType>); + elementwise_op<scalar_type, scalar_type, VectorType>( + in1, in2, out, window, &elementwise_arithm_op_scalar<op, scalar_type>, + &elementwise_arithm_op_broadcast_loop<op, scalar_type, VectorType>, + &elementwise_arithm_op_loop<op, scalar_type, VectorType>); } template <ComparisonOperation op, typename InputScalarType> @@ -279,7 +313,7 @@ inline uint8_t elementwise_comp_op_scalar(const InputScalarType &a, const InputS { bool res = false; - switch(op) + switch (op) { case ComparisonOperation::Equal: res = (a == b); @@ -308,9 +342,9 @@ inline uint8_t elementwise_comp_op_scalar(const InputScalarType &a, const InputS template <ComparisonOperation op, typename InputVectorType, typename OutputVectorType> inline OutputVectorType elementwise_comp_op(const InputVectorType &a, const InputVectorType &b) { - OutputVectorType res = { 0, 0, 0, 0 }; + OutputVectorType res = {0, 0, 0, 0}; - switch(op) + switch (op) { case ComparisonOperation::Equal: res = wrapper::vceq(a, b); @@ -338,53 +372,75 @@ inline OutputVectorType elementwise_comp_op(const InputVectorType &a, const Inpu } template <ComparisonOperation op, typename InputScalarType, typename InputVectorType, typename OutputVectorType> -inline OutputVectorType elementwise_comp_op_broadcast(const InputVectorType &a, const InputScalarType &broadcast_value, const bool reorder) +inline OutputVectorType +elementwise_comp_op_broadcast(const InputVectorType &a, const InputScalarType &broadcast_value, const bool reorder) { InputVectorType broadcast_vector = wrapper::vdup_n(broadcast_value, wrapper::traits::vector_128_tag()); - return elementwise_comp_op<op, InputVectorType, OutputVectorType>(reorder ? broadcast_vector : a, reorder ? a : broadcast_vector); + return elementwise_comp_op<op, InputVectorType, OutputVectorType>(reorder ? broadcast_vector : a, + reorder ? 
a : broadcast_vector); } template <ComparisonOperation op, typename InputScalarType, typename InputVectorType> -inline int elementwise_comp_op_broadcast_8_loop(int window_start_x, int window_end_x, int window_step_x, - const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder) +inline int elementwise_comp_op_broadcast_8_loop(int window_start_x, + int window_end_x, + int window_step_x, + const InputScalarType *non_broadcast_input_ptr, + const InputScalarType &broadcast_value, + uint8_t *output_ptr, + const bool reorder) { int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint8x16_t>(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder); + const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint8x16_t>( + wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder); wrapper::vstore(output_ptr + x, a); } return x; } template <ComparisonOperation op, typename InputScalarType, typename InputVectorType> -inline int elementwise_comp_op_broadcast_16_loop(int window_start_x, int window_end_x, int window_step_x, - const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder) +inline int elementwise_comp_op_broadcast_16_loop(int window_start_x, + int window_end_x, + int window_step_x, + const InputScalarType *non_broadcast_input_ptr, + const InputScalarType &broadcast_value, + uint8_t *output_ptr, + const bool reorder) { int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint16x8_t>(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder); + const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint16x8_t>( + wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder); wrapper::vstore(output_ptr + x, wrapper::vmovn(a)); } return x; } template <ComparisonOperation op, typename InputScalarType, typename InputVectorType> -inline int elementwise_comp_op_broadcast_32_loop(int window_start_x, int window_end_x, int window_step_x, - const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder) +inline int elementwise_comp_op_broadcast_32_loop(int window_start_x, + int window_end_x, + int window_step_x, + const InputScalarType *non_broadcast_input_ptr, + const InputScalarType &broadcast_value, + uint8_t *output_ptr, + const bool reorder) { int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(wrapper::vloadq(non_broadcast_input_ptr + x), broadcast_value, reorder); - const auto b = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(wrapper::vloadq(non_broadcast_input_ptr + x + 4), broadcast_value, reorder); + const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>( + wrapper::vloadq(non_broadcast_input_ptr + x), broadcast_value, reorder); + const 
auto b = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>( + wrapper::vloadq(non_broadcast_input_ptr + x + 4), broadcast_value, reorder); wrapper::vstore(output_ptr + x, wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(a), wrapper::vmovn(b)))); } - if(x <= window_end_x - 4) + if (x <= window_end_x - 4) { - const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder); - for(int i = 0; i < 4; i++) + const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>( + wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder); + for (int i = 0; i < 4; i++) { *(output_ptr + x + i) = wrapper::vgetlane(a, i); } @@ -394,11 +450,15 @@ inline int elementwise_comp_op_broadcast_32_loop(int window_start_x, int window_ } template <ComparisonOperation op, typename InputScalarType, typename InputVectorType> -inline int elementwise_comp_op_8_loop(int window_start_x, int window_end_x, int window_step_x, - const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr) +inline int elementwise_comp_op_8_loop(int window_start_x, + int window_end_x, + int window_step_x, + const InputScalarType *input1_ptr, + const InputScalarType *input2_ptr, + uint8_t *output_ptr) { int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + for (; x <= (window_end_x - window_step_x); x += window_step_x) { const auto a = wrapper::vloadq(input1_ptr + x); const auto b = wrapper::vloadq(input2_ptr + x); @@ -409,11 +469,15 @@ inline int elementwise_comp_op_8_loop(int window_start_x, int window_end_x, int } template <ComparisonOperation op, typename InputScalarType, typename InputVectorType> -inline int elementwise_comp_op_16_loop(int window_start_x, int window_end_x, int window_step_x, - const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr) +inline int elementwise_comp_op_16_loop(int window_start_x, + int window_end_x, + int window_step_x, + const InputScalarType *input1_ptr, + const InputScalarType *input2_ptr, + uint8_t *output_ptr) { int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + for (; x <= (window_end_x - window_step_x); x += window_step_x) { const auto a = wrapper::vloadq(input1_ptr + x); const auto b = wrapper::vloadq(input2_ptr + x); @@ -424,11 +488,15 @@ inline int elementwise_comp_op_16_loop(int window_start_x, int window_end_x, int } template <ComparisonOperation op, typename InputScalarType, typename InputVectorType> -inline int elementwise_comp_op_32_loop(int window_start_x, int window_end_x, int window_step_x, - const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr) +inline int elementwise_comp_op_32_loop(int window_start_x, + int window_end_x, + int window_step_x, + const InputScalarType *input1_ptr, + const InputScalarType *input2_ptr, + uint8_t *output_ptr) { int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + for (; x <= (window_end_x - window_step_x); x += window_step_x) { auto a = wrapper::vloadq(input1_ptr + x); auto b = wrapper::vloadq(input2_ptr + x); @@ -438,12 +506,12 @@ inline int elementwise_comp_op_32_loop(int window_start_x, int window_end_x, int const auto res2 = elementwise_comp_op<op, InputVectorType, uint32x4_t>(a, b); wrapper::vstore(output_ptr + x, wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(res), 
wrapper::vmovn(res2)))); } - if(x <= window_end_x - 4) + if (x <= window_end_x - 4) { const auto a = wrapper::vloadq(input1_ptr + x); const auto b = wrapper::vloadq(input2_ptr + x); const auto res = elementwise_comp_op<op, InputVectorType, uint32x4_t>(a, b); - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { *(output_ptr + x + i) = wrapper::vgetlane(res, i); } @@ -455,57 +523,59 @@ inline int elementwise_comp_op_32_loop(int window_start_x, int window_end_x, int template <ComparisonOperation op, typename InputScalarType, typename InputVectorType> void elementwise_comp_op_8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) { - elementwise_op<InputScalarType, uint8_t, InputVectorType>(in1, in2, out, window, - &elementwise_comp_op_scalar<op, InputScalarType>, - &elementwise_comp_op_broadcast_8_loop<op, InputScalarType, InputVectorType>, - &elementwise_comp_op_8_loop<op, InputScalarType, InputVectorType>); + elementwise_op<InputScalarType, uint8_t, InputVectorType>( + in1, in2, out, window, &elementwise_comp_op_scalar<op, InputScalarType>, + &elementwise_comp_op_broadcast_8_loop<op, InputScalarType, InputVectorType>, + &elementwise_comp_op_8_loop<op, InputScalarType, InputVectorType>); } template <ComparisonOperation op, typename InputScalarType, typename InputVectorType> void elementwise_comp_op_16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) { - elementwise_op<InputScalarType, uint8_t, InputVectorType>(in1, in2, out, window, - &elementwise_comp_op_scalar<op, InputScalarType>, - &elementwise_comp_op_broadcast_16_loop<op, InputScalarType, InputVectorType>, - &elementwise_comp_op_16_loop<op, InputScalarType, InputVectorType>); + elementwise_op<InputScalarType, uint8_t, InputVectorType>( + in1, in2, out, window, &elementwise_comp_op_scalar<op, InputScalarType>, + &elementwise_comp_op_broadcast_16_loop<op, InputScalarType, InputVectorType>, + &elementwise_comp_op_16_loop<op, InputScalarType, InputVectorType>); } template <ComparisonOperation op, typename InputScalarType, typename InputVectorType> void elementwise_comp_op_32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) { - elementwise_op<InputScalarType, uint8_t, InputVectorType>(in1, in2, out, window, - &elementwise_comp_op_scalar<op, InputScalarType>, - &elementwise_comp_op_broadcast_32_loop<op, InputScalarType, InputVectorType>, - &elementwise_comp_op_32_loop<op, InputScalarType, InputVectorType>); + elementwise_op<InputScalarType, uint8_t, InputVectorType>( + in1, in2, out, window, &elementwise_comp_op_scalar<op, InputScalarType>, + &elementwise_comp_op_broadcast_32_loop<op, InputScalarType, InputVectorType>, + &elementwise_comp_op_32_loop<op, InputScalarType, InputVectorType>); } inline float32x4x4_t load_quantized(const uint8_t *input1_ptr, const int32x4_t &offset, const float32x4_t &scale) { - qasymm8x16_t x = vld1q_u8(input1_ptr); - const float32x4x4_t out = - { - { - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(x))))), offset)), scale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(x))))), offset)), scale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(x))))), offset)), scale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(x))))), offset)), scale), - } - }; + qasymm8x16_t x = vld1q_u8(input1_ptr); + const float32x4x4_t 
out = {{ + vmulq_f32( + vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(x))))), offset)), + scale), + vmulq_f32( + vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(x))))), offset)), + scale), + vmulq_f32( + vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(x))))), offset)), + scale), + vmulq_f32(vcvtq_f32_s32( + vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(x))))), offset)), + scale), + }}; return out; } inline float32x4x4_t load_quantized_signed(const int8_t *input1_ptr, const int32x4_t &offset, const float32x4_t &scale) { - qasymm8x16_signed_t x = vld1q_s8(input1_ptr); - const float32x4x4_t out = - { - { - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(x)))), offset)), scale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(x)))), offset)), scale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(x)))), offset)), scale), - vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(x)))), offset)), scale), - } - }; + qasymm8x16_signed_t x = vld1q_s8(input1_ptr); + const float32x4x4_t out = {{ + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(x)))), offset)), scale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(x)))), offset)), scale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(x)))), offset)), scale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(x)))), offset)), scale), + }}; return out; } @@ -523,17 +593,15 @@ inline void store_quantized(uint8_t *output_ptr, const int32x4x4_t &out) vst1q_u8(output_ptr, vcombine_u8(pa, pb)); } -inline void store_quantized(uint8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale) +inline void +store_quantized(uint8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale) { - int32x4x4_t out = - { - { - vcvtq_s32_f32(vmlaq_f32(offset, rf.val[0], invscale)), - vcvtq_s32_f32(vmlaq_f32(offset, rf.val[1], invscale)), - vcvtq_s32_f32(vmlaq_f32(offset, rf.val[2], invscale)), - vcvtq_s32_f32(vmlaq_f32(offset, rf.val[3], invscale)), - } - }; + int32x4x4_t out = {{ + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[0], invscale)), + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[1], invscale)), + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[2], invscale)), + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[3], invscale)), + }}; store_quantized(output_ptr, out); } @@ -544,17 +612,17 @@ inline void store_quantized_signed(int8_t *output_ptr, const int32x4x4_t &out) vst1q_s8(output_ptr, vcombine_s8(pa, pb)); } -inline void store_quantized_signed(int8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale) +inline void store_quantized_signed(int8_t *output_ptr, + const float32x4x4_t &rf, + const float32x4_t &offset, + const float32x4_t &invscale) { - int32x4x4_t out = - { - { - vcvtq_s32_f32(vmlaq_f32(offset, rf.val[0], invscale)), - vcvtq_s32_f32(vmlaq_f32(offset, rf.val[1], invscale)), - vcvtq_s32_f32(vmlaq_f32(offset, rf.val[2], invscale)), - vcvtq_s32_f32(vmlaq_f32(offset, rf.val[3], invscale)), - } - }; + int32x4x4_t out = {{ + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[0], invscale)), + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[1], invscale)), + vcvtq_s32_f32(vmlaq_f32(offset, 
rf.val[2], invscale)), + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[3], invscale)), + }}; store_quantized_signed(output_ptr, out); } @@ -565,7 +633,8 @@ inline uint8_t elementwise_arithm_op_quantized_scalar(const float &a, const floa } template <ArithmeticOperation op> -inline int8_t elementwise_arithm_op_quantized_signed_scalar(const float &a, const float &b, UniformQuantizationInfo qinfo) +inline int8_t +elementwise_arithm_op_quantized_signed_scalar(const float &a, const float &b, UniformQuantizationInfo qinfo) { return quantize_qasymm8_signed(elementwise_arithm_op_scalar<op>(a, b), qinfo); } @@ -574,15 +643,12 @@ template <ArithmeticOperation op> float32x4x4_t elementwise_arithm_op(const float32x4x4_t &a, const float32x4x4_t &b) { using neon_vector_float = wrapper::traits::neon_vector<float, 4>; - float32x4x4_t out = - { - { - elementwise_arithm_op<op, neon_vector_float>(a.val[0], b.val[0]), - elementwise_arithm_op<op, neon_vector_float>(a.val[1], b.val[1]), - elementwise_arithm_op<op, neon_vector_float>(a.val[2], b.val[2]), - elementwise_arithm_op<op, neon_vector_float>(a.val[3], b.val[3]), - } - }; + float32x4x4_t out = {{ + elementwise_arithm_op<op, neon_vector_float>(a.val[0], b.val[0]), + elementwise_arithm_op<op, neon_vector_float>(a.val[1], b.val[1]), + elementwise_arithm_op<op, neon_vector_float>(a.val[2], b.val[2]), + elementwise_arithm_op<op, neon_vector_float>(a.val[3], b.val[3]), + }}; return out; } @@ -596,26 +662,29 @@ inline uint8_t elementwise_comp_op_quantized_scalar(const float &a, const float template <ComparisonOperation op> inline uint32x4x4_t elementwise_comp_op(const float32x4x4_t &a, const float32x4x4_t &b) { - uint32x4x4_t out = - { - { - elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[0], b.val[0]), - elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[1], b.val[1]), - elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[2], b.val[2]), - elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[3], b.val[3]) - } - }; + uint32x4x4_t out = {{elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[0], b.val[0]), + elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[1], b.val[1]), + elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[2], b.val[2]), + elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[3], b.val[3])}}; return out; } template <ArithmeticOperation op> -inline int elementwise_arithm_op_quantized_loop(int window_start_x, int window_end_x, int window_step_x, - const uint8_t *input1_ptr, const uint8_t *input2_ptr, uint8_t *output_ptr, - int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2, - float32x4_t voffseto, float32x4_t invvscaleo) +inline int elementwise_arithm_op_quantized_loop(int window_start_x, + int window_end_x, + int window_step_x, + const uint8_t *input1_ptr, + const uint8_t *input2_ptr, + uint8_t *output_ptr, + int32x4_t voffset1, + int32x4_t voffset2, + float32x4_t vscale1, + float32x4_t vscale2, + float32x4_t voffseto, + float32x4_t invvscaleo) { int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + for (; x <= (window_end_x - window_step_x); x += window_step_x) { // Get inputs and compute output const float32x4x4_t af = load_quantized(input1_ptr + x, voffset1, vscale1); @@ -627,13 +696,21 @@ inline int elementwise_arithm_op_quantized_loop(int window_start_x, int window_e } template <ArithmeticOperation op> -inline int elementwise_arithm_op_quantized_singed_loop(int window_start_x, int window_end_x, int window_step_x, - const int8_t *input1_ptr, const int8_t 
*input2_ptr, int8_t *output_ptr, - int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2, - float32x4_t voffseto, float32x4_t invvscaleo) +inline int elementwise_arithm_op_quantized_singed_loop(int window_start_x, + int window_end_x, + int window_step_x, + const int8_t *input1_ptr, + const int8_t *input2_ptr, + int8_t *output_ptr, + int32x4_t voffset1, + int32x4_t voffset2, + float32x4_t vscale1, + float32x4_t vscale2, + float32x4_t voffseto, + float32x4_t invvscaleo) { int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + for (; x <= (window_end_x - window_step_x); x += window_step_x) { // Get inputs and compute output const float32x4x4_t af = load_quantized_signed(input1_ptr + x, voffset1, vscale1); @@ -645,45 +722,71 @@ inline int elementwise_arithm_op_quantized_singed_loop(int window_start_x, int w } template <ArithmeticOperation op> -inline int elementwise_arithm_op_quantized_broadcast_loop(int window_start_x, int window_end_x, int window_step_x, - const uint8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, uint8_t *output_ptr, - int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast, - float32x4_t voffseto, float32x4_t invvscaleo, bool reorder) +inline int elementwise_arithm_op_quantized_broadcast_loop(int window_start_x, + int window_end_x, + int window_step_x, + const uint8_t *non_broadcast_input_ptr, + float32x4x4_t broadcast_vector, + uint8_t *output_ptr, + int32x4_t voffset_non_broadcast, + float32x4_t vscale_non_broadcast, + float32x4_t voffseto, + float32x4_t invvscaleo, + bool reorder) { int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - const float32x4x4_t af = load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast); - const float32x4x4_t rf = elementwise_arithm_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector); + const float32x4x4_t af = + load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast); + const float32x4x4_t rf = + elementwise_arithm_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector); store_quantized(output_ptr + x, rf, voffseto, invvscaleo); } return x; } template <ArithmeticOperation op> -inline int elementwise_arithm_op_quantized_signed_broadcast_loop(int window_start_x, int window_end_x, int window_step_x, - const int8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, int8_t *output_ptr, - int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast, - float32x4_t voffseto, float32x4_t invvscaleo, bool reorder) +inline int elementwise_arithm_op_quantized_signed_broadcast_loop(int window_start_x, + int window_end_x, + int window_step_x, + const int8_t *non_broadcast_input_ptr, + float32x4x4_t broadcast_vector, + int8_t *output_ptr, + int32x4_t voffset_non_broadcast, + float32x4_t vscale_non_broadcast, + float32x4_t voffseto, + float32x4_t invvscaleo, + bool reorder) { int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - const float32x4x4_t af = load_quantized_signed(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast); - const float32x4x4_t rf = elementwise_arithm_op<op>(reorder ? broadcast_vector : af, reorder ? 
af : broadcast_vector); + const float32x4x4_t af = + load_quantized_signed(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast); + const float32x4x4_t rf = + elementwise_arithm_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector); store_quantized_signed(output_ptr + x, rf, voffseto, invvscaleo); } return x; } template <ComparisonOperation op> -inline int elementwise_comp_op_quantized_loop(int window_start_x, int window_end_x, int window_step_x, - const uint8_t *input1_ptr, const uint8_t *input2_ptr, uint8_t *output_ptr, - int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2, - float32x4_t voffseto, float32x4_t invvscaleo) +inline int elementwise_comp_op_quantized_loop(int window_start_x, + int window_end_x, + int window_step_x, + const uint8_t *input1_ptr, + const uint8_t *input2_ptr, + uint8_t *output_ptr, + int32x4_t voffset1, + int32x4_t voffset2, + float32x4_t vscale1, + float32x4_t vscale2, + float32x4_t voffseto, + float32x4_t invvscaleo) { ARM_COMPUTE_UNUSED(voffseto, invvscaleo); int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + for (; x <= (window_end_x - window_step_x); x += window_step_x) { const float32x4x4_t af = load_quantized(input1_ptr + x, voffset1, vscale1); const float32x4x4_t bf = load_quantized(input2_ptr + x, voffset2, vscale2); @@ -694,14 +797,22 @@ inline int elementwise_comp_op_quantized_loop(int window_start_x, int window_end } template <ComparisonOperation op> -inline int elementwise_comp_op_quantized_signed_loop(int window_start_x, int window_end_x, int window_step_x, - const int8_t *input1_ptr, const int8_t *input2_ptr, uint8_t *output_ptr, - int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2, - float32x4_t voffseto, float32x4_t invvscaleo) +inline int elementwise_comp_op_quantized_signed_loop(int window_start_x, + int window_end_x, + int window_step_x, + const int8_t *input1_ptr, + const int8_t *input2_ptr, + uint8_t *output_ptr, + int32x4_t voffset1, + int32x4_t voffset2, + float32x4_t vscale1, + float32x4_t vscale2, + float32x4_t voffseto, + float32x4_t invvscaleo) { ARM_COMPUTE_UNUSED(voffseto, invvscaleo); int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + for (; x <= (window_end_x - window_step_x); x += window_step_x) { const float32x4x4_t af = load_quantized_signed(input1_ptr + x, voffset1, vscale1); const float32x4x4_t bf = load_quantized_signed(input2_ptr + x, voffset2, vscale2); @@ -712,46 +823,85 @@ inline int elementwise_comp_op_quantized_signed_loop(int window_start_x, int win } template <ComparisonOperation op> -inline int elementwise_comp_op_quantized_broadcast_loop(int window_start_x, int window_end_x, int window_step_x, - const uint8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, uint8_t *output_ptr, - int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast, - float32x4_t voffseto, float32x4_t invvscaleo, bool reorder) +inline int elementwise_comp_op_quantized_broadcast_loop(int window_start_x, + int window_end_x, + int window_step_x, + const uint8_t *non_broadcast_input_ptr, + float32x4x4_t broadcast_vector, + uint8_t *output_ptr, + int32x4_t voffset_non_broadcast, + float32x4_t vscale_non_broadcast, + float32x4_t voffseto, + float32x4_t invvscaleo, + bool reorder) { ARM_COMPUTE_UNUSED(voffseto, invvscaleo); int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + for (; x <= (window_end_x - window_step_x); x 
+= window_step_x) { - const float32x4x4_t af = load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast); - const uint32x4x4_t rf = elementwise_comp_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector); + const float32x4x4_t af = + load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast); + const uint32x4x4_t rf = + elementwise_comp_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector); store_quantized(output_ptr + x, rf); } return x; } template <ComparisonOperation op> -inline int elementwise_comp_op_quantized_signed_broadcast_loop(int window_start_x, int window_end_x, int window_step_x, - const int8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, uint8_t *output_ptr, - int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast, - float32x4_t voffseto, float32x4_t invvscaleo, bool reorder) +inline int elementwise_comp_op_quantized_signed_broadcast_loop(int window_start_x, + int window_end_x, + int window_step_x, + const int8_t *non_broadcast_input_ptr, + float32x4x4_t broadcast_vector, + uint8_t *output_ptr, + int32x4_t voffset_non_broadcast, + float32x4_t vscale_non_broadcast, + float32x4_t voffseto, + float32x4_t invvscaleo, + bool reorder) { ARM_COMPUTE_UNUSED(voffseto, invvscaleo); int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - const float32x4x4_t af = load_quantized_signed(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast); - const uint32x4x4_t rf = elementwise_comp_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector); + const float32x4x4_t af = + load_quantized_signed(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast); + const uint32x4x4_t rf = + elementwise_comp_op<op>(reorder ? broadcast_vector : af, reorder ? 
af : broadcast_vector); store_quantized(output_ptr + x, rf); } return x; } -inline void elementwise_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, +inline void elementwise_op_quantized(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window, uint8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo), - int (*broadcast_func)(int, int, int, const uint8_t *, float32x4x4_t, uint8_t *, int32x4_t, float32x4_t, - float32x4_t, float32x4_t, const bool), - int (*neon_func)(int, int, int, const uint8_t *, const uint8_t *, uint8_t *, - int32x4_t, int32x4_t, float32x4_t, float32x4_t, - float32x4_t, float32x4_t)) + int (*broadcast_func)(int, + int, + int, + const uint8_t *, + float32x4x4_t, + uint8_t *, + int32x4_t, + float32x4_t, + float32x4_t, + float32x4_t, + const bool), + int (*neon_func)(int, + int, + int, + const uint8_t *, + const uint8_t *, + uint8_t *, + int32x4_t, + int32x4_t, + float32x4_t, + float32x4_t, + float32x4_t, + float32x4_t)) { // Create input windows Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); @@ -772,7 +922,7 @@ inline void elementwise_op_quantized(const ITensor *in1, const ITensor *in2, ITe const float32x4_t voffseto = vdupq_n_f32(output_qinfo.offset + 0.5f); const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_qinfo.scale); - if(is_broadcast_across_x) + if (is_broadcast_across_x) { // Select the broadcast input on the X axis const bool is_broadcast_input_2 = input2_win.x().step() == 0; @@ -794,24 +944,28 @@ inline void elementwise_op_quantized(const ITensor *in1, const ITensor *in2, ITe Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); Iterator output(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr()); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr()); - const uint8_t broadcast_value = *reinterpret_cast<const uint8_t *>(broadcast_input.ptr()); - const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_u8(broadcast_value), broadcast_qinfo); + const uint8_t broadcast_value = *reinterpret_cast<const uint8_t *>(broadcast_input.ptr()); + const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_u8(broadcast_value), broadcast_qinfo); - int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_vector, output_ptr, - voffset_non_broadcast, vscale_non_broadcast, voffseto, invvscaleo, !is_broadcast_input_2); - for(; x < window_end_x; ++x) - { - const float afs = dequantize_qasymm8(*(non_broadcast_input_ptr + x), non_broadcast_qinfo); - const float bfs = dequantize_qasymm8(broadcast_value, broadcast_qinfo); - *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, !is_broadcast_input_2 ? 
afs : bfs, output_qinfo); - } - }, - broadcast_input, non_broadcast_input, output); + int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, + broadcast_vector, output_ptr, voffset_non_broadcast, vscale_non_broadcast, + voffseto, invvscaleo, !is_broadcast_input_2); + for (; x < window_end_x; ++x) + { + const float afs = dequantize_qasymm8(*(non_broadcast_input_ptr + x), non_broadcast_qinfo); + const float bfs = dequantize_qasymm8(broadcast_value, broadcast_qinfo); + *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, + !is_broadcast_input_2 ? afs : bfs, output_qinfo); + } + }, + broadcast_input, non_broadcast_input, output); } else { @@ -834,32 +988,56 @@ inline void elementwise_op_quantized(const ITensor *in1, const ITensor *in2, ITe Iterator input2(in2, input2_win); Iterator output(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr()); - const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr()); - const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr()); - - int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr, voffset1, voffset2, - vscale1, vscale2, voffseto, invvscaleo); - for(; x < window_end_x; ++x) + execute_window_loop( + win, + [&](const Coordinates &) { - const float afs = dequantize_qasymm8(*(input1_ptr + x), input1_qinfo); - const float bfs = dequantize_qasymm8(*(input2_ptr + x), input2_qinfo); - *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo); - } - }, - input1, input2, output); + const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr()); + + int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr, + voffset1, voffset2, vscale1, vscale2, voffseto, invvscaleo); + for (; x < window_end_x; ++x) + { + const float afs = dequantize_qasymm8(*(input1_ptr + x), input1_qinfo); + const float bfs = dequantize_qasymm8(*(input2_ptr + x), input2_qinfo); + *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo); + } + }, + input1, input2, output); } } -inline void elementwise_comp_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, - uint8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo), - int (*broadcast_func)(int, int, int, const int8_t *, float32x4x4_t, uint8_t *, int32x4_t, float32x4_t, - float32x4_t, float32x4_t, const bool), - int (*neon_func)(int, int, int, const int8_t *, const int8_t *, uint8_t *, - int32x4_t, int32x4_t, float32x4_t, float32x4_t, - float32x4_t, float32x4_t)) +inline void +elementwise_comp_quantized_signed(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window, + uint8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo), + int (*broadcast_func)(int, + int, + int, + const int8_t *, + float32x4x4_t, + uint8_t *, + int32x4_t, + float32x4_t, + float32x4_t, + float32x4_t, + const bool), + int (*neon_func)(int, + int, + int, + const int8_t *, + const int8_t *, + uint8_t *, + int32x4_t, + int32x4_t, + float32x4_t, + float32x4_t, + float32x4_t, + float32x4_t)) { // Create input windows Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); @@ -879,7 +1057,7 @@ inline void 
elementwise_comp_quantized_signed(const ITensor *in1, const ITensor const float32x4_t voffseto = vdupq_n_f32(output_qinfo.offset); const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_qinfo.scale); - if(is_broadcast_across_x) + if (is_broadcast_across_x) { // Select the broadcast input on the X axis const bool is_broadcast_input_2 = input2_win.x().step() == 0; @@ -901,24 +1079,28 @@ inline void elementwise_comp_quantized_signed(const ITensor *in1, const ITensor Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); Iterator output(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr()); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr()); - const int8_t broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr()); - const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_s8(broadcast_value), broadcast_qinfo); + const int8_t broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr()); + const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_s8(broadcast_value), broadcast_qinfo); - int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_vector, output_ptr, - voffset_non_broadcast, vscale_non_broadcast, voffseto, invvscaleo, !is_broadcast_input_2); - for(; x < window_end_x; ++x) - { - const float afs = dequantize_qasymm8_signed(*(non_broadcast_input_ptr + x), non_broadcast_qinfo); - const float bfs = dequantize_qasymm8_signed(broadcast_value, broadcast_qinfo); - *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, !is_broadcast_input_2 ? afs : bfs, output_qinfo); - } - }, - broadcast_input, non_broadcast_input, output); + int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, + broadcast_vector, output_ptr, voffset_non_broadcast, vscale_non_broadcast, + voffseto, invvscaleo, !is_broadcast_input_2); + for (; x < window_end_x; ++x) + { + const float afs = dequantize_qasymm8_signed(*(non_broadcast_input_ptr + x), non_broadcast_qinfo); + const float bfs = dequantize_qasymm8_signed(broadcast_value, broadcast_qinfo); + *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, + !is_broadcast_input_2 ? 
afs : bfs, output_qinfo); + } + }, + broadcast_input, non_broadcast_input, output); } else { @@ -941,32 +1123,56 @@ inline void elementwise_comp_quantized_signed(const ITensor *in1, const ITensor Iterator input2(in2, input2_win); Iterator output(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr()); - const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr()); - const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr()); - - int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr, voffset1, voffset2, - vscale1, vscale2, voffseto, invvscaleo); - for(; x < window_end_x; ++x) + execute_window_loop( + win, + [&](const Coordinates &) { - const float afs = dequantize_qasymm8_signed(*(input1_ptr + x), input1_qinfo); - const float bfs = dequantize_qasymm8_signed(*(input2_ptr + x), input2_qinfo); - *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo); - } - }, - input1, input2, output); + const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr()); + + int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr, + voffset1, voffset2, vscale1, vscale2, voffseto, invvscaleo); + for (; x < window_end_x; ++x) + { + const float afs = dequantize_qasymm8_signed(*(input1_ptr + x), input1_qinfo); + const float bfs = dequantize_qasymm8_signed(*(input2_ptr + x), input2_qinfo); + *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo); + } + }, + input1, input2, output); } } -inline void elementwise_op_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, - int8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo), - int (*broadcast_func)(int, int, int, const int8_t *, float32x4x4_t, int8_t *, int32x4_t, float32x4_t, - float32x4_t, float32x4_t, const bool), - int (*neon_func)(int, int, int, const int8_t *, const int8_t *, int8_t *, - int32x4_t, int32x4_t, float32x4_t, float32x4_t, - float32x4_t, float32x4_t)) +inline void +elementwise_op_quantized_signed(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window, + int8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo), + int (*broadcast_func)(int, + int, + int, + const int8_t *, + float32x4x4_t, + int8_t *, + int32x4_t, + float32x4_t, + float32x4_t, + float32x4_t, + const bool), + int (*neon_func)(int, + int, + int, + const int8_t *, + const int8_t *, + int8_t *, + int32x4_t, + int32x4_t, + float32x4_t, + float32x4_t, + float32x4_t, + float32x4_t)) { // Create input windows Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); @@ -986,7 +1192,7 @@ inline void elementwise_op_quantized_signed(const ITensor *in1, const ITensor *i const float32x4_t voffseto = vdupq_n_f32(output_qinfo.offset); const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_qinfo.scale); - if(is_broadcast_across_x) + if (is_broadcast_across_x) { // Select the broadcast input on the X axis const bool is_broadcast_input_2 = input2_win.x().step() == 0; @@ -1008,24 +1214,28 @@ inline void elementwise_op_quantized_signed(const ITensor *in1, const ITensor *i Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); Iterator output(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - 
const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr()); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr()); - const int8_t broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr()); - const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_s8(broadcast_value), broadcast_qinfo); + const int8_t broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr()); + const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_s8(broadcast_value), broadcast_qinfo); - int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_vector, output_ptr, - voffset_non_broadcast, vscale_non_broadcast, voffseto, invvscaleo, !is_broadcast_input_2); - for(; x < window_end_x; ++x) - { - const float afs = dequantize_qasymm8_signed(*(non_broadcast_input_ptr + x), non_broadcast_qinfo); - const float bfs = dequantize_qasymm8_signed(broadcast_value, broadcast_qinfo); - *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, !is_broadcast_input_2 ? afs : bfs, output_qinfo); - } - }, - broadcast_input, non_broadcast_input, output); + int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, + broadcast_vector, output_ptr, voffset_non_broadcast, vscale_non_broadcast, + voffseto, invvscaleo, !is_broadcast_input_2); + for (; x < window_end_x; ++x) + { + const float afs = dequantize_qasymm8_signed(*(non_broadcast_input_ptr + x), non_broadcast_qinfo); + const float bfs = dequantize_qasymm8_signed(broadcast_value, broadcast_qinfo); + *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, + !is_broadcast_input_2 ? 
afs : bfs, output_qinfo); + } + }, + broadcast_input, non_broadcast_input, output); } else { @@ -1048,22 +1258,24 @@ inline void elementwise_op_quantized_signed(const ITensor *in1, const ITensor *i Iterator input2(in2, input2_win); Iterator output(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr()); - const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr()); - const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr()); - - int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr, voffset1, voffset2, - vscale1, vscale2, voffseto, invvscaleo); - for(; x < window_end_x; ++x) + execute_window_loop( + win, + [&](const Coordinates &) { - const float afs = dequantize_qasymm8_signed(*(input1_ptr + x), input1_qinfo); - const float bfs = dequantize_qasymm8_signed(*(input2_ptr + x), input2_qinfo); - *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo); - } - }, - input1, input2, output); + const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr()); + + int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr, + voffset1, voffset2, vscale1, vscale2, voffseto, invvscaleo); + for (; x < window_end_x; ++x) + { + const float afs = dequantize_qasymm8_signed(*(input1_ptr + x), input1_qinfo); + const float bfs = dequantize_qasymm8_signed(*(input2_ptr + x), input2_qinfo); + *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo); + } + }, + input1, input2, output); } } diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/integer.cpp b/src/cpu/kernels/elementwise_binary/generic/neon/integer.cpp index c5c528d3f3..09ad13d5eb 100644 --- a/src/cpu/kernels/elementwise_binary/generic/neon/integer.cpp +++ b/src/cpu/kernels/elementwise_binary/generic/neon/integer.cpp @@ -22,6 +22,7 @@ * SOFTWARE. 
*/ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/elementwise_binary/generic/neon/impl.h" namespace arm_compute { @@ -33,63 +34,165 @@ void neon_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor return elementwise_arithm_op<op, typename wrapper::traits::neon_vector<int32_t, 4>>(in1, in2, out, window); } -template void neon_s32_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s32_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s32_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s32_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s32_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s32_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s32_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s32_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_s32_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s32_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s32_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s32_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s32_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s32_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s32_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s32_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); template <ArithmeticOperation op> void neon_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) { return elementwise_arithm_op<op, typename wrapper::traits::neon_vector<int16_t, 8>>(in1, in2, out, window); } -template void neon_s16_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s16_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s16_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s16_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor 
*in2, ITensor *out, const Window &window); -template void neon_s16_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s16_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s16_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s16_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_s16_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s16_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s16_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s16_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s16_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s16_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s16_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s16_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); template <ComparisonOperation op> void neon_u8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) { return elementwise_comp_op_8<op, uint8_t, uint8x16_t>(in1, in2, out, window); } -template void neon_u8_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_u8_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_u8_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_u8_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_u8_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_u8_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_u8_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_u8_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_u8_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window 
&window); +template void neon_u8_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_u8_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_u8_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); template <ComparisonOperation op> void neon_s16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) { return elementwise_comp_op_16<op, int16_t, int16x8_t>(in1, in2, out, window); } -template void neon_s16_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s16_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s16_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s16_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s16_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s16_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_s16_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s16_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s16_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s16_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s16_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s16_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); template <ComparisonOperation op> void neon_s32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) { return elementwise_comp_op_32<op, int32_t, int32x4_t>(in1, in2, out, window); } -template void neon_s32_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s32_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s32_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s32_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const 
ITensor *in2, ITensor *out, const Window &window); -template void neon_s32_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_s32_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -} +template void neon_s32_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s32_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s32_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s32_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s32_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_s32_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8.cpp b/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8.cpp index fa8e08745a..d891f70644 100644 --- a/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8.cpp +++ b/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8.cpp @@ -22,6 +22,7 @@ * SOFTWARE. */ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/elementwise_binary/generic/neon/impl.h" namespace arm_compute { @@ -33,27 +34,72 @@ void neon_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITe return elementwise_arithm_op_quantized<op>(in1, in2, out, window); } -template void neon_qasymm8_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_qasymm8_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, + const ITensor *in2, + 
ITensor *out, + const Window &window); +template void neon_qasymm8_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); template <ComparisonOperation op> -void neon_qasymm8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +void neon_qasymm8_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window) { return elementwise_comp_op_quantized<op>(in1, in2, out, window); } -template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8_signed.cpp 
b/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8_signed.cpp index abfdf93b75..b1f8e018f5 100644 --- a/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8_signed.cpp +++ b/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8_signed.cpp @@ -22,6 +22,7 @@ * SOFTWARE. */ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/elementwise_binary/generic/neon/impl.h" namespace arm_compute @@ -34,27 +35,70 @@ void neon_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *i return elementwise_arithm_op_quantized_signed<op>(in1, in2, out, window); } -template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); template <ComparisonOperation op> -void neon_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +void neon_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window) { return elementwise_comp_op_quantized_signed<op>(in1, in2, out, window); } 
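(Illustrative aside, not part of the patch: the arithmetic and comparison kernels touched above all share one structure — a function template whose body lives in the .cpp file, followed by one explicit instantiation per ArithmeticOperation or ComparisonOperation, so that operator-selection code in other translation units can link against each specialisation without the template body being exposed in a header. Below is a minimal standalone sketch of that pattern under that assumption; the names MyOp and run_kernel are invented for illustration and do not exist in the library.)

#include <cstdio>

enum class MyOp { Add, Sub };

// kernel.cpp-style definition: the template body stays in this translation unit.
template <MyOp op>
void run_kernel(const int *in1, const int *in2, int *out, int n)
{
    for (int i = 0; i < n; ++i)
    {
        out[i] = (op == MyOp::Add) ? in1[i] + in2[i] : in1[i] - in2[i];
    }
}

// Explicit instantiations, mirroring the "template void ...<Op>(...);" lines in the diff:
// each operation the dispatcher may request gets its own emitted symbol.
template void run_kernel<MyOp::Add>(const int *, const int *, int *, int);
template void run_kernel<MyOp::Sub>(const int *, const int *, int *, int);

int main()
{
    const int a[4] = {1, 2, 3, 4};
    const int b[4] = {4, 3, 2, 1};
    int       c[4] = {};
    run_kernel<MyOp::Add>(a, b, c, 4); // linked against the explicit Add instantiation
    std::printf("%d %d %d %d\n", c[0], c[1], c[2], c[3]); // prints: 5 5 5 5
    return 0;
}

(End of aside; the patch continues below with the explicit instantiations for the qasymm8_signed kernels.)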
-template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>( + const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/elementwise_binary/generic/sve/fp16.cpp b/src/cpu/kernels/elementwise_binary/generic/sve/fp16.cpp index 85224351df..600c7f1c05 100644 --- a/src/cpu/kernels/elementwise_binary/generic/sve/fp16.cpp +++ b/src/cpu/kernels/elementwise_binary/generic/sve/fp16.cpp @@ -25,6 +25,7 @@ #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/elementwise_binary/generic/sve/impl.h" namespace arm_compute { @@ -36,14 +37,38 @@ void sve_fp16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor return elementwise_arithmetic_op<float16_t>(in1, in2, out, op, window); } -template void sve_fp16_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp16_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp16_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp16_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp16_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const 
Window &window); -template void sve_fp16_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp16_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp16_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_fp16_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp16_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp16_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp16_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp16_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp16_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp16_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp16_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); template <ComparisonOperation op> void sve_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) @@ -51,14 +76,32 @@ void sve_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *i return elementwise_comparison_op<float16_t>(in1, in2, out, op, window); } -template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void 
sve_fp16_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); } // namespace cpu } // namespace arm_compute -#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
\ No newline at end of file +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/elementwise_binary/generic/sve/fp32.cpp b/src/cpu/kernels/elementwise_binary/generic/sve/fp32.cpp index 2b479f76f1..832a966883 100644 --- a/src/cpu/kernels/elementwise_binary/generic/sve/fp32.cpp +++ b/src/cpu/kernels/elementwise_binary/generic/sve/fp32.cpp @@ -23,6 +23,7 @@ */ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/elementwise_binary/generic/sve/impl.h" namespace arm_compute { @@ -34,26 +35,68 @@ void sve_fp32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor return elementwise_arithmetic_op<float32_t>(in1, in2, out, op, window); } -template void sve_fp32_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp32_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp32_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp32_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp32_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp32_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp32_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp32_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_fp32_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp32_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp32_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp32_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp32_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp32_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp32_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp32_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); template <ComparisonOperation op> void sve_fp32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) { return elementwise_comparison_op<float>(in1, in2, out, op, window); } -template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void 
sve_fp32_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/elementwise_binary/generic/sve/impl.cpp b/src/cpu/kernels/elementwise_binary/generic/sve/impl.cpp index c0515f2abc..fa48407e9b 100644 --- a/src/cpu/kernels/elementwise_binary/generic/sve/impl.cpp +++ b/src/cpu/kernels/elementwise_binary/generic/sve/impl.cpp @@ -23,7 +23,9 @@ */ #include "src/cpu/kernels/elementwise_binary/generic/sve/impl.h" + #include "src/core/NEON/SVEMath.h" + #include <arm_sve.h> namespace arm_compute @@ -33,7 +35,8 @@ namespace cpu using namespace arm_compute::wrapper; template <typename ScalarType> -void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, ArithmeticOperation op, const Window &window) +void elementwise_arithmetic_op( + const ITensor *in1, const ITensor *in2, ITensor *out, ArithmeticOperation op, const Window &window) { using VectorType = typename sve_vector<ScalarType>::type; @@ -51,7 +54,7 @@ void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor * const auto window_end_x = static_cast<int>(window.x().end()); const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); - if(is_broadcast_across_x) + if (is_broadcast_across_x) { const bool is_broadcast_input_2 = input2_win.x().step() == 0; Window broadcast_win = is_broadcast_input_2 ? 
input2_win : input1_win; @@ -66,37 +69,40 @@ void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor * Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); Iterator output(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr()); - const auto non_broadcast_input_ptr = reinterpret_cast<const ScalarType *>(non_broadcast_input.ptr()); - const ScalarType broadcast_value = *reinterpret_cast<const ScalarType *>(broadcast_input.ptr()); - const auto broadcast_vector = svdup_n(broadcast_value); - - int x = window_start_x; - - svbool_t pg = svwhilelt<ScalarType>(x, window_end_x); - do + execute_window_loop( + win, + [&](const Coordinates &) { - const auto non_broadcast_vector = svld1(pg, non_broadcast_input_ptr + x); - VectorType res{}; + auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr()); + const auto non_broadcast_input_ptr = reinterpret_cast<const ScalarType *>(non_broadcast_input.ptr()); + const ScalarType broadcast_value = *reinterpret_cast<const ScalarType *>(broadcast_input.ptr()); + const auto broadcast_vector = svdup_n(broadcast_value); - if(is_broadcast_input_2) - { - res = elementwise_arithmetic_op<typename sve_vector<ScalarType>::type>(pg, non_broadcast_vector, broadcast_vector, op); - } - else + int x = window_start_x; + + svbool_t pg = svwhilelt<ScalarType>(x, window_end_x); + do { - res = elementwise_arithmetic_op<typename sve_vector<ScalarType>::type>(pg, broadcast_vector, non_broadcast_vector, op); - } - svst1(pg, output_ptr + x, res); - - x += svcnt<ScalarType>(); - pg = svwhilelt<ScalarType>(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - broadcast_input, non_broadcast_input, output); + const auto non_broadcast_vector = svld1(pg, non_broadcast_input_ptr + x); + VectorType res{}; + + if (is_broadcast_input_2) + { + res = elementwise_arithmetic_op<typename sve_vector<ScalarType>::type>(pg, non_broadcast_vector, + broadcast_vector, op); + } + else + { + res = elementwise_arithmetic_op<typename sve_vector<ScalarType>::type>( + pg, broadcast_vector, non_broadcast_vector, op); + } + svst1(pg, output_ptr + x, res); + + x += svcnt<ScalarType>(); + pg = svwhilelt<ScalarType>(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); + }, + broadcast_input, non_broadcast_input, output); } else { @@ -108,39 +114,46 @@ void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor * Iterator input2(in2, input2_win); Iterator output(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr()); - const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr()); - const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr()); + execute_window_loop( + win, + [&](const Coordinates &) + { + auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr()); + const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr()); - int x = window_start_x; + int x = window_start_x; - svbool_t pg = svwhilelt<ScalarType>(x, window_end_x); - do - { - const auto in1 = svld1(pg, input1_ptr + x); - const auto in2 = svld1(pg, input2_ptr + x); - const auto res = elementwise_arithmetic_op<typename sve_vector<ScalarType>::type>(pg, in1, in2, op); - svst1(pg, output_ptr + x, res); - - x += svcnt<ScalarType>(); - pg = svwhilelt<ScalarType>(x, window_end_x); - } - 
while(svptest_any(all_true_pg, pg)); - }, - input1, input2, output); + svbool_t pg = svwhilelt<ScalarType>(x, window_end_x); + do + { + const auto in1 = svld1(pg, input1_ptr + x); + const auto in2 = svld1(pg, input2_ptr + x); + const auto res = elementwise_arithmetic_op<typename sve_vector<ScalarType>::type>(pg, in1, in2, op); + svst1(pg, output_ptr + x, res); + + x += svcnt<ScalarType>(); + pg = svwhilelt<ScalarType>(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); + }, + input1, input2, output); } } -template void elementwise_arithmetic_op<float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const ArithmeticOperation op, const Window &window); -template void elementwise_arithmetic_op<float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const ArithmeticOperation op, const Window &window); -template void elementwise_arithmetic_op<int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const ArithmeticOperation op, const Window &window); -template void elementwise_arithmetic_op<int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const ArithmeticOperation op, const Window &window); +template void elementwise_arithmetic_op<float32_t>( + const ITensor *in1, const ITensor *in2, ITensor *out, const ArithmeticOperation op, const Window &window); +template void elementwise_arithmetic_op<float16_t>( + const ITensor *in1, const ITensor *in2, ITensor *out, const ArithmeticOperation op, const Window &window); +template void elementwise_arithmetic_op<int16_t>( + const ITensor *in1, const ITensor *in2, ITensor *out, const ArithmeticOperation op, const Window &window); +template void elementwise_arithmetic_op<int32_t>( + const ITensor *in1, const ITensor *in2, ITensor *out, const ArithmeticOperation op, const Window &window); template <typename InputScalarType, typename OutputScalarType> -void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, ComparisonOperation op, const Window &window) +void elementwise_comparison_op( + const ITensor *in1, const ITensor *in2, ITensor *out, ComparisonOperation op, const Window &window) { - static_assert(sizeof(InputScalarType) >= sizeof(OutputScalarType), "input data type's width should be equal to or greater than output data type's width"); + static_assert(sizeof(InputScalarType) >= sizeof(OutputScalarType), + "input data type's width should be equal to or greater than output data type's width"); using OutputVectorType = typename sve_vector<OutputScalarType>::type; const auto all_true_pg = svptrue<InputScalarType>(); @@ -157,7 +170,7 @@ void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor * const auto window_end_x = static_cast<int>(window.x().end()); const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); - if(is_broadcast_across_x) + if (is_broadcast_across_x) { const bool is_broadcast_input_2 = input2_win.x().step() == 0; Window broadcast_win = is_broadcast_input_2 ? 
input2_win : input1_win; @@ -172,37 +185,44 @@ void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor * Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); Iterator output(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr()); - const auto non_broadcast_input_ptr = reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr()); - const InputScalarType broadcast_value = *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr()); - const auto broadcast_vector = svdup_n(broadcast_value); + execute_window_loop( + win, + [&](const Coordinates &) + { + auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr()); + const auto non_broadcast_input_ptr = + reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr()); + const InputScalarType broadcast_value = + *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr()); + const auto broadcast_vector = svdup_n(broadcast_value); - int x = window_start_x; + int x = window_start_x; - svbool_t pg = svwhilelt<InputScalarType>(x, window_end_x); - do - { - const auto non_broadcast_vector = svld1(pg, non_broadcast_input_ptr + x); - const svbool_t output_pg = narrow_to_byte_predicate<sizeof(InputScalarType)>(pg); - OutputVectorType res{}; - if(is_broadcast_input_2) - { - res = elementwise_comparison_op<typename sve_vector<InputScalarType>::type, typename sve_vector<OutputScalarType>::type>(pg, non_broadcast_vector, broadcast_vector, op); - } - else + svbool_t pg = svwhilelt<InputScalarType>(x, window_end_x); + do { - res = elementwise_comparison_op<typename sve_vector<InputScalarType>::type, typename sve_vector<OutputScalarType>::type>(pg, broadcast_vector, non_broadcast_vector, op); - } - svst1(output_pg, output_ptr + x, res); - - x += svcnt<InputScalarType>(); - pg = svwhilelt<InputScalarType>(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - broadcast_input, non_broadcast_input, output); + const auto non_broadcast_vector = svld1(pg, non_broadcast_input_ptr + x); + const svbool_t output_pg = narrow_to_byte_predicate<sizeof(InputScalarType)>(pg); + OutputVectorType res{}; + if (is_broadcast_input_2) + { + res = elementwise_comparison_op<typename sve_vector<InputScalarType>::type, + typename sve_vector<OutputScalarType>::type>( + pg, non_broadcast_vector, broadcast_vector, op); + } + else + { + res = elementwise_comparison_op<typename sve_vector<InputScalarType>::type, + typename sve_vector<OutputScalarType>::type>( + pg, broadcast_vector, non_broadcast_vector, op); + } + svst1(output_pg, output_ptr + x, res); + + x += svcnt<InputScalarType>(); + pg = svwhilelt<InputScalarType>(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); + }, + broadcast_input, non_broadcast_input, output); } else { @@ -214,37 +234,45 @@ void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor * Iterator input2(in2, input2_win); Iterator output(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr()); - const auto input1_ptr = reinterpret_cast<const InputScalarType *>(input1.ptr()); - const auto input2_ptr = reinterpret_cast<const InputScalarType *>(input2.ptr()); + execute_window_loop( + win, + [&](const Coordinates &) + { + auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr()); + const auto input1_ptr = reinterpret_cast<const InputScalarType *>(input1.ptr()); + const auto 
input2_ptr = reinterpret_cast<const InputScalarType *>(input2.ptr()); - int x = window_start_x; + int x = window_start_x; - svbool_t pg = svwhilelt<InputScalarType>(x, window_end_x); - do - { - const auto in1 = svld1(pg, input1_ptr + x); - const auto in2 = svld1(pg, input2_ptr + x); - const auto res = elementwise_comparison_op<typename sve_vector<InputScalarType>::type, typename sve_vector<OutputScalarType>::type>(pg, in1, in2, op); - const svbool_t output_pg = narrow_to_byte_predicate<sizeof(InputScalarType)>(pg); - svst1(output_pg, output_ptr + x, res); - - x += svcnt<InputScalarType>(); - pg = svwhilelt<InputScalarType>(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - input1, input2, output); + svbool_t pg = svwhilelt<InputScalarType>(x, window_end_x); + do + { + const auto in1 = svld1(pg, input1_ptr + x); + const auto in2 = svld1(pg, input2_ptr + x); + const auto res = + elementwise_comparison_op<typename sve_vector<InputScalarType>::type, + typename sve_vector<OutputScalarType>::type>(pg, in1, in2, op); + const svbool_t output_pg = narrow_to_byte_predicate<sizeof(InputScalarType)>(pg); + svst1(output_pg, output_ptr + x, res); + + x += svcnt<InputScalarType>(); + pg = svwhilelt<InputScalarType>(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); + }, + input1, input2, output); } } -template void elementwise_comparison_op<float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window); -template void elementwise_comparison_op<float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window); -template void elementwise_comparison_op<uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window); -template void elementwise_comparison_op<int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window); -template void elementwise_comparison_op<int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window); +template void elementwise_comparison_op<float32_t>( + const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window); +template void elementwise_comparison_op<float16_t>( + const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window); +template void elementwise_comparison_op<uint8_t>( + const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window); +template void elementwise_comparison_op<int16_t>( + const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window); +template void elementwise_comparison_op<int32_t>( + const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window); template <> svint32_t elementwise_pow<svint32_t>(svbool_t &pg, const svint32_t &a, const svint32_t &b) diff --git a/src/cpu/kernels/elementwise_binary/generic/sve/impl.h b/src/cpu/kernels/elementwise_binary/generic/sve/impl.h index 860c50a1e0..4c61b9f315 100644 --- a/src/cpu/kernels/elementwise_binary/generic/sve/impl.h +++ b/src/cpu/kernels/elementwise_binary/generic/sve/impl.h @@ -25,6 +25,7 @@ #define SRC_CORE_SVE_KERNELS_ELEMENTWISE_LIST_H #include "arm_compute/core/Helpers.h" + #include "src/core/NEON/wrapper/intrinsics/intrinsics.h" #include "src/core/NEON/wrapper/svtraits.h" @@ -51,7 +52,7 @@ svbool_t 
narrow_to_byte_predicate(svbool_t pg) { const auto all_false = svpfalse(); - switch(bytewidth) + switch (bytewidth) { case 8: pg = svuzp1_b32(pg, all_false); @@ -74,7 +75,7 @@ VectorType elementwise_arithmetic_op(svbool_t &pg, const VectorType &a, const Ve using ScalarType = typename wrapper::sve_scalar<VectorType>::type; VectorType res{}; - switch(op) + switch (op) { case ArithmeticOperation::MAX: res = svmax_z(pg, a, b); @@ -114,11 +115,12 @@ VectorType elementwise_arithmetic_op(svbool_t &pg, const VectorType &a, const Ve } template <typename InputVectorType, typename OutputVectorType> -OutputVectorType elementwise_comparison_op(svbool_t &pg, const InputVectorType &a, const InputVectorType &b, ComparisonOperation op) +OutputVectorType +elementwise_comparison_op(svbool_t &pg, const InputVectorType &a, const InputVectorType &b, ComparisonOperation op) { svbool_t selection_vector{}; - switch(op) + switch (op) { case ComparisonOperation::Equal: selection_vector = svcmpeq(pg, a, b); @@ -154,10 +156,12 @@ OutputVectorType elementwise_comparison_op(svbool_t &pg, const InputVectorType & } template <typename ScalarType> -void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, ArithmeticOperation op, const Window &window); +void elementwise_arithmetic_op( + const ITensor *in1, const ITensor *in2, ITensor *out, ArithmeticOperation op, const Window &window); template <typename ScalarType, typename OutputScalarType = uint8_t> -void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, ComparisonOperation op, const Window &window); +void elementwise_comparison_op( + const ITensor *in1, const ITensor *in2, ITensor *out, ComparisonOperation op, const Window &window); } // namespace cpu } // namespace arm_compute #endif /* SRC_CORE_SVE_KERNELS_ELEMENTWISE_LIST_H */ diff --git a/src/cpu/kernels/elementwise_binary/generic/sve/integer.cpp b/src/cpu/kernels/elementwise_binary/generic/sve/integer.cpp index c313fc6e04..f7714ff7e9 100644 --- a/src/cpu/kernels/elementwise_binary/generic/sve/integer.cpp +++ b/src/cpu/kernels/elementwise_binary/generic/sve/integer.cpp @@ -23,6 +23,7 @@ */ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/elementwise_binary/generic/sve/impl.h" namespace arm_compute { @@ -33,64 +34,166 @@ void sve_s32_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor { return elementwise_arithmetic_op<int32_t>(in1, in2, out, op, window); } -template void sve_s32_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s32_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s32_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s32_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s32_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s32_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s32_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void 
sve_s32_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_s32_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s32_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s32_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s32_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s32_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s32_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s32_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s32_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); template <ArithmeticOperation op> void sve_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) { return elementwise_arithmetic_op<int16_t>(in1, in2, out, op, window); } -template void sve_s16_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s16_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s16_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s16_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s16_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s16_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s16_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s16_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_s16_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s16_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s16_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s16_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s16_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void 
sve_s16_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s16_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s16_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); template <ComparisonOperation op> void sve_u8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) { return elementwise_comparison_op<uint8_t>(in1, in2, out, op, window); } -template void sve_u8_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_u8_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_u8_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_u8_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_u8_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_u8_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_u8_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_u8_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_u8_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_u8_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_u8_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_u8_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); template <ComparisonOperation op> void sve_s16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) { return elementwise_comparison_op<int16_t>(in1, in2, out, op, window); } -template void sve_s16_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s16_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s16_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s16_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void 
sve_s16_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s16_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_s16_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s16_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s16_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s16_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s16_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s16_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); template <ComparisonOperation op> void sve_s32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) { return elementwise_comparison_op<int32_t>(in1, in2, out, op, window); } -template void sve_s32_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s32_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s32_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s32_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s32_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve_s32_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve_s32_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s32_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s32_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s32_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s32_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve_s32_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); } // namespace cpu } // namespace arm_compute diff --git 
a/src/cpu/kernels/elementwise_binary/generic/sve2/impl.h b/src/cpu/kernels/elementwise_binary/generic/sve2/impl.h index 41e0ac77db..7c6015d379 100644 --- a/src/cpu/kernels/elementwise_binary/generic/sve2/impl.h +++ b/src/cpu/kernels/elementwise_binary/generic/sve2/impl.h @@ -35,19 +35,14 @@ inline svfloat32x4_t load_quantized(const int8_t *ptr, svbool_t pg, const svint3 { auto x = svld1(pg, ptr); - const auto widened = svcreate4( - svmovlb(svmovlb(x)), - svmovlt(svmovlb(x)), - svmovlb(svmovlt(x)), - svmovlt(svmovlt(x))); + const auto widened = svcreate4(svmovlb(svmovlb(x)), svmovlt(svmovlb(x)), svmovlb(svmovlt(x)), svmovlt(svmovlt(x))); pg = svptrue_b8(); - return svcreate4( - svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 0), offset)), scale), - svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 1), offset)), scale), - svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 2), offset)), scale), - svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 3), offset)), scale)); + return svcreate4(svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 0), offset)), scale), + svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 1), offset)), scale), + svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 2), offset)), scale), + svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 3), offset)), scale)); } inline svfloat32x4_t load_quantized(const uint8_t *ptr, svbool_t pg, const svint32_t &offset, const svfloat32_t &scale) @@ -56,28 +51,24 @@ inline svfloat32x4_t load_quantized(const uint8_t *ptr, svbool_t pg, const svint //vprint(x); - const auto widened = svcreate4( - svmovlb(svmovlb(x)), - svmovlt(svmovlb(x)), - svmovlb(svmovlt(x)), - svmovlt(svmovlt(x))); + const auto widened = svcreate4(svmovlb(svmovlb(x)), svmovlt(svmovlb(x)), svmovlb(svmovlt(x)), svmovlt(svmovlt(x))); pg = svptrue_b8(); - return svcreate4( - svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 0)), offset)), scale), - svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 1)), offset)), scale), - svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 2)), offset)), scale), - svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 3)), offset)), scale)); + return svcreate4(svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 0)), offset)), scale), + svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 1)), offset)), scale), + svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 2)), offset)), scale), + svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 3)), offset)), scale)); } -inline void store_quantized(uint8_t *ptr, svbool_t pg, svfloat32x4_t data, const svint32_t &offset, const svfloat32_t &inv_scale) +inline void +store_quantized(uint8_t *ptr, svbool_t pg, svfloat32x4_t data, const svint32_t &offset, const svfloat32_t &inv_scale) { - const auto quantized = svcreate4( - svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 0), inv_scale))), offset), - svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 1), inv_scale))), offset), - svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 2), inv_scale))), offset), - svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 3), inv_scale))), offset)); + const auto quantized = + svcreate4(svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 0), inv_scale))), offset), + svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, 
svget4(data, 1), inv_scale))), offset), + svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 2), inv_scale))), offset), + svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 3), inv_scale))), offset)); const auto narrowed_bottom = svqxtunt(svqxtunb(svget4(quantized, 0)), svget4(quantized, 1)); const auto narrowed_top = svqxtunt(svqxtunb(svget4(quantized, 2)), svget4(quantized, 3)); @@ -85,13 +76,14 @@ inline void store_quantized(uint8_t *ptr, svbool_t pg, svfloat32x4_t data, const svst1(pg, ptr, narrowed); } -inline void store_quantized(int8_t *ptr, svbool_t pg, svfloat32x4_t data, const svint32_t &offset, const svfloat32_t &inv_scale) +inline void +store_quantized(int8_t *ptr, svbool_t pg, svfloat32x4_t data, const svint32_t &offset, const svfloat32_t &inv_scale) { - const auto quantized = svcreate4( - svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 0), inv_scale))), offset), - svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 1), inv_scale))), offset), - svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 2), inv_scale))), offset), - svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 3), inv_scale))), offset)); + const auto quantized = + svcreate4(svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 0), inv_scale))), offset), + svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 1), inv_scale))), offset), + svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 2), inv_scale))), offset), + svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 3), inv_scale))), offset)); const auto narrowed_bottom = svqxtnt(svqxtnb(svget4(quantized, 0)), svget4(quantized, 1)); const auto narrowed_top = svqxtnt(svqxtnb(svget4(quantized, 2)), svget4(quantized, 3)); @@ -101,7 +93,8 @@ inline void store_quantized(int8_t *ptr, svbool_t pg, svfloat32x4_t data, const } template <typename ScalarType> -void elementwise_arithmetic_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *out, ArithmeticOperation op, const Window &window) +void elementwise_arithmetic_quantized_op( + const ITensor *in1, const ITensor *in2, ITensor *out, ArithmeticOperation op, const Window &window) { const auto all_true_pg = wrapper::svptrue<ScalarType>(); @@ -120,7 +113,7 @@ void elementwise_arithmetic_quantized_op(const ITensor *in1, const ITensor *in2, const auto output_voffset = svdup_n(out->info()->quantization_info().uniform().offset); const auto output_vscale = svdup_n(1.f / out->info()->quantization_info().uniform().scale); - if(is_broadcast_across_x) + if (is_broadcast_across_x) { const bool is_broadcast_input_2 = input2_win.x().step() == 0; Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; @@ -128,8 +121,10 @@ void elementwise_arithmetic_quantized_op(const ITensor *in1, const ITensor *in2, const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1; - const auto non_broadcast_qinfo = is_broadcast_input_2 ? in1->info()->quantization_info() : in2->info()->quantization_info(); - const auto broadcast_qinfo = is_broadcast_input_2 ? in2->info()->quantization_info() : in1->info()->quantization_info(); + const auto non_broadcast_qinfo = + is_broadcast_input_2 ? in1->info()->quantization_info() : in2->info()->quantization_info(); + const auto broadcast_qinfo = + is_broadcast_input_2 ? 
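For reference, a scalar sketch (not part of the patch) of the per-element math that `load_quantized`/`store_quantized` vectorise: dequantize with the input offset and scale, compute in float, then requantize with the inverse output scale and saturate to the 8-bit range.

#include <algorithm>
#include <cmath>
#include <cstdint>

inline float dequantize_u8(uint8_t q, int32_t offset, float scale)
{
    return static_cast<float>(static_cast<int32_t>(q) - offset) * scale;
}

inline uint8_t requantize_u8(float v, int32_t offset, float inv_scale)
{
    // svrinta rounds to nearest with ties away from zero; lround matches that.
    const long q = std::lround(v * inv_scale) + offset;
    return static_cast<uint8_t>(std::min<long>(std::max<long>(q, 0), 255)); // saturate like svqxtunb/svqxtunt
}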
in2->info()->quantization_info() : in1->info()->quantization_info(); const auto non_broadcast_voffset = svdup_n(non_broadcast_qinfo.uniform().offset); const auto non_broadcast_vscale = svdup_n(non_broadcast_qinfo.uniform().scale); @@ -141,48 +136,52 @@ void elementwise_arithmetic_quantized_op(const ITensor *in1, const ITensor *in2, Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); Iterator output(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr()); - const auto non_broadcast_input_ptr = reinterpret_cast<const ScalarType *>(non_broadcast_input.ptr()); - const ScalarType broadcast_value = *reinterpret_cast<const ScalarType *>(broadcast_input.ptr()); - const float broadcast_value_f = Qasymm8QuantizationHelper<ScalarType>::dequantize(broadcast_value, broadcast_qinfo); - const auto in2 = svcreate4(svdup_n(broadcast_value_f), svdup_n(broadcast_value_f), svdup_n(broadcast_value_f), svdup_n(broadcast_value_f)); - - int x = window_start_x; - - svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x); - do + execute_window_loop( + win, + [&](const Coordinates &) { - const auto in1 = load_quantized(non_broadcast_input_ptr + x, pg, non_broadcast_voffset, non_broadcast_vscale); - - svfloat32x4_t result{}; - - if(!is_broadcast_input_2) + auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr()); + const auto non_broadcast_input_ptr = reinterpret_cast<const ScalarType *>(non_broadcast_input.ptr()); + const ScalarType broadcast_value = *reinterpret_cast<const ScalarType *>(broadcast_input.ptr()); + const float broadcast_value_f = + Qasymm8QuantizationHelper<ScalarType>::dequantize(broadcast_value, broadcast_qinfo); + const auto in2 = svcreate4(svdup_n(broadcast_value_f), svdup_n(broadcast_value_f), + svdup_n(broadcast_value_f), svdup_n(broadcast_value_f)); + + int x = window_start_x; + + svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x); + do { - result = svcreate4( - elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in2, 0), svget4(in1, 0), op), - elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in2, 1), svget4(in1, 1), op), - elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in2, 2), svget4(in1, 2), op), - elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in2, 3), svget4(in1, 3), op)); - } - else - { - result = svcreate4( - elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 0), svget4(in2, 0), op), - elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 1), svget4(in2, 1), op), - elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 2), svget4(in2, 2), op), - elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 3), svget4(in2, 3), op)); - } - - store_quantized(output_ptr + x, pg, result, output_voffset, output_vscale); - - x += wrapper::svcnt<ScalarType>(); - pg = wrapper::svwhilelt<ScalarType>(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - broadcast_input, non_broadcast_input, output); + const auto in1 = + load_quantized(non_broadcast_input_ptr + x, pg, non_broadcast_voffset, non_broadcast_vscale); + + svfloat32x4_t result{}; + + if (!is_broadcast_input_2) + { + result = + svcreate4(elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in2, 0), svget4(in1, 0), op), + elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in2, 1), svget4(in1, 1), op), + elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in2, 2), svget4(in1, 2), op), + elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in2, 3), svget4(in1, 3), op)); + } + else + { + result = + 
svcreate4(elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 0), svget4(in2, 0), op), + elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 1), svget4(in2, 1), op), + elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 2), svget4(in2, 2), op), + elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 3), svget4(in2, 3), op)); + } + + store_quantized(output_ptr + x, pg, result, output_voffset, output_vscale); + + x += wrapper::svcnt<ScalarType>(); + pg = wrapper::svwhilelt<ScalarType>(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); + }, + broadcast_input, non_broadcast_input, output); } else { @@ -200,41 +199,44 @@ void elementwise_arithmetic_quantized_op(const ITensor *in1, const ITensor *in2, const auto in2_voffset = svdup_n(in2->info()->quantization_info().uniform().offset); const auto in2_vscale = svdup_n(in2->info()->quantization_info().uniform().scale); - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr()); - const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr()); - const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr()); + execute_window_loop( + win, + [&](const Coordinates &) + { + auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr()); + const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr()); - int x = window_start_x; + int x = window_start_x; - svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x); - do - { - const auto in1 = load_quantized(input1_ptr + x, pg, in1_voffset, in1_vscale); - const auto in2 = load_quantized(input2_ptr + x, pg, in2_voffset, in2_vscale); - - const auto result = svcreate4( - elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 0), svget4(in2, 0), op), - elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 1), svget4(in2, 1), op), - elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 2), svget4(in2, 2), op), - elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 3), svget4(in2, 3), op)); - - store_quantized(output_ptr + x, pg, result, output_voffset, output_vscale); - - x += wrapper::svcnt<ScalarType>(); - pg = wrapper::svwhilelt<ScalarType>(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - input1, input2, output); + svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x); + do + { + const auto in1 = load_quantized(input1_ptr + x, pg, in1_voffset, in1_vscale); + const auto in2 = load_quantized(input2_ptr + x, pg, in2_voffset, in2_vscale); + + const auto result = + svcreate4(elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 0), svget4(in2, 0), op), + elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 1), svget4(in2, 1), op), + elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 2), svget4(in2, 2), op), + elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 3), svget4(in2, 3), op)); + + store_quantized(output_ptr + x, pg, result, output_voffset, output_vscale); + + x += wrapper::svcnt<ScalarType>(); + pg = wrapper::svwhilelt<ScalarType>(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); + }, + input1, input2, output); } } template <typename InputScalarType, typename OutputScalarType = uint8_t> -void elementwise_comparison_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *out, ComparisonOperation op, const Window &window) +void elementwise_comparison_quantized_op( + const ITensor *in1, const ITensor *in2, ITensor *out, ComparisonOperation op, const Window 
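The two branches above differ only in operand order: for non-commutative operations (SUB, DIV, POWER, PRELU) the broadcast scalar must stay in its original tensor position, so the arguments are swapped when input2 is the broadcast side. A scalar illustration (hypothetical helper, not from the patch):

float apply_sub(bool is_broadcast_input_2, float non_broadcast_val, float broadcast_val)
{
    // Keep tensor input1 on the left and tensor input2 on the right,
    // regardless of which of the two was collapsed to a single broadcast value.
    const float a = is_broadcast_input_2 ? non_broadcast_val : broadcast_val;
    const float b = is_broadcast_input_2 ? broadcast_val : non_broadcast_val;
    return a - b; // e.g. ArithmeticOperation::SUB
}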
&window) { - static_assert(sizeof(InputScalarType) >= sizeof(OutputScalarType), "input data type's width should be equal to or greater than output data type's width"); + static_assert(sizeof(InputScalarType) >= sizeof(OutputScalarType), + "input data type's width should be equal to or greater than output data type's width"); using OutputVectorType = typename wrapper::traits::sve_vector<OutputScalarType>::type; const auto all_true_pg = wrapper::svptrue<InputScalarType>(); @@ -251,7 +253,7 @@ void elementwise_comparison_quantized_op(const ITensor *in1, const ITensor *in2, const auto window_end_x = static_cast<int>(window.x().end()); const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); - if(is_broadcast_across_x) + if (is_broadcast_across_x) { const bool is_broadcast_input_2 = input2_win.x().step() == 0; Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; @@ -259,8 +261,10 @@ void elementwise_comparison_quantized_op(const ITensor *in1, const ITensor *in2, const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1; - const auto non_broadcast_qinfo = is_broadcast_input_2 ? in1->info()->quantization_info() : in2->info()->quantization_info(); - const auto broadcast_qinfo = is_broadcast_input_2 ? in2->info()->quantization_info() : in1->info()->quantization_info(); + const auto non_broadcast_qinfo = + is_broadcast_input_2 ? in1->info()->quantization_info() : in2->info()->quantization_info(); + const auto broadcast_qinfo = + is_broadcast_input_2 ? in2->info()->quantization_info() : in1->info()->quantization_info(); const auto non_broadcast_voffset = svdup_n(non_broadcast_qinfo.uniform().offset); const auto non_broadcast_vscale = svdup_n(non_broadcast_qinfo.uniform().scale); @@ -272,51 +276,63 @@ void elementwise_comparison_quantized_op(const ITensor *in1, const ITensor *in2, Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); Iterator output(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr()); - const auto non_broadcast_input_ptr = reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr()); - const InputScalarType broadcast_value = *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr()); - const float broadcast_value_f = Qasymm8QuantizationHelper<InputScalarType>::dequantize(broadcast_value, broadcast_qinfo); - const auto in2 = svcreate4(svdup_n(broadcast_value_f), svdup_n(broadcast_value_f), svdup_n(broadcast_value_f), svdup_n(broadcast_value_f)); - - int x = window_start_x; - - svbool_t pg = wrapper::svwhilelt<InputScalarType>(x, window_end_x); - do + execute_window_loop( + win, + [&](const Coordinates &) { - const auto in1 = load_quantized(non_broadcast_input_ptr + x, pg, non_broadcast_voffset, non_broadcast_vscale); - - svuint8x4_t result{}; - - if(!is_broadcast_input_2) + auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr()); + const auto non_broadcast_input_ptr = + reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr()); + const InputScalarType broadcast_value = + *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr()); + const float broadcast_value_f = + Qasymm8QuantizationHelper<InputScalarType>::dequantize(broadcast_value, broadcast_qinfo); + const auto in2 = svcreate4(svdup_n(broadcast_value_f), svdup_n(broadcast_value_f), + svdup_n(broadcast_value_f), svdup_n(broadcast_value_f)); + 
+ int x = window_start_x; + + svbool_t pg = wrapper::svwhilelt<InputScalarType>(x, window_end_x); + do { - result = svcreate4( - elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in2, 0), svget4(in1, 0), op), - elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in2, 1), svget4(in1, 1), op), - elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in2, 2), svget4(in1, 2), op), - elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in2, 3), svget4(in1, 3), op)); - } - else - { - result = svcreate4( - elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 0), svget4(in2, 0), op), - elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 1), svget4(in2, 1), op), - elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 2), svget4(in2, 2), op), - elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 3), svget4(in2, 3), op)); - } - - const auto zipped_bottom = svzip1(svget4(result, 0), svget4(result, 1)); - const auto zipped_top = svzip1(svget4(result, 2), svget4(result, 3)); - const auto zipped = svzip1(zipped_bottom, zipped_top); - svst1(pg, output_ptr + x, zipped); - - x += wrapper::svcnt<InputScalarType>(); - pg = wrapper::svwhilelt<InputScalarType>(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - broadcast_input, non_broadcast_input, output); + const auto in1 = + load_quantized(non_broadcast_input_ptr + x, pg, non_broadcast_voffset, non_broadcast_vscale); + + svuint8x4_t result{}; + + if (!is_broadcast_input_2) + { + result = svcreate4(elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in2, 0), + svget4(in1, 0), op), + elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in2, 1), + svget4(in1, 1), op), + elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in2, 2), + svget4(in1, 2), op), + elementwise_comparison_op<svfloat32_t, OutputVectorType>( + pg, svget4(in2, 3), svget4(in1, 3), op)); + } + else + { + result = svcreate4(elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 0), + svget4(in2, 0), op), + elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 1), + svget4(in2, 1), op), + elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 2), + svget4(in2, 2), op), + elementwise_comparison_op<svfloat32_t, OutputVectorType>( + pg, svget4(in1, 3), svget4(in2, 3), op)); + } + + const auto zipped_bottom = svzip1(svget4(result, 0), svget4(result, 1)); + const auto zipped_top = svzip1(svget4(result, 2), svget4(result, 3)); + const auto zipped = svzip1(zipped_bottom, zipped_top); + svst1(pg, output_ptr + x, zipped); + + x += wrapper::svcnt<InputScalarType>(); + pg = wrapper::svwhilelt<InputScalarType>(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); + }, + broadcast_input, non_broadcast_input, output); } else { @@ -334,39 +350,44 @@ void elementwise_comparison_quantized_op(const ITensor *in1, const ITensor *in2, const auto in2_voffset = svdup_n(in2->info()->quantization_info().uniform().offset); const auto in2_vscale = svdup_n(in2->info()->quantization_info().uniform().scale); - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr()); - const auto input1_ptr = reinterpret_cast<const InputScalarType *>(input1.ptr()); - const auto input2_ptr = reinterpret_cast<const InputScalarType *>(input2.ptr()); + execute_window_loop( + win, + [&](const Coordinates &) + { + auto 
output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr()); + const auto input1_ptr = reinterpret_cast<const InputScalarType *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const InputScalarType *>(input2.ptr()); - int x = window_start_x; + int x = window_start_x; - svbool_t pg = wrapper::svwhilelt<InputScalarType>(x, window_end_x); - do - { - const auto in1 = load_quantized(input1_ptr + x, pg, in1_voffset, in1_vscale); - const auto in2 = load_quantized(input2_ptr + x, pg, in2_voffset, in2_vscale); - const auto result = svcreate4( - elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 0), svget4(in2, 0), op), - elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 1), svget4(in2, 1), op), - elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 2), svget4(in2, 2), op), - elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 3), svget4(in2, 3), op)); - - const auto zipped_bottom = svzip1(svget4(result, 0), svget4(result, 1)); - const auto zipped_top = svzip1(svget4(result, 2), svget4(result, 3)); - const auto zipped = svzip1(zipped_bottom, zipped_top); - svst1(pg, output_ptr + x, zipped); - - x += wrapper::svcnt<InputScalarType>(); - pg = wrapper::svwhilelt<InputScalarType>(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - input1, input2, output); + svbool_t pg = wrapper::svwhilelt<InputScalarType>(x, window_end_x); + do + { + const auto in1 = load_quantized(input1_ptr + x, pg, in1_voffset, in1_vscale); + const auto in2 = load_quantized(input2_ptr + x, pg, in2_voffset, in2_vscale); + const auto result = + svcreate4(elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 0), + svget4(in2, 0), op), + elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 1), + svget4(in2, 1), op), + elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 2), + svget4(in2, 2), op), + elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 3), + svget4(in2, 3), op)); + + const auto zipped_bottom = svzip1(svget4(result, 0), svget4(result, 1)); + const auto zipped_top = svzip1(svget4(result, 2), svget4(result, 3)); + const auto zipped = svzip1(zipped_bottom, zipped_top); + svst1(pg, output_ptr + x, zipped); + + x += wrapper::svcnt<InputScalarType>(); + pg = wrapper::svwhilelt<InputScalarType>(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); + }, + input1, input2, output); } } } // namespace cpu } // namespace arm_compute -#endif /* SRC_CORE_SVE_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H */
\ No newline at end of file +#endif /* SRC_CORE_SVE_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H */ diff --git a/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8.cpp b/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8.cpp index 7435bb4f29..5cc66642d7 100644 --- a/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8.cpp +++ b/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8.cpp @@ -23,6 +23,7 @@ */ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/elementwise_binary/generic/sve2/impl.h" namespace arm_compute { @@ -34,27 +35,72 @@ void sve2_qasymm8_elementwise_binary(const ITensor *in1, const ITensor *in2, ITe return elementwise_arithmetic_quantized_op<uint8_t>(in1, in2, out, op, window); } -template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); template <ComparisonOperation op> -void sve2_qasymm8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +void sve2_qasymm8_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window) { return elementwise_comparison_quantized_op<uint8_t>(in1, 
in2, out, op, window); } -template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8_signed.cpp b/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8_signed.cpp index 1027a1eed0..165e0c05fa 100644 --- a/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8_signed.cpp +++ b/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8_signed.cpp @@ -23,6 +23,7 @@ */ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/elementwise_binary/generic/sve2/impl.h" namespace arm_compute { @@ -34,27 +35,70 @@ void sve2_qasymm8_signed_elementwise_binary(const ITensor *in1, const ITensor *i return elementwise_arithmetic_quantized_op<int8_t>(in1, in2, out, op, window); } -template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template 
void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); template <ComparisonOperation op> -void sve2_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +void sve2_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window) { return elementwise_comparison_quantized_op<int8_t>(in1, in2, out, op, window); } -template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const 
Window &window); +template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>( + const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); +template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/fp16.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/fp16.cpp index b2833c2481..2588db024d 100644 --- a/src/cpu/kernels/elementwise_unary/generic/neon/fp16.cpp +++ b/src/cpu/kernels/elementwise_unary/generic/neon/fp16.cpp @@ -23,17 +23,19 @@ */ #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h" namespace arm_compute { namespace cpu { -void neon_fp16_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) +void neon_fp16_elementwise_unary( + const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) { ARM_COMPUTE_UNUSED(lut); return elementwise_op<__fp16>(in, out, window, op); } -} +} // namespace cpu } // namespace arm_compute #endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/fp32.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/fp32.cpp index 6566821eca..936a2e588a 100644 --- a/src/cpu/kernels/elementwise_unary/generic/neon/fp32.cpp +++ b/src/cpu/kernels/elementwise_unary/generic/neon/fp32.cpp @@ -22,16 +22,18 @@ * SOFTWARE. 
*/ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h" namespace arm_compute { namespace cpu { -void neon_fp32_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) +void neon_fp32_elementwise_unary( + const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) { ARM_COMPUTE_UNUSED(lut); return elementwise_op<float>(in, out, window, op); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/impl.h b/src/cpu/kernels/elementwise_unary/generic/neon/impl.h index dbc1dde4fa..d54d3984cb 100644 --- a/src/cpu/kernels/elementwise_unary/generic/neon/impl.h +++ b/src/cpu/kernels/elementwise_unary/generic/neon/impl.h @@ -26,6 +26,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Types.h" + #include "src/core/NEON/NEAsymm.h" #include "src/core/NEON/wrapper/intrinsics/intrinsics.h" @@ -36,7 +37,7 @@ namespace cpu template <typename ScalarType> inline ScalarType elementwise_op_scalar_imp(ElementWiseUnary op, const ScalarType &a) { - switch(op) + switch (op) { case ElementWiseUnary::RSQRT: return 1 / sqrt(a); @@ -60,7 +61,7 @@ inline ScalarType elementwise_op_scalar_imp(ElementWiseUnary op, const ScalarTyp template <typename ScalarType, typename VectorType> inline VectorType elementwise_op_imp(ElementWiseUnary op, const VectorType &a) { - switch(op) + switch (op) { case ElementWiseUnary::RSQRT: return wrapper::vinvsqrt(a); @@ -94,22 +95,24 @@ inline void elementwise_op(const ITensor *in, ITensor *out, const Window &window Iterator input(in, win); Iterator output(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr()); - const auto input_ptr = reinterpret_cast<const ScalarType *>(input.ptr()); - - int x = window_start_x; - for(; x <= window_end_x - window_step_x; x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - wrapper::vstore(output_ptr + x, elementwise_op_imp<ScalarType>(op, wrapper::vloadq(input_ptr + x))); - } - for(; x < window_end_x; ++x) - { - *(output_ptr + x) = elementwise_op_scalar_imp(op, *(input_ptr + x)); - } - }, - input, output); + auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr()); + const auto input_ptr = reinterpret_cast<const ScalarType *>(input.ptr()); + + int x = window_start_x; + for (; x <= window_end_x - window_step_x; x += window_step_x) + { + wrapper::vstore(output_ptr + x, elementwise_op_imp<ScalarType>(op, wrapper::vloadq(input_ptr + x))); + } + for (; x < window_end_x; ++x) + { + *(output_ptr + x) = elementwise_op_scalar_imp(op, *(input_ptr + x)); + } + }, + input, output); } template <> @@ -128,75 +131,81 @@ inline void elementwise_op<int8_t>(const ITensor *in, ITensor *out, const Window Iterator input(in, win); Iterator output(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - int8x16_t vout; - auto output_ptr = reinterpret_cast<int8_t *>(output.ptr()); - const auto input_ptr = reinterpret_cast<const int8_t *>(input.ptr()); - const auto vconst_0_f32 = vdupq_n_f32(0); - auto clamped_value = (op == ElementWiseUnary::LOG) ? 
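A compact sketch (illustrative only, written with plain ACLE NEON intrinsics rather than the library wrappers) of the main-loop-plus-scalar-tail shape that the generic `elementwise_op` above follows:

#include <arm_neon.h>

void negate_f32(const float *src, float *dst, int n)
{
    constexpr int step = 4; // lanes in a float32x4_t
    int           x    = 0;
    for (; x <= n - step; x += step)
    {
        vst1q_f32(dst + x, vnegq_f32(vld1q_f32(src + x))); // vectorised body
    }
    for (; x < n; ++x)
    {
        dst[x] = -src[x]; // scalar tail for the leftover elements
    }
}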
min_clamped_value : max_clamped_value; - - int x = window_start_x; - for(; x <= window_end_x - window_step_x; x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const auto vin = wrapper::vloadq(input_ptr + x); - - // De-quantize - const auto vin_deq = vdequantize(vin, qi_in); + int8x16_t vout; + auto output_ptr = reinterpret_cast<int8_t *>(output.ptr()); + const auto input_ptr = reinterpret_cast<const int8_t *>(input.ptr()); + const auto vconst_0_f32 = vdupq_n_f32(0); + auto clamped_value = (op == ElementWiseUnary::LOG) ? min_clamped_value : max_clamped_value; - // Perform activation - float32x4x4_t vtmp_deq = + int x = window_start_x; + for (; x <= window_end_x - window_step_x; x += window_step_x) { - { + const auto vin = wrapper::vloadq(input_ptr + x); + + // De-quantize + const auto vin_deq = vdequantize(vin, qi_in); + + // Perform activation + float32x4x4_t vtmp_deq = {{ elementwise_op_imp<float>(op, vin_deq.val[0]), elementwise_op_imp<float>(op, vin_deq.val[1]), elementwise_op_imp<float>(op, vin_deq.val[2]), elementwise_op_imp<float>(op, vin_deq.val[3]), + }}; + + if ((op == ElementWiseUnary::LOG) || (op == ElementWiseUnary::RSQRT)) + { + vtmp_deq.val[0] = + vbslq_f32(vcleq_f32(vin_deq.val[0], vconst_0_f32), clamped_value, vtmp_deq.val[0]); + vtmp_deq.val[1] = + vbslq_f32(vcleq_f32(vin_deq.val[1], vconst_0_f32), clamped_value, vtmp_deq.val[1]); + vtmp_deq.val[2] = + vbslq_f32(vcleq_f32(vin_deq.val[2], vconst_0_f32), clamped_value, vtmp_deq.val[2]); + vtmp_deq.val[3] = + vbslq_f32(vcleq_f32(vin_deq.val[3], vconst_0_f32), clamped_value, vtmp_deq.val[3]); } - }; - if((op == ElementWiseUnary::LOG) || (op == ElementWiseUnary::RSQRT)) - { - vtmp_deq.val[0] = vbslq_f32(vcleq_f32(vin_deq.val[0], vconst_0_f32), clamped_value, vtmp_deq.val[0]); - vtmp_deq.val[1] = vbslq_f32(vcleq_f32(vin_deq.val[1], vconst_0_f32), clamped_value, vtmp_deq.val[1]); - vtmp_deq.val[2] = vbslq_f32(vcleq_f32(vin_deq.val[2], vconst_0_f32), clamped_value, vtmp_deq.val[2]); - vtmp_deq.val[3] = vbslq_f32(vcleq_f32(vin_deq.val[3], vconst_0_f32), clamped_value, vtmp_deq.val[3]); + // Re-quantize to new output space + vout = vquantize_signed(vtmp_deq, qi_out); + wrapper::vstore(output_ptr + x, vout); } - - // Re-quantize to new output space - vout = vquantize_signed(vtmp_deq, qi_out); - wrapper::vstore(output_ptr + x, vout); - } - for(; x < window_end_x; ++x) - { - qasymm8_signed_t in = *(reinterpret_cast<const qasymm8_signed_t *>(input_ptr + x)); - qasymm8_signed_t tmp = 0; - float tmp_f = dequantize_qasymm8_signed(in, qi_in); - if(tmp_f <= 0.0) + for (; x < window_end_x; ++x) { - if(op == ElementWiseUnary::LOG) - { - tmp_f = (-128 - qi_out.offset) * qi_out.scale; - } - else if(op == ElementWiseUnary::RSQRT) + qasymm8_signed_t in = *(reinterpret_cast<const qasymm8_signed_t *>(input_ptr + x)); + qasymm8_signed_t tmp = 0; + float tmp_f = dequantize_qasymm8_signed(in, qi_in); + if (tmp_f <= 0.0) { - tmp_f = (127 - qi_out.offset) * qi_out.scale; + if (op == ElementWiseUnary::LOG) + { + tmp_f = (-128 - qi_out.offset) * qi_out.scale; + } + else if (op == ElementWiseUnary::RSQRT) + { + tmp_f = (127 - qi_out.offset) * qi_out.scale; + } + else + { + tmp_f = elementwise_op_scalar_imp<float>(op, tmp_f); + } } else { tmp_f = elementwise_op_scalar_imp<float>(op, tmp_f); } + tmp = quantize_qasymm8_signed( + tmp_f, qi_out, + RoundingPolicy:: + TO_ZERO); // Set rounding policy TO_ZERO to be compatible with vquantize_signed() used above that follow same policy for armv7a. 
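A scalar sketch (illustrative, not from the patch) of the saturation rule applied here for quantized LOG and RSQRT: an input that dequantizes to a non-positive value has no finite result, so the output is pinned to the lowest (LOG) or highest (RSQRT) dequantized value the output type can encode, i.e. the `min_clamped_value`/`max_clamped_value` pair above.

#include <cmath>

float quantized_unary_ref(float x, bool is_log, float out_min, float out_max)
{
    if (x <= 0.0f)
    {
        return is_log ? out_min : out_max; // LOG saturates low, RSQRT saturates high
    }
    return is_log ? std::log(x) : 1.0f / std::sqrt(x);
}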
+ // For aarch64 LUT is used and rounding to nearest is used + *(output_ptr + x) = tmp; } - else - { - tmp_f = elementwise_op_scalar_imp<float>(op, tmp_f); - } - tmp = quantize_qasymm8_signed(tmp_f, qi_out, RoundingPolicy::TO_ZERO); // Set rounding policy TO_ZERO to be compatible with vquantize_signed() used above that follow same policy for armv7a. - // For aarch64 LUT is used and rounding to nearest is used - *(output_ptr + x) = tmp; - } - }, - input, output); + }, + input, output); } template <> inline void elementwise_op<uint8_t>(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op) @@ -215,71 +224,74 @@ inline void elementwise_op<uint8_t>(const ITensor *in, ITensor *out, const Windo Iterator input(in, win); Iterator output(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - uint8x16_t vout; - auto clamped_value = (op == ElementWiseUnary::LOG) ? min_clamped_value : max_clamped_value; - auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr()); - const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); - int x = window_start_x; - for(; x <= window_end_x - window_step_x; x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const auto vin = wrapper::vloadq(input_ptr + x); + uint8x16_t vout; + auto clamped_value = (op == ElementWiseUnary::LOG) ? min_clamped_value : max_clamped_value; + auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr()); + const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); + int x = window_start_x; + for (; x <= window_end_x - window_step_x; x += window_step_x) + { + const auto vin = wrapper::vloadq(input_ptr + x); - // De-quantize - const auto vin_deq = vdequantize(vin, qi_in); + // De-quantize + const auto vin_deq = vdequantize(vin, qi_in); - // Perform activation - float32x4x4_t vtmp_deq = - { - { + // Perform activation + float32x4x4_t vtmp_deq = {{ elementwise_op_imp<float>(op, vin_deq.val[0]), elementwise_op_imp<float>(op, vin_deq.val[1]), elementwise_op_imp<float>(op, vin_deq.val[2]), elementwise_op_imp<float>(op, vin_deq.val[3]), + }}; + if ((op == ElementWiseUnary::LOG) || (op == ElementWiseUnary::RSQRT)) + { + vtmp_deq.val[0] = + vbslq_f32(vcleq_f32(vin_deq.val[0], vconst_0_f32), clamped_value, vtmp_deq.val[0]); + vtmp_deq.val[1] = + vbslq_f32(vcleq_f32(vin_deq.val[1], vconst_0_f32), clamped_value, vtmp_deq.val[1]); + vtmp_deq.val[2] = + vbslq_f32(vcleq_f32(vin_deq.val[2], vconst_0_f32), clamped_value, vtmp_deq.val[2]); + vtmp_deq.val[3] = + vbslq_f32(vcleq_f32(vin_deq.val[3], vconst_0_f32), clamped_value, vtmp_deq.val[3]); } - }; - if((op == ElementWiseUnary::LOG) || (op == ElementWiseUnary::RSQRT)) - { - vtmp_deq.val[0] = vbslq_f32(vcleq_f32(vin_deq.val[0], vconst_0_f32), clamped_value, vtmp_deq.val[0]); - vtmp_deq.val[1] = vbslq_f32(vcleq_f32(vin_deq.val[1], vconst_0_f32), clamped_value, vtmp_deq.val[1]); - vtmp_deq.val[2] = vbslq_f32(vcleq_f32(vin_deq.val[2], vconst_0_f32), clamped_value, vtmp_deq.val[2]); - vtmp_deq.val[3] = vbslq_f32(vcleq_f32(vin_deq.val[3], vconst_0_f32), clamped_value, vtmp_deq.val[3]); - } - // Re-quantize to new output space - vout = vquantize(vtmp_deq, qi_out); - wrapper::vstore(output_ptr + x, vout); - } - for(; x < window_end_x; ++x) - { - qasymm8_t in = *(reinterpret_cast<const qasymm8_t *>(input_ptr + x)); - qasymm8_t tmp = 0; - float tmp_f = dequantize_qasymm8(in, qi_in); - if(tmp_f <= 0.0) + // Re-quantize to new output space + vout = vquantize(vtmp_deq, qi_out); + wrapper::vstore(output_ptr + x, vout); + } + for (; x < 
window_end_x; ++x) { - if(op == ElementWiseUnary::LOG) + qasymm8_t in = *(reinterpret_cast<const qasymm8_t *>(input_ptr + x)); + qasymm8_t tmp = 0; + float tmp_f = dequantize_qasymm8(in, qi_in); + if (tmp_f <= 0.0) { - tmp_f = (0 - qi_out.offset) * qi_out.scale; - } - else if(op == ElementWiseUnary::RSQRT) - { - tmp_f = (255 - qi_out.offset) * qi_out.scale; + if (op == ElementWiseUnary::LOG) + { + tmp_f = (0 - qi_out.offset) * qi_out.scale; + } + else if (op == ElementWiseUnary::RSQRT) + { + tmp_f = (255 - qi_out.offset) * qi_out.scale; + } + else + { + tmp_f = elementwise_op_scalar_imp<float>(op, tmp_f); + } } else { tmp_f = elementwise_op_scalar_imp<float>(op, tmp_f); } + tmp = quantize_qasymm8(tmp_f, qi_out, RoundingPolicy::TO_ZERO); + *(output_ptr + x) = tmp; } - else - { - tmp_f = elementwise_op_scalar_imp<float>(op, tmp_f); - } - tmp = quantize_qasymm8(tmp_f, qi_out, RoundingPolicy::TO_ZERO); - *(output_ptr + x) = tmp; - } - }, - input, output); + }, + input, output); } } // namespace cpu diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/integer.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/integer.cpp index dfe5e30035..d4daad4ca6 100644 --- a/src/cpu/kernels/elementwise_unary/generic/neon/integer.cpp +++ b/src/cpu/kernels/elementwise_unary/generic/neon/integer.cpp @@ -22,16 +22,18 @@ * SOFTWARE. */ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h" namespace arm_compute { namespace cpu { -void neon_s32_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) +void neon_s32_elementwise_unary( + const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) { ARM_COMPUTE_UNUSED(lut); return elementwise_op<int32_t>(in, out, window, op); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/q8.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/q8.cpp index 08bb7f28b6..38cb61d0ff 100644 --- a/src/cpu/kernels/elementwise_unary/generic/neon/q8.cpp +++ b/src/cpu/kernels/elementwise_unary/generic/neon/q8.cpp @@ -23,6 +23,7 @@ */ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/lut/list.h" namespace arm_compute @@ -32,24 +33,28 @@ namespace cpu #ifdef __aarch64__ -void neon_q8_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) +void neon_q8_elementwise_unary( + const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) { ARM_COMPUTE_UNUSED(op); - auto win = window; + auto win = window; const auto window_end_x = window.x().end(); win.set(0, Window::Dimension(0, 1, 1)); Iterator src_it(in, win); Iterator dst_it(out, win); - execute_window_loop(win, [&](const Coordinates &) { - const auto src_ptr = src_it.ptr(); - auto dst_ptr = dst_it.ptr(); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto src_ptr = src_it.ptr(); + auto dst_ptr = dst_it.ptr(); - lut_u8_neon(lut, 1, window_end_x, &src_ptr, &dst_ptr); - }, - src_it, dst_it); + lut_u8_neon(lut, 1, window_end_x, &src_ptr, &dst_ptr); + }, + src_it, dst_it); } #endif // __aarch64__ diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8.cpp index d987f7747b..3e4b88eb47 100644 --- a/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8.cpp +++ b/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8.cpp @@ -23,6 +23,7 @@ 
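Aside (not part of the patch): for 8-bit types any unary operation collapses into a byte-to-byte table lookup, which is what the aarch64 `neon_q8_elementwise_unary` path passes to `lut_u8_neon`. A hypothetical builder for an RSQRT table, using the same non-positive-input clamp as the fallback path:

#include <algorithm>
#include <array>
#include <cmath>
#include <cstdint>

std::array<uint8_t, 256> build_rsqrt_lut(float in_scale, int32_t in_offset, float out_scale, int32_t out_offset)
{
    std::array<uint8_t, 256> lut{};
    for (int q = 0; q < 256; ++q)
    {
        const float x = static_cast<float>(q - in_offset) * in_scale;               // dequantize
        const float y = (x > 0.0f) ? 1.0f / std::sqrt(x)                            // RSQRT
                                   : static_cast<float>(255 - out_offset) * out_scale; // clamp for x <= 0
        const long  r = std::lround(y / out_scale) + out_offset;                    // requantize
        lut[q]        = static_cast<uint8_t>(std::min<long>(std::max<long>(r, 0), 255));
    }
    return lut; // applied per element: dst[i] = lut[src[i]]
}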
*/ #include "arm_compute/core/Window.h" + #include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h" namespace arm_compute @@ -31,7 +32,8 @@ namespace cpu { #ifndef __aarch64__ // Fallback function to be used for armv7a, for aarch64 LUT is used -void neon_qasymm8_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) +void neon_qasymm8_elementwise_unary( + const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) { ARM_COMPUTE_UNUSED(lut); return elementwise_op<uint8_t>(in, out, window, op); diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8_signed.cpp index e00970a1e0..a5f4b053e3 100644 --- a/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8_signed.cpp +++ b/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8_signed.cpp @@ -23,6 +23,7 @@ */ #include "arm_compute/core/Window.h" + #include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h" namespace arm_compute @@ -31,7 +32,8 @@ namespace cpu { #ifndef __aarch64__ // Fallback function to be used for armv7a, for aarch64 LUT is used -void neon_qasymm8_signed_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) +void neon_qasymm8_signed_elementwise_unary( + const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) { ARM_COMPUTE_UNUSED(lut); return elementwise_op<int8_t>(in, out, window, op); diff --git a/src/cpu/kernels/elementwise_unary/generic/sve/fp16.cpp b/src/cpu/kernels/elementwise_unary/generic/sve/fp16.cpp index a883309b2e..22ff43c5d9 100644 --- a/src/cpu/kernels/elementwise_unary/generic/sve/fp16.cpp +++ b/src/cpu/kernels/elementwise_unary/generic/sve/fp16.cpp @@ -23,6 +23,7 @@ */ #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) #include "arm_compute/core/Helpers.h" + #include "src/cpu/CpuTypes.h" #include "src/cpu/kernels/elementwise_unary/generic/sve/impl.h" @@ -30,11 +31,12 @@ namespace arm_compute { namespace cpu { -void sve_fp16_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) +void sve_fp16_elementwise_unary( + const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) { ARM_COMPUTE_UNUSED(lut); return elementwise_sve_op<float16_t>(in, out, window, op); } -} +} // namespace cpu } // namespace arm_compute #endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/elementwise_unary/generic/sve/fp32.cpp b/src/cpu/kernels/elementwise_unary/generic/sve/fp32.cpp index b21ed8ddbc..394bd47adf 100644 --- a/src/cpu/kernels/elementwise_unary/generic/sve/fp32.cpp +++ b/src/cpu/kernels/elementwise_unary/generic/sve/fp32.cpp @@ -23,6 +23,7 @@ */ #include "arm_compute/core/Helpers.h" + #include "src/cpu/CpuTypes.h" #include "src/cpu/kernels/elementwise_unary/generic/sve/impl.h" @@ -30,10 +31,11 @@ namespace arm_compute { namespace cpu { -void sve_fp32_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) +void sve_fp32_elementwise_unary( + const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) { ARM_COMPUTE_UNUSED(lut); return elementwise_sve_op<float32_t>(in, out, window, op); } -} +} // namespace cpu } // namespace arm_compute diff --git 
a/src/cpu/kernels/elementwise_unary/generic/sve/impl.cpp b/src/cpu/kernels/elementwise_unary/generic/sve/impl.cpp index a948862906..5af534d9e7 100644 --- a/src/cpu/kernels/elementwise_unary/generic/sve/impl.cpp +++ b/src/cpu/kernels/elementwise_unary/generic/sve/impl.cpp @@ -24,6 +24,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/utils/misc/Traits.h" + #include "src/core/NEON/wrapper/intrinsics/intrinsics.h" namespace arm_compute @@ -31,9 +32,10 @@ namespace arm_compute namespace cpu { template <typename ScalarType, typename VectorType> -inline typename std::enable_if<utils::traits::is_floating_point<ScalarType>::value, VectorType>::type elementwise_op_sve_imp(svbool_t pg, ElementWiseUnary op, const VectorType &a) +inline typename std::enable_if<utils::traits::is_floating_point<ScalarType>::value, VectorType>::type +elementwise_op_sve_imp(svbool_t pg, ElementWiseUnary op, const VectorType &a) { - switch(op) + switch (op) { case ElementWiseUnary::RSQRT: return svinvsqrt(pg, a); @@ -55,9 +57,10 @@ inline typename std::enable_if<utils::traits::is_floating_point<ScalarType>::val } template <typename ScalarType, typename VectorType> -inline typename std::enable_if<std::is_integral<ScalarType>::value, VectorType>::type elementwise_op_sve_imp(svbool_t pg, ElementWiseUnary op, const VectorType &a) +inline typename std::enable_if<std::is_integral<ScalarType>::value, VectorType>::type +elementwise_op_sve_imp(svbool_t pg, ElementWiseUnary op, const VectorType &a) { - switch(op) + switch (op) { case ElementWiseUnary::NEG: return svneg_z(pg, a); @@ -81,23 +84,24 @@ void elementwise_sve_op(const ITensor *in, ITensor *out, const Window &window, E Iterator input(in, win); Iterator output(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr()); - const auto input_ptr = reinterpret_cast<const ScalarType *>(input.ptr()); - int x = window_start_x; - - svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x); - do + execute_window_loop( + win, + [&](const Coordinates &) { - const auto vin = svld1(pg, input_ptr + x); - svst1(pg, output_ptr + x, elementwise_op_sve_imp<ScalarType, decltype(vin)>(pg, op, vin)); - x += wrapper::svcnt<ScalarType>(); - pg = wrapper::svwhilelt<ScalarType>(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - input, output); + auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr()); + const auto input_ptr = reinterpret_cast<const ScalarType *>(input.ptr()); + int x = window_start_x; + + svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x); + do + { + const auto vin = svld1(pg, input_ptr + x); + svst1(pg, output_ptr + x, elementwise_op_sve_imp<ScalarType, decltype(vin)>(pg, op, vin)); + x += wrapper::svcnt<ScalarType>(); + pg = wrapper::svwhilelt<ScalarType>(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); + }, + input, output); } template void elementwise_sve_op<float16_t>(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op); diff --git a/src/cpu/kernels/elementwise_unary/generic/sve/integer.cpp b/src/cpu/kernels/elementwise_unary/generic/sve/integer.cpp index 068c3f7cda..e27fe5a87f 100644 --- a/src/cpu/kernels/elementwise_unary/generic/sve/integer.cpp +++ b/src/cpu/kernels/elementwise_unary/generic/sve/integer.cpp @@ -23,16 +23,18 @@ */ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/elementwise_unary/generic/sve/impl.h" namespace arm_compute { namespace cpu { -void sve_s32_elementwise_unary(const 
ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) +void sve_s32_elementwise_unary( + const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) { ARM_COMPUTE_UNUSED(lut); return elementwise_sve_op<int32_t>(in, out, window, op); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/elementwise_unary/generic/sve2/q8.cpp b/src/cpu/kernels/elementwise_unary/generic/sve2/q8.cpp index 7e32f50132..4e4582debb 100644 --- a/src/cpu/kernels/elementwise_unary/generic/sve2/q8.cpp +++ b/src/cpu/kernels/elementwise_unary/generic/sve2/q8.cpp @@ -23,13 +23,15 @@ */ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/lut/list.h" namespace arm_compute { namespace cpu { -void sve2_q8_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) +void sve2_q8_elementwise_unary( + const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut) { ARM_COMPUTE_UNUSED(op); @@ -40,14 +42,16 @@ void sve2_q8_elementwise_unary(const ITensor *in, ITensor *out, const Window &wi Iterator src_it(in, win); Iterator dst_it(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto src_ptr = src_it.ptr(); - auto dst_ptr = dst_it.ptr(); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto src_ptr = src_it.ptr(); + auto dst_ptr = dst_it.ptr(); - lut_u8_sve2(lut, 1, window_end_x, &src_ptr, &dst_ptr); - }, - src_it, dst_it); + lut_u8_sve2(lut, 1, window_end_x, &src_ptr, &dst_ptr); + }, + src_it, dst_it); } } // namespace cpu diff --git a/src/cpu/kernels/floor/list.h b/src/cpu/kernels/floor/list.h index 4367e0ffc9..5ac78df324 100644 --- a/src/cpu/kernels/floor/list.h +++ b/src/cpu/kernels/floor/list.h @@ -28,8 +28,7 @@ namespace arm_compute { namespace cpu { -#define DECLARE_FLOOR_KERNEL(func_name) \ - void func_name(const void *src, void *dst, int len) +#define DECLARE_FLOOR_KERNEL(func_name) void func_name(const void *src, void *dst, int len) DECLARE_FLOOR_KERNEL(fp16_neon_floor); DECLARE_FLOOR_KERNEL(fp32_neon_floor); diff --git a/src/cpu/kernels/floor/neon/fp16.cpp b/src/cpu/kernels/floor/neon/fp16.cpp index f362676a36..f47690277d 100644 --- a/src/cpu/kernels/floor/neon/fp16.cpp +++ b/src/cpu/kernels/floor/neon/fp16.cpp @@ -45,14 +45,14 @@ void fp16_neon_floor(const void *src, void *dst, int len) auto psrc = static_cast<const __fp16 *>(src); auto pdst = static_cast<__fp16 *>(dst); - for(; len >= step; len -= step) + for (; len >= step; len -= step) { vst1q_f16(pdst, vfloorq_f16(vld1q_f16(psrc))); psrc += step; pdst += step; } - for(; len > 0; --len) + for (; len > 0; --len) { *pdst = std::floor(*psrc); ++psrc; diff --git a/src/cpu/kernels/floor/neon/fp32.cpp b/src/cpu/kernels/floor/neon/fp32.cpp index f5efb2e849..a86e24d3c3 100644 --- a/src/cpu/kernels/floor/neon/fp32.cpp +++ b/src/cpu/kernels/floor/neon/fp32.cpp @@ -43,14 +43,14 @@ void fp32_neon_floor(const void *src, void *dst, int len) auto psrc = static_cast<const float *>(src); auto pdst = static_cast<float *>(dst); - for(; len >= step; len -= step) + for (; len >= step; len -= step) { vst1q_f32(pdst, vfloorq_f32(vld1q_f32(psrc))); psrc += step; pdst += step; } - for(; len > 0; --len) + for (; len > 0; --len) { *pdst = std::floor(*psrc); ++pdst; diff --git a/src/cpu/kernels/fuse_batch_normalization/generic/fp16.cpp b/src/cpu/kernels/fuse_batch_normalization/generic/fp16.cpp index a29ee762fc..2821af32ce 100644 --- 
a/src/cpu/kernels/fuse_batch_normalization/generic/fp16.cpp +++ b/src/cpu/kernels/fuse_batch_normalization/generic/fp16.cpp @@ -29,11 +29,19 @@ namespace arm_compute { namespace cpu { -void fused_batch_normalization_conv_f16(const ITensor *conv_weights, const ITensor *conv_bias, ITensor *fused_weights, ITensor *fused_bias, - const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window) +void fused_batch_normalization_conv_f16(const ITensor *conv_weights, + const ITensor *conv_bias, + ITensor *fused_weights, + ITensor *fused_bias, + const ITensor *bn_mean, + const ITensor *bn_var, + const ITensor *bn_beta, + const ITensor *bn_gamma, + float epsilon, + const Window &window) { - return fused_batch_normalization_conv<float16_t>(conv_weights, conv_bias, fused_weights, fused_bias, - bn_mean, bn_var, bn_beta, bn_gamma, epsilon, window); + return fused_batch_normalization_conv<float16_t>(conv_weights, conv_bias, fused_weights, fused_bias, bn_mean, + bn_var, bn_beta, bn_gamma, epsilon, window); } } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/fuse_batch_normalization/generic/fp32.cpp b/src/cpu/kernels/fuse_batch_normalization/generic/fp32.cpp index 076e97651d..3ca5b6977a 100644 --- a/src/cpu/kernels/fuse_batch_normalization/generic/fp32.cpp +++ b/src/cpu/kernels/fuse_batch_normalization/generic/fp32.cpp @@ -28,11 +28,19 @@ namespace arm_compute { namespace cpu { -void fused_batch_normalization_conv_f32(const ITensor *conv_weights, const ITensor *conv_bias, ITensor *fused_weights, ITensor *fused_bias, - const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window) +void fused_batch_normalization_conv_f32(const ITensor *conv_weights, + const ITensor *conv_bias, + ITensor *fused_weights, + ITensor *fused_bias, + const ITensor *bn_mean, + const ITensor *bn_var, + const ITensor *bn_beta, + const ITensor *bn_gamma, + float epsilon, + const Window &window) { - return fused_batch_normalization_conv<float32_t>(conv_weights, conv_bias, fused_weights, fused_bias, - bn_mean, bn_var, bn_beta, bn_gamma, epsilon, window); + return fused_batch_normalization_conv<float32_t>(conv_weights, conv_bias, fused_weights, fused_bias, bn_mean, + bn_var, bn_beta, bn_gamma, epsilon, window); } } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/fuse_batch_normalization/generic/impl.h b/src/cpu/kernels/fuse_batch_normalization/generic/impl.h index b9017600d6..6fa843263a 100644 --- a/src/cpu/kernels/fuse_batch_normalization/generic/impl.h +++ b/src/cpu/kernels/fuse_batch_normalization/generic/impl.h @@ -25,6 +25,7 @@ #define SRC_CORE_NEON_KERNELS_FUSE_BATCH_NORMALIZATION_GENERIC_IMPL_H #include "arm_compute/core/Helpers.h" + #include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute @@ -32,8 +33,16 @@ namespace arm_compute namespace cpu { template <typename T> -void fused_batch_normalization_conv(const ITensor *conv_weights, const ITensor *conv_bias, ITensor *fused_weights, ITensor *fused_bias, - const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window) +void fused_batch_normalization_conv(const ITensor *conv_weights, + const ITensor *conv_bias, + ITensor *fused_weights, + ITensor *fused_bias, + const ITensor *bn_mean, + const ITensor *bn_var, + const ITensor *bn_beta, + const ITensor *bn_gamma, + float epsilon, + const Window &window) { using ScalarType = T; 
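    // Note (explanatory summary of the loop below, not part of the original patch hunks): the
    // window loop folds the batch-normalisation parameters into the convolution weights and
    // bias. Per output feature map (the id[3] coordinate of the weights window) it computes:
    //   fused_weight = conv_weight * bn_gamma / sqrt(bn_var + epsilon)
    //   fused_bias   = (conv_bias - bn_mean) / sqrt(bn_var + epsilon) * bn_gamma + bn_beta
    // where bn_gamma defaults to 1, and bn_beta and conv_bias default to 0 whenever the
    // corresponding tensor is nullptr. The vector path applies the same scaling through
    // rvar_vec = 1 / sqrt(var + epsilon); the scalar tail handles the leftover elements.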
const int size = 16 / conv_weights->info()->element_size(); @@ -53,13 +62,20 @@ void fused_batch_normalization_conv(const ITensor *conv_weights, const ITensor * Iterator conv_w_in(conv_weights, win); Iterator conv_w_out(run_in_place_weights ? conv_weights : fused_weights, win); - const auto conv_bias_in = (conv_bias != nullptr ? reinterpret_cast<ScalarType *>(conv_bias->ptr_to_element(Coordinates(0, 0))) : nullptr); - auto conv_bias_out = (run_in_place_bias ? conv_bias_in : reinterpret_cast<ScalarType *>(fused_bias->ptr_to_element(Coordinates(0, 0)))); + const auto conv_bias_in = + (conv_bias != nullptr ? reinterpret_cast<ScalarType *>(conv_bias->ptr_to_element(Coordinates(0, 0))) : nullptr); + auto conv_bias_out = + (run_in_place_bias ? conv_bias_in + : reinterpret_cast<ScalarType *>(fused_bias->ptr_to_element(Coordinates(0, 0)))); const auto input_mean = reinterpret_cast<const ScalarType *>(bn_mean->ptr_to_element(Coordinates(0, 0))); const auto input_var = reinterpret_cast<const ScalarType *>(bn_var->ptr_to_element(Coordinates(0, 0))); - const auto input_gamma = (bn_gamma != nullptr) ? reinterpret_cast<const ScalarType *>(bn_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr; - const auto input_beta = (bn_beta != nullptr) ? reinterpret_cast<const ScalarType *>(bn_beta->ptr_to_element(Coordinates(0, 0))) : nullptr; + const auto input_gamma = (bn_gamma != nullptr) + ? reinterpret_cast<const ScalarType *>(bn_gamma->ptr_to_element(Coordinates(0, 0))) + : nullptr; + const auto input_beta = (bn_beta != nullptr) + ? reinterpret_cast<const ScalarType *>(bn_beta->ptr_to_element(Coordinates(0, 0))) + : nullptr; auto mean_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{}); auto var_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{}); @@ -73,59 +89,61 @@ void fused_batch_normalization_conv(const ITensor *conv_weights, const ITensor * auto gamma = ScalarType(1.0); auto beta = ScalarType(0.0); auto conv_bias_in_scalar = ScalarType(0.0); - execute_window_loop(win, [&](const Coordinates & id) - { - var = input_var[id[3]]; - if(input_gamma != nullptr) + execute_window_loop( + win, + [&](const Coordinates &id) { - gamma = input_gamma[id[3]]; - } + var = input_var[id[3]]; + if (input_gamma != nullptr) + { + gamma = input_gamma[id[3]]; + } - if((id[0] == 0) && (id[1] == 0) && (id[2] == 0)) - { - if(input_beta != nullptr) + if ((id[0] == 0) && (id[1] == 0) && (id[2] == 0)) { - beta = input_beta[id[3]]; - beta_vec = wrapper::vdup_n(beta, ExactTagType{}); + if (input_beta != nullptr) + { + beta = input_beta[id[3]]; + beta_vec = wrapper::vdup_n(beta, ExactTagType{}); + } + + // Construct vectors + mean = input_mean[id[3]]; + mean_vec = wrapper::vdup_n(mean, ExactTagType{}); + + if (conv_bias_in != nullptr) + { + conv_bias_in_scalar = conv_bias_in[id[3]]; + } + auto conv_bias_tmp_scalar = (conv_bias_in_scalar - mean) / std::sqrt(var + ScalarType(epsilon)); + conv_bias_out[id[3]] = (conv_bias_tmp_scalar * gamma) + beta; } - // Construct vectors - mean = input_mean[id[3]]; - mean_vec = wrapper::vdup_n(mean, ExactTagType{}); + int x = window_start_x; + auto conv_w_in_ptr = reinterpret_cast<const ScalarType *>(conv_w_in.ptr()); + auto conv_w_out_ptr = reinterpret_cast<ScalarType *>(conv_w_out.ptr()); + var_vec = wrapper::vdup_n(var, ExactTagType{}); + gamma_vec = wrapper::vdup_n(gamma, ExactTagType{}); + rvar_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec)); - if(conv_bias_in != nullptr) + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - conv_bias_in_scalar = conv_bias_in[id[3]]; 
- } - auto conv_bias_tmp_scalar = (conv_bias_in_scalar - mean) / std::sqrt(var + ScalarType(epsilon)); - conv_bias_out[id[3]] = (conv_bias_tmp_scalar * gamma) + beta; - } - - int x = window_start_x; - auto conv_w_in_ptr = reinterpret_cast<const ScalarType *>(conv_w_in.ptr()); - auto conv_w_out_ptr = reinterpret_cast<ScalarType *>(conv_w_out.ptr()); - var_vec = wrapper::vdup_n(var, ExactTagType{}); - gamma_vec = wrapper::vdup_n(gamma, ExactTagType{}); - rvar_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec)); - - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - auto wn = wrapper::vloadq(conv_w_in_ptr + x); - wn = wrapper::vmul(wn, rvar_vec); - wn = wrapper::vmul(wn, gamma_vec); + auto wn = wrapper::vloadq(conv_w_in_ptr + x); + wn = wrapper::vmul(wn, rvar_vec); + wn = wrapper::vmul(wn, gamma_vec); - // Store results - wrapper::vstore(conv_w_out_ptr + x, wn); - } + // Store results + wrapper::vstore(conv_w_out_ptr + x, wn); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(conv_w_out_ptr + x) = *(conv_w_in_ptr + x) / std::sqrt(var + ScalarType(epsilon)) * gamma; - } - }, - conv_w_in, conv_w_out); -} -} + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(conv_w_out_ptr + x) = *(conv_w_in_ptr + x) / std::sqrt(var + ScalarType(epsilon)) * gamma; + } + }, + conv_w_in, conv_w_out); } +} // namespace cpu +} // namespace arm_compute #endif //SRC_CORE_NEON_KERNELS_FUSE_BATCH_NORMALIZATION_GENERIC_IMPL_H diff --git a/src/cpu/kernels/fuse_batch_normalization/list.h b/src/cpu/kernels/fuse_batch_normalization/list.h index e25b1e5fed..a03dd74f78 100644 --- a/src/cpu/kernels/fuse_batch_normalization/list.h +++ b/src/cpu/kernels/fuse_batch_normalization/list.h @@ -30,15 +30,18 @@ namespace cpu { #define DECLARE_FUSE_BATCH_NORMALIZE_CONV_KERNEL(func_name) \ void func_name(const ITensor *conv_weights, const ITensor *conv_bias, ITensor *fused_weights, ITensor *fused_bias, \ - const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window) + const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, \ + float epsilon, const Window &window) #define DECLARE_FUSE_BATCH_NORMALIZE_DWC_NCHW_CONV_KERNEL(func_name) \ void func_name(const ITensor *dwc_weights, const ITensor *dwc_bias, ITensor *fused_weights, ITensor *fused_bias, \ - const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window) + const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, \ + float epsilon, const Window &window) #define DECLARE_FUSE_BATCH_NORMALIZE_DWC_NHWC_CONV_KERNEL(func_name) \ void func_name(const ITensor *dwc_weights, const ITensor *dwc_bias, ITensor *fused_weights, ITensor *fused_bias, \ - const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window) + const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, \ + float epsilon, const Window &window) DECLARE_FUSE_BATCH_NORMALIZE_CONV_KERNEL(fused_batch_normalization_conv_f16); DECLARE_FUSE_BATCH_NORMALIZE_CONV_KERNEL(fused_batch_normalization_conv_f32); @@ -50,7 +53,7 @@ DECLARE_FUSE_BATCH_NORMALIZE_DWC_NCHW_CONV_KERNEL(fused_batch_normalization_dwc_ #undef DECLARE_FUSE_BATCH_NORMALIZE_CONV_KERNEL #undef DECLARE_FUSE_BATCH_NORMALIZE_DWC_NCHW_CONV_KERNEL #undef 
DECLARE_FUSE_BATCH_NORMALIZE_DWC_NHWC_CONV_KERNEL -} -} +} // namespace cpu +} // namespace arm_compute -#endif //
\ No newline at end of file +#endif // diff --git a/src/cpu/kernels/fuse_batch_normalization/nchw/all.cpp b/src/cpu/kernels/fuse_batch_normalization/nchw/all.cpp index 1e3be8792d..c0b0dfd4dc 100644 --- a/src/cpu/kernels/fuse_batch_normalization/nchw/all.cpp +++ b/src/cpu/kernels/fuse_batch_normalization/nchw/all.cpp @@ -29,8 +29,16 @@ namespace arm_compute namespace cpu { template <typename T> -void fused_batch_normalization_dwc_nchw(const ITensor *dwc_weights, const ITensor *dwc_bias, ITensor *fused_weights, ITensor *fused_bias, - const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window) +void fused_batch_normalization_dwc_nchw(const ITensor *dwc_weights, + const ITensor *dwc_bias, + ITensor *fused_weights, + ITensor *fused_bias, + const ITensor *bn_mean, + const ITensor *bn_var, + const ITensor *bn_beta, + const ITensor *bn_gamma, + float epsilon, + const Window &window) { using ScalarType = T; const int size = 16 / dwc_weights->info()->element_size(); @@ -50,13 +58,20 @@ void fused_batch_normalization_dwc_nchw(const ITensor *dwc_weights, const ITenso Iterator dwc_w_in(dwc_weights, win); Iterator dwc_w_out(run_in_place_weights ? dwc_weights : fused_weights, win); - const auto dwc_bias_in = (dwc_bias != nullptr ? reinterpret_cast<ScalarType *>(dwc_bias->ptr_to_element(Coordinates(0, 0))) : nullptr); - auto dwc_bias_out = (run_in_place_bias ? dwc_bias_in : reinterpret_cast<ScalarType *>(fused_bias->ptr_to_element(Coordinates(0, 0)))); + const auto dwc_bias_in = + (dwc_bias != nullptr ? reinterpret_cast<ScalarType *>(dwc_bias->ptr_to_element(Coordinates(0, 0))) : nullptr); + auto dwc_bias_out = + (run_in_place_bias ? dwc_bias_in + : reinterpret_cast<ScalarType *>(fused_bias->ptr_to_element(Coordinates(0, 0)))); const auto input_mean = reinterpret_cast<const ScalarType *>(bn_mean->ptr_to_element(Coordinates(0, 0))); const auto input_var = reinterpret_cast<const ScalarType *>(bn_var->ptr_to_element(Coordinates(0, 0))); - const auto input_gamma = (bn_gamma != nullptr) ? reinterpret_cast<const ScalarType *>(bn_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr; - const auto input_beta = (bn_beta != nullptr) ? reinterpret_cast<const ScalarType *>(bn_beta->ptr_to_element(Coordinates(0, 0))) : nullptr; + const auto input_gamma = (bn_gamma != nullptr) + ? reinterpret_cast<const ScalarType *>(bn_gamma->ptr_to_element(Coordinates(0, 0))) + : nullptr; + const auto input_beta = (bn_beta != nullptr) + ? 
reinterpret_cast<const ScalarType *>(bn_beta->ptr_to_element(Coordinates(0, 0))) + : nullptr; auto mean_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{}); auto var_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{}); @@ -70,74 +85,92 @@ void fused_batch_normalization_dwc_nchw(const ITensor *dwc_weights, const ITenso auto gamma = ScalarType(1.0); auto beta = ScalarType(0.0); auto dwc_bias_in_scalar = ScalarType(0.0); - execute_window_loop(win, [&](const Coordinates & id) - { - var = input_var[id[2]]; - if(input_gamma != nullptr) + execute_window_loop( + win, + [&](const Coordinates &id) { - gamma = input_gamma[id[2]]; - } - - if(id[1] == 0) - { - mean = input_mean[id[2]]; - - // Construct vectors - mean_vec = wrapper::vdup_n(mean, ExactTagType{}); - if(input_beta != nullptr) + var = input_var[id[2]]; + if (input_gamma != nullptr) { - beta = input_beta[id[2]]; - beta_vec = wrapper::vdup_n(beta, ExactTagType{}); + gamma = input_gamma[id[2]]; } - if(dwc_bias_in != nullptr) + if (id[1] == 0) { - dwc_bias_in_scalar = dwc_bias_in[id[2]]; + mean = input_mean[id[2]]; + + // Construct vectors + mean_vec = wrapper::vdup_n(mean, ExactTagType{}); + if (input_beta != nullptr) + { + beta = input_beta[id[2]]; + beta_vec = wrapper::vdup_n(beta, ExactTagType{}); + } + + if (dwc_bias_in != nullptr) + { + dwc_bias_in_scalar = dwc_bias_in[id[2]]; + } + + auto dwc_bias_tmp_scalar = (dwc_bias_in_scalar - mean) / std::sqrt(var + ScalarType(epsilon)); + dwc_bias_out[id[2]] = (dwc_bias_tmp_scalar * gamma) + beta; } - auto dwc_bias_tmp_scalar = (dwc_bias_in_scalar - mean) / std::sqrt(var + ScalarType(epsilon)); - dwc_bias_out[id[2]] = (dwc_bias_tmp_scalar * gamma) + beta; - } + int x = window_start_x; + auto dwc_w_in_ptr = reinterpret_cast<const ScalarType *>(dwc_w_in.ptr()); + auto dwc_w_out_ptr = reinterpret_cast<ScalarType *>(dwc_w_out.ptr()); + var_vec = wrapper::vdup_n(var, ExactTagType{}); + gamma_vec = wrapper::vdup_n(gamma, ExactTagType{}); + rvar_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec)); - int x = window_start_x; - auto dwc_w_in_ptr = reinterpret_cast<const ScalarType *>(dwc_w_in.ptr()); - auto dwc_w_out_ptr = reinterpret_cast<ScalarType *>(dwc_w_out.ptr()); - var_vec = wrapper::vdup_n(var, ExactTagType{}); - gamma_vec = wrapper::vdup_n(gamma, ExactTagType{}); - rvar_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec)); - - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - auto wn = wrapper::vloadq(dwc_w_in_ptr + x); - wn = wrapper::vmul(wn, rvar_vec); - wn = wrapper::vmul(wn, gamma_vec); + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + auto wn = wrapper::vloadq(dwc_w_in_ptr + x); + wn = wrapper::vmul(wn, rvar_vec); + wn = wrapper::vmul(wn, gamma_vec); - // Store results - wrapper::vstore(dwc_w_out_ptr + x, wn); - } + // Store results + wrapper::vstore(dwc_w_out_ptr + x, wn); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - *(dwc_w_out_ptr + x) = *(dwc_w_in_ptr + x) / std::sqrt(var + ScalarType(epsilon)) * gamma; - } - }, - dwc_w_in, dwc_w_out); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + *(dwc_w_out_ptr + x) = *(dwc_w_in_ptr + x) / std::sqrt(var + ScalarType(epsilon)) * gamma; + } + }, + dwc_w_in, dwc_w_out); } -void fused_batch_normalization_dwc_nchw_f32(const ITensor *dwc_weights, const ITensor *dwc_bias, ITensor *fused_weights, ITensor *fused_bias, - const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window) 
+void fused_batch_normalization_dwc_nchw_f32(const ITensor *dwc_weights, + const ITensor *dwc_bias, + ITensor *fused_weights, + ITensor *fused_bias, + const ITensor *bn_mean, + const ITensor *bn_var, + const ITensor *bn_beta, + const ITensor *bn_gamma, + float epsilon, + const Window &window) { - return fused_batch_normalization_dwc_nchw<float32_t>(dwc_weights, dwc_bias, fused_weights, fused_bias, - bn_mean, bn_var, bn_beta, bn_gamma, epsilon, window); + return fused_batch_normalization_dwc_nchw<float32_t>(dwc_weights, dwc_bias, fused_weights, fused_bias, bn_mean, + bn_var, bn_beta, bn_gamma, epsilon, window); } #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) -void fused_batch_normalization_dwc_nchw_f16(const ITensor *dwc_weights, const ITensor *dwc_bias, ITensor *fused_weights, ITensor *fused_bias, - const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window) +void fused_batch_normalization_dwc_nchw_f16(const ITensor *dwc_weights, + const ITensor *dwc_bias, + ITensor *fused_weights, + ITensor *fused_bias, + const ITensor *bn_mean, + const ITensor *bn_var, + const ITensor *bn_beta, + const ITensor *bn_gamma, + float epsilon, + const Window &window) { - return fused_batch_normalization_dwc_nchw<float16_t>(dwc_weights, dwc_bias, fused_weights, fused_bias, - bn_mean, bn_var, bn_beta, bn_gamma, epsilon, window); + return fused_batch_normalization_dwc_nchw<float16_t>(dwc_weights, dwc_bias, fused_weights, fused_bias, bn_mean, + bn_var, bn_beta, bn_gamma, epsilon, window); } #endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp16.cpp b/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp16.cpp index 275211ff38..1d88d3b494 100644 --- a/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp16.cpp +++ b/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp16.cpp @@ -30,11 +30,19 @@ namespace arm_compute { namespace cpu { -void fused_batch_normalization_dwc_nhwc_f16(const ITensor *dwc_weights, const ITensor *dwc_bias, ITensor *fused_weights, ITensor *fused_bias, - const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window) +void fused_batch_normalization_dwc_nhwc_f16(const ITensor *dwc_weights, + const ITensor *dwc_bias, + ITensor *fused_weights, + ITensor *fused_bias, + const ITensor *bn_mean, + const ITensor *bn_var, + const ITensor *bn_beta, + const ITensor *bn_gamma, + float epsilon, + const Window &window) { - return fused_batch_normalization_dwc_nhwc<float16_t>(dwc_weights, dwc_bias, fused_weights, fused_bias, - bn_mean, bn_var, bn_beta, bn_gamma, epsilon, window); + return fused_batch_normalization_dwc_nhwc<float16_t>(dwc_weights, dwc_bias, fused_weights, fused_bias, bn_mean, + bn_var, bn_beta, bn_gamma, epsilon, window); } } // namespace cpu diff --git a/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp32.cpp b/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp32.cpp index 67169c5325..1f336bb196 100644 --- a/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp32.cpp +++ b/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp32.cpp @@ -29,11 +29,19 @@ namespace arm_compute { namespace cpu { -void fused_batch_normalization_dwc_nhwc_f32(const ITensor *dwc_weights, const ITensor *dwc_bias, ITensor *fused_weights, ITensor *fused_bias, - const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, 
const ITensor *bn_gamma, float epsilon, const Window &window) +void fused_batch_normalization_dwc_nhwc_f32(const ITensor *dwc_weights, + const ITensor *dwc_bias, + ITensor *fused_weights, + ITensor *fused_bias, + const ITensor *bn_mean, + const ITensor *bn_var, + const ITensor *bn_beta, + const ITensor *bn_gamma, + float epsilon, + const Window &window) { - return fused_batch_normalization_dwc_nhwc<float32_t>(dwc_weights, dwc_bias, fused_weights, fused_bias, - bn_mean, bn_var, bn_beta, bn_gamma, epsilon, window); + return fused_batch_normalization_dwc_nhwc<float32_t>(dwc_weights, dwc_bias, fused_weights, fused_bias, bn_mean, + bn_var, bn_beta, bn_gamma, epsilon, window); } } // namespace cpu diff --git a/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/impl.h b/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/impl.h index 6f0386276f..5b74a7aef6 100644 --- a/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/impl.h +++ b/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/impl.h @@ -25,6 +25,7 @@ #define SRC_CORE_NEON_KERNELS_FUSE_BATCH_NORMALIZATION_IMPL_H #include "arm_compute/core/Helpers.h" + #include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute @@ -32,8 +33,16 @@ namespace arm_compute namespace cpu { template <typename T> -void fused_batch_normalization_dwc_nhwc(const ITensor *dwc_weights, const ITensor *dwc_bias, ITensor *fused_weights, ITensor *fused_bias, - const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window) +void fused_batch_normalization_dwc_nhwc(const ITensor *dwc_weights, + const ITensor *dwc_bias, + ITensor *fused_weights, + ITensor *fused_bias, + const ITensor *bn_mean, + const ITensor *bn_var, + const ITensor *bn_beta, + const ITensor *bn_gamma, + float epsilon, + const Window &window) { using ScalarType = T; const int size = 16 / dwc_weights->info()->element_size(); @@ -53,13 +62,20 @@ void fused_batch_normalization_dwc_nhwc(const ITensor *dwc_weights, const ITenso Iterator dwc_w_in(dwc_weights, win); Iterator dwc_w_out(run_in_place_weights ? dwc_weights : fused_weights, win); - const auto dwc_bias_in = (dwc_bias != nullptr ? reinterpret_cast<ScalarType *>(dwc_bias->ptr_to_element(Coordinates(0, 0))) : nullptr); - auto dwc_bias_out = (run_in_place_bias ? dwc_bias_in : reinterpret_cast<ScalarType *>(fused_bias->ptr_to_element(Coordinates(0, 0)))); + const auto dwc_bias_in = + (dwc_bias != nullptr ? reinterpret_cast<ScalarType *>(dwc_bias->ptr_to_element(Coordinates(0, 0))) : nullptr); + auto dwc_bias_out = + (run_in_place_bias ? dwc_bias_in + : reinterpret_cast<ScalarType *>(fused_bias->ptr_to_element(Coordinates(0, 0)))); const auto input_mean = reinterpret_cast<const ScalarType *>(bn_mean->ptr_to_element(Coordinates(0, 0))); const auto input_var = reinterpret_cast<const ScalarType *>(bn_var->ptr_to_element(Coordinates(0, 0))); - const auto input_gamma = (bn_gamma != nullptr) ? reinterpret_cast<const ScalarType *>(bn_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr; - const auto input_beta = (bn_beta != nullptr) ? reinterpret_cast<const ScalarType *>(bn_beta->ptr_to_element(Coordinates(0, 0))) : nullptr; + const auto input_gamma = (bn_gamma != nullptr) + ? reinterpret_cast<const ScalarType *>(bn_gamma->ptr_to_element(Coordinates(0, 0))) + : nullptr; + const auto input_beta = (bn_beta != nullptr) + ? 
reinterpret_cast<const ScalarType *>(bn_beta->ptr_to_element(Coordinates(0, 0))) + : nullptr; auto mean_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{}); auto var_vec = wrapper::vdup_n(ScalarType(0), ExactTagType{}); @@ -73,81 +89,84 @@ void fused_batch_normalization_dwc_nhwc(const ITensor *dwc_weights, const ITenso auto beta = ScalarType(0.0); auto dwc_bias_in_scalar = ScalarType(0); - execute_window_loop(win, [&](const Coordinates & id) - { - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &id) { - var_vec = wrapper::vloadq(input_var + x); - if(input_gamma != nullptr) - { - gamma_vec = wrapper::vloadq(input_gamma + x); - } - - if((id[2] == 0) && (id[1] == 0)) + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - mean_vec = wrapper::vloadq(input_mean + x); - - // Construct vectors - if(input_beta != nullptr) + var_vec = wrapper::vloadq(input_var + x); + if (input_gamma != nullptr) { - beta_vec = wrapper::vloadq(input_beta + x); + gamma_vec = wrapper::vloadq(input_gamma + x); } - if(dwc_bias_in != nullptr) + if ((id[2] == 0) && (id[1] == 0)) { - dwc_bias_vec = wrapper::vloadq(dwc_bias_in + x); + mean_vec = wrapper::vloadq(input_mean + x); + + // Construct vectors + if (input_beta != nullptr) + { + beta_vec = wrapper::vloadq(input_beta + x); + } + + if (dwc_bias_in != nullptr) + { + dwc_bias_vec = wrapper::vloadq(dwc_bias_in + x); + } + + auto dwc_bias_tmp_vec = wrapper::vmul(wrapper::vsub(dwc_bias_vec, mean_vec), + wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec))); + dwc_bias_tmp_vec = wrapper::vadd(wrapper::vmul(dwc_bias_tmp_vec, gamma_vec), beta_vec); + wrapper::vstore(dwc_bias_out + x, dwc_bias_tmp_vec); } - auto dwc_bias_tmp_vec = wrapper::vmul(wrapper::vsub(dwc_bias_vec, mean_vec), wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec))); - dwc_bias_tmp_vec = wrapper::vadd(wrapper::vmul(dwc_bias_tmp_vec, gamma_vec), beta_vec); - wrapper::vstore(dwc_bias_out + x, dwc_bias_tmp_vec); - } - - auto dwc_w_in_ptr = reinterpret_cast<const ScalarType *>(dwc_w_in.ptr()); - auto dwc_w_out_ptr = reinterpret_cast<ScalarType *>(dwc_w_out.ptr()); + auto dwc_w_in_ptr = reinterpret_cast<const ScalarType *>(dwc_w_in.ptr()); + auto dwc_w_out_ptr = reinterpret_cast<ScalarType *>(dwc_w_out.ptr()); - auto wn = wrapper::vloadq(dwc_w_in_ptr + x); - rvar_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec)); - wn = wrapper::vmul(wn, rvar_vec); - wn = wrapper::vmul(wn, gamma_vec); + auto wn = wrapper::vloadq(dwc_w_in_ptr + x); + rvar_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec)); + wn = wrapper::vmul(wn, rvar_vec); + wn = wrapper::vmul(wn, gamma_vec); - // Store results - wrapper::vstore(dwc_w_out_ptr + x, wn); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - auto var = input_var[x]; - if(input_gamma != nullptr) - { - gamma = input_gamma[x]; + // Store results + wrapper::vstore(dwc_w_out_ptr + x, wn); } - if(id[2] == 0 && id[1] == 0) + // Compute left-over elements + for (; x < window_end_x; ++x) { - auto mean = input_mean[x]; - if(input_beta != nullptr) + auto var = input_var[x]; + if (input_gamma != nullptr) { - beta = input_beta[x]; + gamma = input_gamma[x]; } - if(dwc_bias_in != nullptr) + + if (id[2] == 0 && id[1] == 0) { - dwc_bias_in_scalar = dwc_bias_in[x]; + auto mean = input_mean[x]; + if (input_beta != nullptr) + { + beta = input_beta[x]; + } + if (dwc_bias_in != nullptr) + { + dwc_bias_in_scalar = 
dwc_bias_in[x]; + } + + auto dwc_bias_tmp_scalar = (dwc_bias_in_scalar - mean) / std::sqrt(var + ScalarType(epsilon)); + dwc_bias_out[x] = (dwc_bias_tmp_scalar * gamma) + beta; } - auto dwc_bias_tmp_scalar = (dwc_bias_in_scalar - mean) / std::sqrt(var + ScalarType(epsilon)); - dwc_bias_out[x] = (dwc_bias_tmp_scalar * gamma) + beta; - } - - const auto dwc_w_in_ptr = reinterpret_cast<const ScalarType *>(dwc_w_in.ptr()); - auto dwc_w_out_ptr = reinterpret_cast<ScalarType *>(dwc_w_out.ptr()); + const auto dwc_w_in_ptr = reinterpret_cast<const ScalarType *>(dwc_w_in.ptr()); + auto dwc_w_out_ptr = reinterpret_cast<ScalarType *>(dwc_w_out.ptr()); - *(dwc_w_out_ptr + x) = *(dwc_w_in_ptr + x) / std::sqrt(var + ScalarType(epsilon)) * gamma; - } - }, - dwc_w_in, dwc_w_out); + *(dwc_w_out_ptr + x) = *(dwc_w_in_ptr + x) / std::sqrt(var + ScalarType(epsilon)) * gamma; + } + }, + dwc_w_in, dwc_w_out); } } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/gemm_matrix_add/generic/neon/fp16.cpp b/src/cpu/kernels/gemm_matrix_add/generic/neon/fp16.cpp index 505a37174e..4d7507a5da 100644 --- a/src/cpu/kernels/gemm_matrix_add/generic/neon/fp16.cpp +++ b/src/cpu/kernels/gemm_matrix_add/generic/neon/fp16.cpp @@ -48,30 +48,32 @@ void matrix_addition_f16(const ITensor *src, ITensor *dst, const Window &window, Iterator in(src, win); Iterator out(dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast<const float16_t *>(in.ptr()); - const auto out_ptr = reinterpret_cast<float16_t *>(out.ptr()); - - int x = window_start_x; - for(; x < (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - float16x8x2_t alpha_ab = vld2q_f16(out_ptr + x); - const float16x8x2_t c = vld2q_f16(in_ptr + x); - // Multiply matrix C by its weight and accumulate - alpha_ab.val[0] = vaddq_f16(alpha_ab.val[0], vmulq_f16(c.val[0], beta_f16)); - alpha_ab.val[1] = vaddq_f16(alpha_ab.val[1], vmulq_f16(c.val[1], beta_f16)); + const auto in_ptr = reinterpret_cast<const float16_t *>(in.ptr()); + const auto out_ptr = reinterpret_cast<float16_t *>(out.ptr()); - vst2q_f16(out_ptr + x, alpha_ab); - } + int x = window_start_x; + for (; x < (window_end_x - window_step_x); x += window_step_x) + { + float16x8x2_t alpha_ab = vld2q_f16(out_ptr + x); + const float16x8x2_t c = vld2q_f16(in_ptr + x); + // Multiply matrix C by its weight and accumulate + alpha_ab.val[0] = vaddq_f16(alpha_ab.val[0], vmulq_f16(c.val[0], beta_f16)); + alpha_ab.val[1] = vaddq_f16(alpha_ab.val[1], vmulq_f16(c.val[1], beta_f16)); - // Left-over loop - for(; x < window_end_x; ++x) - { - *(out_ptr + x) += *(in_ptr + x) * static_cast<float16_t>(beta); - } - }, - in, out); + vst2q_f16(out_ptr + x, alpha_ab); + } + + // Left-over loop + for (; x < window_end_x; ++x) + { + *(out_ptr + x) += *(in_ptr + x) * static_cast<float16_t>(beta); + } + }, + in, out); } } // namespace void neon_fp16_gemm_matrix_add(const ITensor *src, ITensor *dst, const Window &window, float beta) diff --git a/src/cpu/kernels/gemm_matrix_add/generic/neon/impl.cpp b/src/cpu/kernels/gemm_matrix_add/generic/neon/impl.cpp index dd0384ca13..47de0f3928 100644 --- a/src/cpu/kernels/gemm_matrix_add/generic/neon/impl.cpp +++ b/src/cpu/kernels/gemm_matrix_add/generic/neon/impl.cpp @@ -23,6 +23,7 @@ */ #include "src/cpu/kernels/gemm_matrix_add/generic/neon/impl.h" + #include <arm_neon.h> namespace arm_compute @@ -44,33 +45,35 @@ void matrix_addition_f32(const ITensor *src, ITensor *dst, const Window 
&window, Iterator in(src, win); Iterator out(dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast<const float *>(in.ptr()); - const auto out_ptr = reinterpret_cast<float *>(out.ptr()); - - int x = window_start_x; - for(; x < (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - float32x4x4_t alpha_ab = vld4q_f32(out_ptr + x); - const float32x4x4_t c = vld4q_f32(in_ptr + x); + const auto in_ptr = reinterpret_cast<const float *>(in.ptr()); + const auto out_ptr = reinterpret_cast<float *>(out.ptr()); - // Multiply matrix C by its weight and accumulate - alpha_ab.val[0] = vmlaq_f32(alpha_ab.val[0], c.val[0], beta_f32); - alpha_ab.val[1] = vmlaq_f32(alpha_ab.val[1], c.val[1], beta_f32); - alpha_ab.val[2] = vmlaq_f32(alpha_ab.val[2], c.val[2], beta_f32); - alpha_ab.val[3] = vmlaq_f32(alpha_ab.val[3], c.val[3], beta_f32); + int x = window_start_x; + for (; x < (window_end_x - window_step_x); x += window_step_x) + { + float32x4x4_t alpha_ab = vld4q_f32(out_ptr + x); + const float32x4x4_t c = vld4q_f32(in_ptr + x); - vst4q_f32(out_ptr + x, alpha_ab); - } + // Multiply matrix C by its weight and accumulate + alpha_ab.val[0] = vmlaq_f32(alpha_ab.val[0], c.val[0], beta_f32); + alpha_ab.val[1] = vmlaq_f32(alpha_ab.val[1], c.val[1], beta_f32); + alpha_ab.val[2] = vmlaq_f32(alpha_ab.val[2], c.val[2], beta_f32); + alpha_ab.val[3] = vmlaq_f32(alpha_ab.val[3], c.val[3], beta_f32); - // Left-over loop - for(; x < window_end_x; ++x) - { - *(out_ptr + x) += *(in_ptr + x) * beta; - } - }, - in, out); + vst4q_f32(out_ptr + x, alpha_ab); + } + + // Left-over loop + for (; x < window_end_x; ++x) + { + *(out_ptr + x) += *(in_ptr + x) * beta; + } + }, + in, out); } } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp16.cpp b/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp16.cpp index 8fd79f9287..60fda511e3 100644 --- a/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp16.cpp +++ b/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp16.cpp @@ -32,7 +32,8 @@ namespace arm_compute { namespace cpu { -void vector_matrix_multiply_f16(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha) +void vector_matrix_multiply_f16( + const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha) { const auto width_matrix_b = static_cast<int>(dst->info()->dimension(0)); const auto in_b_stride = static_cast<int>(rhs->info()->strides_in_bytes()[1] / rhs->info()->element_size()); @@ -42,7 +43,8 @@ void vector_matrix_multiply_f16(const ITensor *lhs, const ITensor *rhs, ITensor const int window_start_x = 32 * info.thread_id; const int window_step_x = 32 * info.num_threads; const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x; - ARM_COMPUTE_ERROR_ON_MSG((window_end_x - window_start_x) % window_step_x, " (window_end_x - window_start_x) must be multiple of window_step_x"); + ARM_COMPUTE_ERROR_ON_MSG((window_end_x - window_start_x) % window_step_x, + " (window_end_x - window_start_x) must be multiple of window_step_x"); Window win_out(window); win_out.set(Window::DimX, Window::Dimension(0, 1, 1)); @@ -55,7 +57,7 @@ void vector_matrix_multiply_f16(const ITensor *lhs, const ITensor *rhs, ITensor Window win_b; // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 // This scenario 
can happen when the the matrix multiplication is used to perform a convolution operation - if(rhs->info()->num_dimensions() >= 3) + if (rhs->info()->num_dimensions() >= 3) { win_b = window; } @@ -70,169 +72,172 @@ void vector_matrix_multiply_f16(const ITensor *lhs, const ITensor *rhs, ITensor const float16x8_t alpha_f16 = vdupq_n_f16(alpha); - execute_window_loop(win_out, [&](const Coordinates &) - { - int x = window_start_x; - // Here we don't check for x lower equal than (window_end_x - window_step_x) because of - // window_end_x is computed above which may cause out-of-bound writes to the dst. - for(; x < (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win_out, + [&](const Coordinates &) { - if(x > width_matrix_b) + int x = window_start_x; + // Here we don't check for x lower equal than (window_end_x - window_step_x) because of + // window_end_x is computed above which may cause out-of-bound writes to the dst. + for (; x < (window_end_x - window_step_x); x += window_step_x) { - return; - } - - auto matrix_b = reinterpret_cast<const float16_t *>(inb.ptr()) + x; + if (x > width_matrix_b) + { + return; + } - float16x8_t acc0 = vdupq_n_f16(0.f); - float16x8_t acc1 = vdupq_n_f16(0.f); - float16x8_t acc2 = vdupq_n_f16(0.f); - float16x8_t acc3 = vdupq_n_f16(0.f); + auto matrix_b = reinterpret_cast<const float16_t *>(inb.ptr()) + x; - auto vec_a = reinterpret_cast<const float16_t *>(ina.ptr()); - const float16_t *vec_a_end_addr = vec_a + num_elems_vec_a; - for(; vec_a <= (vec_a_end_addr - 4);) - { - const float16x4_t a0l = vld1_f16(vec_a); - - float16x8_t b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride); - float16x8_t b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride); - float16x8_t b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride); - float16x8_t b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride); - float16x8_t b10 = vld1q_f16(matrix_b + 0 + 1 * in_b_stride); - float16x8_t b11 = vld1q_f16(matrix_b + 8 + 1 * in_b_stride); - float16x8_t b12 = vld1q_f16(matrix_b + 16 + 1 * in_b_stride); - float16x8_t b13 = vld1q_f16(matrix_b + 24 + 1 * in_b_stride); - - acc0 = vaddq_f16(acc0, vmulq_lane_f16(b00, a0l, 0)); - acc1 = vaddq_f16(acc1, vmulq_lane_f16(b01, a0l, 0)); - acc2 = vaddq_f16(acc2, vmulq_lane_f16(b02, a0l, 0)); - acc3 = vaddq_f16(acc3, vmulq_lane_f16(b03, a0l, 0)); - acc0 = vaddq_f16(acc0, vmulq_lane_f16(b10, a0l, 1)); - acc1 = vaddq_f16(acc1, vmulq_lane_f16(b11, a0l, 1)); - acc2 = vaddq_f16(acc2, vmulq_lane_f16(b12, a0l, 1)); - acc3 = vaddq_f16(acc3, vmulq_lane_f16(b13, a0l, 1)); - - matrix_b += 2 * in_b_stride; - - b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride); - b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride); - b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride); - b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride); - b10 = vld1q_f16(matrix_b + 0 + 1 * in_b_stride); - b11 = vld1q_f16(matrix_b + 8 + 1 * in_b_stride); - b12 = vld1q_f16(matrix_b + 16 + 1 * in_b_stride); - b13 = vld1q_f16(matrix_b + 24 + 1 * in_b_stride); - - acc0 = vaddq_f16(acc0, vmulq_lane_f16(b00, a0l, 2)); - acc1 = vaddq_f16(acc1, vmulq_lane_f16(b01, a0l, 2)); - acc2 = vaddq_f16(acc2, vmulq_lane_f16(b02, a0l, 2)); - acc3 = vaddq_f16(acc3, vmulq_lane_f16(b03, a0l, 2)); - acc0 = vaddq_f16(acc0, vmulq_lane_f16(b10, a0l, 3)); - acc1 = vaddq_f16(acc1, vmulq_lane_f16(b11, a0l, 3)); - acc2 = vaddq_f16(acc2, vmulq_lane_f16(b12, a0l, 3)); - acc3 = vaddq_f16(acc3, vmulq_lane_f16(b13, a0l, 3)); - - vec_a += 4; - matrix_b += 2 * in_b_stride; - } + float16x8_t acc0 = vdupq_n_f16(0.f); + float16x8_t acc1 = 
vdupq_n_f16(0.f); + float16x8_t acc2 = vdupq_n_f16(0.f); + float16x8_t acc3 = vdupq_n_f16(0.f); - for(; vec_a < vec_a_end_addr; ++vec_a) - { - const float16_t a0 = *vec_a; - const float16x8_t b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride); - const float16x8_t b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride); - const float16x8_t b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride); - const float16x8_t b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride); - - acc0 = vaddq_f16(acc0, vmulq_n_f16(b00, a0)); - acc1 = vaddq_f16(acc1, vmulq_n_f16(b01, a0)); - acc2 = vaddq_f16(acc2, vmulq_n_f16(b02, a0)); - acc3 = vaddq_f16(acc3, vmulq_n_f16(b03, a0)); - - matrix_b += in_b_stride; - } + auto vec_a = reinterpret_cast<const float16_t *>(ina.ptr()); + const float16_t *vec_a_end_addr = vec_a + num_elems_vec_a; + for (; vec_a <= (vec_a_end_addr - 4);) + { + const float16x4_t a0l = vld1_f16(vec_a); + + float16x8_t b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride); + float16x8_t b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride); + float16x8_t b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride); + float16x8_t b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride); + float16x8_t b10 = vld1q_f16(matrix_b + 0 + 1 * in_b_stride); + float16x8_t b11 = vld1q_f16(matrix_b + 8 + 1 * in_b_stride); + float16x8_t b12 = vld1q_f16(matrix_b + 16 + 1 * in_b_stride); + float16x8_t b13 = vld1q_f16(matrix_b + 24 + 1 * in_b_stride); + + acc0 = vaddq_f16(acc0, vmulq_lane_f16(b00, a0l, 0)); + acc1 = vaddq_f16(acc1, vmulq_lane_f16(b01, a0l, 0)); + acc2 = vaddq_f16(acc2, vmulq_lane_f16(b02, a0l, 0)); + acc3 = vaddq_f16(acc3, vmulq_lane_f16(b03, a0l, 0)); + acc0 = vaddq_f16(acc0, vmulq_lane_f16(b10, a0l, 1)); + acc1 = vaddq_f16(acc1, vmulq_lane_f16(b11, a0l, 1)); + acc2 = vaddq_f16(acc2, vmulq_lane_f16(b12, a0l, 1)); + acc3 = vaddq_f16(acc3, vmulq_lane_f16(b13, a0l, 1)); + + matrix_b += 2 * in_b_stride; + + b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride); + b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride); + b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride); + b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride); + b10 = vld1q_f16(matrix_b + 0 + 1 * in_b_stride); + b11 = vld1q_f16(matrix_b + 8 + 1 * in_b_stride); + b12 = vld1q_f16(matrix_b + 16 + 1 * in_b_stride); + b13 = vld1q_f16(matrix_b + 24 + 1 * in_b_stride); + + acc0 = vaddq_f16(acc0, vmulq_lane_f16(b00, a0l, 2)); + acc1 = vaddq_f16(acc1, vmulq_lane_f16(b01, a0l, 2)); + acc2 = vaddq_f16(acc2, vmulq_lane_f16(b02, a0l, 2)); + acc3 = vaddq_f16(acc3, vmulq_lane_f16(b03, a0l, 2)); + acc0 = vaddq_f16(acc0, vmulq_lane_f16(b10, a0l, 3)); + acc1 = vaddq_f16(acc1, vmulq_lane_f16(b11, a0l, 3)); + acc2 = vaddq_f16(acc2, vmulq_lane_f16(b12, a0l, 3)); + acc3 = vaddq_f16(acc3, vmulq_lane_f16(b13, a0l, 3)); + + vec_a += 4; + matrix_b += 2 * in_b_stride; + } - // Multiply by the weight of matrix product (alpha) - if(multiply_alpha) - { - acc0 = vmulq_f16(acc0, alpha_f16); - acc1 = vmulq_f16(acc1, alpha_f16); - acc2 = vmulq_f16(acc2, alpha_f16); - acc3 = vmulq_f16(acc3, alpha_f16); - } + for (; vec_a < vec_a_end_addr; ++vec_a) + { + const float16_t a0 = *vec_a; + const float16x8_t b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride); + const float16x8_t b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride); + const float16x8_t b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride); + const float16x8_t b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride); + + acc0 = vaddq_f16(acc0, vmulq_n_f16(b00, a0)); + acc1 = vaddq_f16(acc1, vmulq_n_f16(b01, a0)); + acc2 = vaddq_f16(acc2, vmulq_n_f16(b02, a0)); + acc3 = vaddq_f16(acc3, vmulq_n_f16(b03, 
a0)); + + matrix_b += in_b_stride; + } - auto vec_out = reinterpret_cast<float16_t *>(out.ptr()) + x; + // Multiply by the weight of matrix product (alpha) + if (multiply_alpha) + { + acc0 = vmulq_f16(acc0, alpha_f16); + acc1 = vmulq_f16(acc1, alpha_f16); + acc2 = vmulq_f16(acc2, alpha_f16); + acc3 = vmulq_f16(acc3, alpha_f16); + } - vst1q_f16(vec_out + 0, acc0); - vst1q_f16(vec_out + 8, acc1); - vst1q_f16(vec_out + 16, acc2); - vst1q_f16(vec_out + 24, acc3); - } + auto vec_out = reinterpret_cast<float16_t *>(out.ptr()) + x; - for(; x < window_end_x; ++x) - { - if(x > width_matrix_b) - { - return; + vst1q_f16(vec_out + 0, acc0); + vst1q_f16(vec_out + 8, acc1); + vst1q_f16(vec_out + 16, acc2); + vst1q_f16(vec_out + 24, acc3); } - auto matrix_b = reinterpret_cast<const float16_t *>(inb.ptr()) + x; + for (; x < window_end_x; ++x) + { + if (x > width_matrix_b) + { + return; + } - float16x4_t vacc = vdup_n_f16(0.f); + auto matrix_b = reinterpret_cast<const float16_t *>(inb.ptr()) + x; - auto vec_a = reinterpret_cast<const float16_t *>(ina.ptr()); - const float16_t *vec_a_end_addr = vec_a + num_elems_vec_a; - for(; vec_a <= (vec_a_end_addr - 4); vec_a += 4) - { - const float16x4_t a0l = vld1_f16(vec_a); + float16x4_t vacc = vdup_n_f16(0.f); - const float16x4_t b_col = + auto vec_a = reinterpret_cast<const float16_t *>(ina.ptr()); + const float16_t *vec_a_end_addr = vec_a + num_elems_vec_a; + for (; vec_a <= (vec_a_end_addr - 4); vec_a += 4) { - *(matrix_b + 0 * in_b_stride), - *(matrix_b + 1 * in_b_stride), - *(matrix_b + 2 * in_b_stride), - *(matrix_b + 3 * in_b_stride), - }; + const float16x4_t a0l = vld1_f16(vec_a); - vacc = vadd_f16(vacc, vmul_f16(a0l, b_col)); + const float16x4_t b_col = { + *(matrix_b + 0 * in_b_stride), + *(matrix_b + 1 * in_b_stride), + *(matrix_b + 2 * in_b_stride), + *(matrix_b + 3 * in_b_stride), + }; - matrix_b += 4 * in_b_stride; - } + vacc = vadd_f16(vacc, vmul_f16(a0l, b_col)); - float16_t acc = vget_lane_f16(vacc, 0) + vget_lane_f16(vacc, 1) + vget_lane_f16(vacc, 2) + vget_lane_f16(vacc, 3); + matrix_b += 4 * in_b_stride; + } - for(; vec_a < vec_a_end_addr; ++vec_a) - { - const float16_t a0 = *vec_a; - const float16_t b00 = *matrix_b; + float16_t acc = + vget_lane_f16(vacc, 0) + vget_lane_f16(vacc, 1) + vget_lane_f16(vacc, 2) + vget_lane_f16(vacc, 3); - acc += b00 * a0; + for (; vec_a < vec_a_end_addr; ++vec_a) + { + const float16_t a0 = *vec_a; + const float16_t b00 = *matrix_b; - matrix_b += in_b_stride; - } + acc += b00 * a0; - // Multiply by the weight of matrix product (alpha) - if(multiply_alpha) - { - acc *= static_cast<float16_t>(alpha); - } + matrix_b += in_b_stride; + } - auto vec_out = reinterpret_cast<float16_t *>(out.ptr()) + x; + // Multiply by the weight of matrix product (alpha) + if (multiply_alpha) + { + acc *= static_cast<float16_t>(alpha); + } - *(vec_out) = acc; - } - }, - ina, inb, out); + auto vec_out = reinterpret_cast<float16_t *>(out.ptr()) + x; + + *(vec_out) = acc; + } + }, + ina, inb, out); } -void matrix_matrix_multiply_f16(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha) +void matrix_matrix_multiply_f16( + const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha) { ARM_COMPUTE_UNUSED(info); - const int out_width = static_cast<int>(dst->info()->dimension(0)); - const int out_height = static_cast<int>(dst->info()->dimension(1)); - const size_t in_b_stride = rhs->info()->strides_in_bytes()[1] / 
data_size_from_type(rhs->info()->data_type()); - const size_t out_stride = dst->info()->strides_in_bytes()[1] / data_size_from_type(dst->info()->data_type()); + const int out_width = static_cast<int>(dst->info()->dimension(0)); + const int out_height = static_cast<int>(dst->info()->dimension(1)); + const size_t in_b_stride = rhs->info()->strides_in_bytes()[1] / data_size_from_type(rhs->info()->data_type()); + const size_t out_stride = dst->info()->strides_in_bytes()[1] / data_size_from_type(dst->info()->data_type()); const int num_elems_matrix_b_x = rhs->info()->dimension(0); // Set step_x and step_y for matrix A. Scale by a factor of 4 the Y range as the input interleaved matrix A has 4 times less the rows of the dst matrix @@ -243,7 +248,7 @@ void matrix_matrix_multiply_f16(const ITensor *lhs, const ITensor *rhs, ITensor Window win_b; // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 // This scenario can happen when the the matrix multiplication is used to perform a convolution operation - if(rhs->info()->num_dimensions() >= 3) + if (rhs->info()->num_dimensions() >= 3) { win_b = window; } @@ -259,22 +264,16 @@ void matrix_matrix_multiply_f16(const ITensor *lhs, const ITensor *rhs, ITensor const float16x8_t alpha_f16 = vdupq_n_f16(alpha); - execute_window_loop(window, [&](const Coordinates & id) - { - const auto *mtx_a0 = reinterpret_cast<const float16_t *>(ina.ptr()); - const auto *mtx_b0 = reinterpret_cast<const float16_t *>(inb.ptr()); - auto *mtx_out = reinterpret_cast<float16_t *>(out.ptr()); - float16x8x4_t c = + execute_window_loop( + window, + [&](const Coordinates &id) { - { - vdupq_n_f16(0.f), - vdupq_n_f16(0.f), - vdupq_n_f16(0.f), - vdupq_n_f16(0.f) - } - }; + const auto *mtx_a0 = reinterpret_cast<const float16_t *>(ina.ptr()); + const auto *mtx_b0 = reinterpret_cast<const float16_t *>(inb.ptr()); + auto *mtx_out = reinterpret_cast<float16_t *>(out.ptr()); + float16x8x4_t c = {{vdupq_n_f16(0.f), vdupq_n_f16(0.f), vdupq_n_f16(0.f), vdupq_n_f16(0.f)}}; - /* + /* This kernel puts the values in a 4x4 block of Matrix A on the same row (Interleaved values) |a00 a01 a02 a03 | a04 a05 a06 a07| |a10 a11 a12 a13 | a14 a15 a16 a17| @@ -302,111 +301,118 @@ void matrix_matrix_multiply_f16(const ITensor *lhs, const ITensor *rhs, ITensor The size of the dst tensor's XY-plane must be the following shape [ width * 8, height / 8 ]. All other dimensions must have the same size. 
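            In the vectorised loop below, one iteration consumes 16 interleaved fp16 values of
            Matrix A (two float16x8_t loads) and 32 transposed values of Matrix B (four
            float16x8_t loads), performing four rank-1 updates of the 4x8 accumulator:
                c.val[r] += q_k * a(r, k)   for r = 0..3 and four consecutive k
            before alpha is (optionally) applied and the block is written back to the dst matrix,
            with the left-over column/row handling shown at the end of the lambda.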
*/ - const float16_t *mtx_b0_end_addr = mtx_b0 + num_elems_matrix_b_x; - - for(; mtx_b0 <= (mtx_b0_end_addr - 32);) - - { - const float16x8_t p00 = vld1q_f16(mtx_a0); - const float16x8_t p02 = vld1q_f16(mtx_a0 + 8); - - const float16x8_t q00 = vld1q_f16(mtx_b0); - const float16x8_t q02 = vld1q_f16(mtx_b0 + 8); - const float16x8_t q04 = vld1q_f16(mtx_b0 + 16); - const float16x8_t q06 = vld1q_f16(mtx_b0 + 24); - - c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q00, vgetq_lane_f16(p00, 0))); - c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q00, vgetq_lane_f16(p00, 1))); - c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q00, vgetq_lane_f16(p00, 2))); - c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q00, vgetq_lane_f16(p00, 3))); + const float16_t *mtx_b0_end_addr = mtx_b0 + num_elems_matrix_b_x; - c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q02, vgetq_lane_f16(p00, 4))); - c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q02, vgetq_lane_f16(p00, 5))); - c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q02, vgetq_lane_f16(p00, 6))); - c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q02, vgetq_lane_f16(p00, 7))); + for (; mtx_b0 <= (mtx_b0_end_addr - 32);) - c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q04, vgetq_lane_f16(p02, 0))); - c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q04, vgetq_lane_f16(p02, 1))); - c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q04, vgetq_lane_f16(p02, 2))); - c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q04, vgetq_lane_f16(p02, 3))); - - c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q06, vgetq_lane_f16(p02, 4))); - c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q06, vgetq_lane_f16(p02, 5))); - c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q06, vgetq_lane_f16(p02, 6))); - c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q06, vgetq_lane_f16(p02, 7))); - - mtx_a0 += 16; - mtx_b0 += 32; - } + { + const float16x8_t p00 = vld1q_f16(mtx_a0); + const float16x8_t p02 = vld1q_f16(mtx_a0 + 8); + + const float16x8_t q00 = vld1q_f16(mtx_b0); + const float16x8_t q02 = vld1q_f16(mtx_b0 + 8); + const float16x8_t q04 = vld1q_f16(mtx_b0 + 16); + const float16x8_t q06 = vld1q_f16(mtx_b0 + 24); + + c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q00, vgetq_lane_f16(p00, 0))); + c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q00, vgetq_lane_f16(p00, 1))); + c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q00, vgetq_lane_f16(p00, 2))); + c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q00, vgetq_lane_f16(p00, 3))); + + c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q02, vgetq_lane_f16(p00, 4))); + c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q02, vgetq_lane_f16(p00, 5))); + c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q02, vgetq_lane_f16(p00, 6))); + c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q02, vgetq_lane_f16(p00, 7))); + + c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q04, vgetq_lane_f16(p02, 0))); + c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q04, vgetq_lane_f16(p02, 1))); + c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q04, vgetq_lane_f16(p02, 2))); + c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q04, vgetq_lane_f16(p02, 3))); + + c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q06, vgetq_lane_f16(p02, 4))); + c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q06, vgetq_lane_f16(p02, 5))); + c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q06, vgetq_lane_f16(p02, 6))); + c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q06, vgetq_lane_f16(p02, 7))); + + mtx_a0 += 16; + mtx_b0 += 32; + } - for(; mtx_b0 < mtx_b0_end_addr;) + for (; mtx_b0 < mtx_b0_end_addr;) - { - const float16x4_t p00 = vld1_f16(mtx_a0); - const float16x8_t q00 = vld1q_f16(mtx_b0); + { + const float16x4_t p00 = vld1_f16(mtx_a0); + 
const float16x8_t q00 = vld1q_f16(mtx_b0); - c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q00, vget_lane_f16(p00, 0))); - c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q00, vget_lane_f16(p00, 1))); - c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q00, vget_lane_f16(p00, 2))); - c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q00, vget_lane_f16(p00, 3))); + c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q00, vget_lane_f16(p00, 0))); + c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q00, vget_lane_f16(p00, 1))); + c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q00, vget_lane_f16(p00, 2))); + c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q00, vget_lane_f16(p00, 3))); - mtx_a0 += 4; - mtx_b0 += 8; - } + mtx_a0 += 4; + mtx_b0 += 8; + } - if(multiply_alpha) - { - c.val[0] = vmulq_f16(c.val[0], alpha_f16); - c.val[1] = vmulq_f16(c.val[1], alpha_f16); - c.val[2] = vmulq_f16(c.val[2], alpha_f16); - c.val[3] = vmulq_f16(c.val[3], alpha_f16); - } + if (multiply_alpha) + { + c.val[0] = vmulq_f16(c.val[0], alpha_f16); + c.val[1] = vmulq_f16(c.val[1], alpha_f16); + c.val[2] = vmulq_f16(c.val[2], alpha_f16); + c.val[3] = vmulq_f16(c.val[3], alpha_f16); + } - if(id.x() < (out_width - 8)) - { - vst1q_f16(mtx_out, c.val[0]); - if(id.y() + 1 < out_height) + if (id.x() < (out_width - 8)) { - vst1q_f16(mtx_out + 1 * out_stride, c.val[1]); - if(id.y() + 2 < out_height) + vst1q_f16(mtx_out, c.val[0]); + if (id.y() + 1 < out_height) { - vst1q_f16(mtx_out + 2 * out_stride, c.val[2]); - if(id.y() + 3 < out_height) + vst1q_f16(mtx_out + 1 * out_stride, c.val[1]); + if (id.y() + 2 < out_height) { - vst1q_f16(mtx_out + 3 * out_stride, c.val[3]); + vst1q_f16(mtx_out + 2 * out_stride, c.val[2]); + if (id.y() + 3 < out_height) + { + vst1q_f16(mtx_out + 3 * out_stride, c.val[3]); + } } } } - } - else - { - // Left-over columns - const int columns_left = out_width - id.x(); - for(int x = 0; x < columns_left; ++x) + else { - *(mtx_out + x) = c.val[0][x]; - if(id.y() + 1 < out_height) + // Left-over columns + const int columns_left = out_width - id.x(); + for (int x = 0; x < columns_left; ++x) { - *(mtx_out + x + 1 * out_stride) = c.val[1][x]; - if(id.y() + 2 < out_height) + *(mtx_out + x) = c.val[0][x]; + if (id.y() + 1 < out_height) { - *(mtx_out + x + 2 * out_stride) = c.val[2][x]; - if(id.y() + 3 < out_height) + *(mtx_out + x + 1 * out_stride) = c.val[1][x]; + if (id.y() + 2 < out_height) { - *(mtx_out + x + 3 * out_stride) = c.val[3][x]; + *(mtx_out + x + 2 * out_stride) = c.val[2][x]; + if (id.y() + 3 < out_height) + { + *(mtx_out + x + 3 * out_stride) = c.val[3][x]; + } } } } } - } - }, - ina, inb, out); + }, + ina, inb, out); } -void neon_fp16_gemm_matrix_mul(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha, const bool is_dst_vector) +void neon_fp16_gemm_matrix_mul(const ITensor *lhs, + const ITensor *rhs, + ITensor *dst, + const Window &window, + const ThreadInfo &info, + float alpha, + const bool is_dst_vector) { - return (is_dst_vector) ? vector_matrix_multiply_f16(lhs, rhs, dst, window, info, alpha) : matrix_matrix_multiply_f16(lhs, rhs, dst, window, info, alpha); + return (is_dst_vector) ? 
vector_matrix_multiply_f16(lhs, rhs, dst, window, info, alpha) + : matrix_matrix_multiply_f16(lhs, rhs, dst, window, info, alpha); } -} // namespce cpu +} // namespace cpu } // namespace arm_compute #endif //__ARM_FEATURE_FP16_VECTOR_ARITHMETIC diff --git a/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp32.cpp b/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp32.cpp index 9c1f6f3c0f..e12a312280 100644 --- a/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp32.cpp +++ b/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp32.cpp @@ -28,9 +28,16 @@ namespace arm_compute { namespace cpu { -void neon_fp32_gemm_matrix_mul(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha, const bool is_dst_vector) +void neon_fp32_gemm_matrix_mul(const ITensor *lhs, + const ITensor *rhs, + ITensor *dst, + const Window &window, + const ThreadInfo &info, + float alpha, + const bool is_dst_vector) { - return (is_dst_vector) ? vector_matrix_multiply_f32(lhs, rhs, dst, window, info, alpha) : matrix_matrix_multiply_f32(lhs, rhs, dst, window, info, alpha); + return (is_dst_vector) ? vector_matrix_multiply_f32(lhs, rhs, dst, window, info, alpha) + : matrix_matrix_multiply_f32(lhs, rhs, dst, window, info, alpha); } -} // namespce cpu -} // namespace arm_compute
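Both dispatchers above keep the same contract after the signature reflow: is_dst_vector selects the single-row (vector x matrix) path, otherwise the blocked matrix x matrix kernel runs. As a rough scalar reference of what either path produces (a minimal sketch with illustrative names, not part of the library; the real matrix path additionally expects A pre-interleaved 4x4 and B pre-transposed 1xW):

void gemm_reference(const float *a, const float *b, float *dst, int m, int n, int k, float alpha)
{
    // dst = alpha * (a * b); a is m x k, b is k x n, all row-major.
    for (int i = 0; i < m; ++i)
    {
        for (int j = 0; j < n; ++j)
        {
            float acc = 0.f;
            for (int l = 0; l < k; ++l)
            {
                acc += a[i * k + l] * b[l * n + j];
            }
            dst[i * n + j] = alpha * acc;
        }
    }
}

The vector path is simply the m == 1 case: it reads B directly through its row stride instead of relying on the reshaped layouts.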
\ No newline at end of file +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.cpp b/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.cpp index 0051d3d9dc..404d070a37 100644 --- a/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.cpp +++ b/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.cpp @@ -23,6 +23,7 @@ */ #include "src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.h" + #include "src/core/utils/helpers/float_ops.h" #include <arm_neon.h> @@ -31,10 +32,12 @@ namespace arm_compute { namespace cpu { -void vector_matrix_multiply_f32(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha) +void vector_matrix_multiply_f32( + const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha) { - const auto width_matrix_b = static_cast<int>(dst->info()->dimension(0)); - const auto in_b_stride = static_cast<int>(rhs->info()->strides_in_bytes()[1] / data_size_from_type(rhs->info()->data_type())); + const auto width_matrix_b = static_cast<int>(dst->info()->dimension(0)); + const auto in_b_stride = + static_cast<int>(rhs->info()->strides_in_bytes()[1] / data_size_from_type(rhs->info()->data_type())); const auto num_elems_vec_a = static_cast<int>(lhs->info()->dimension(0)); // The implementation computes 16 elements per iteration @@ -54,7 +57,7 @@ void vector_matrix_multiply_f32(const ITensor *lhs, const ITensor *rhs, ITensor Window win_b; // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 // This scenario can happen when the the matrix multiplication is used to perform a convolution operation - if(rhs->info()->num_dimensions() >= 3) + if (rhs->info()->num_dimensions() >= 3) { win_b = window; } @@ -69,209 +72,220 @@ void vector_matrix_multiply_f32(const ITensor *lhs, const ITensor *rhs, ITensor const float32x4_t alpha_f32 = vdupq_n_f32(alpha); - execute_window_loop(win_out, [&](const Coordinates &) - { - int x = window_start_x; - // Here we don't check for x lower equal than (window_end_x - window_step_x) because of - // window_end_x is computed above which may cause out-of-bound writes to the dst. - for(; x < (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win_out, + [&](const Coordinates &) { - if(x > width_matrix_b) + int x = window_start_x; + // Here we don't check for x lower equal than (window_end_x - window_step_x) because of + // window_end_x is computed above which may cause out-of-bound writes to the dst. 
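// Reference for the loop below: with B(k, x) denoting element (row k, column x)
// of the rhs matrix, each destination element is
//   out(x) = sum_k vec_a[k] * B(k, x), scaled by alpha when multiply_alpha is set.
// The main loop produces 16 outputs per iteration through four float32x4
// accumulators and consumes vec_a four elements at a time; the left-over loop
// further down finishes the remaining columns one at a time.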
+ for (; x < (window_end_x - window_step_x); x += window_step_x) { - return; - } + if (x > width_matrix_b) + { + return; + } - float32x4_t acc0 = vdupq_n_f32(0.f); - float32x4_t acc1 = vdupq_n_f32(0.f); - float32x4_t acc2 = vdupq_n_f32(0.f); - float32x4_t acc3 = vdupq_n_f32(0.f); + float32x4_t acc0 = vdupq_n_f32(0.f); + float32x4_t acc1 = vdupq_n_f32(0.f); + float32x4_t acc2 = vdupq_n_f32(0.f); + float32x4_t acc3 = vdupq_n_f32(0.f); - auto vec_a = reinterpret_cast<const float *>(ina.ptr()); - auto matrix_b = reinterpret_cast<const float *>(inb.ptr()) + x; + auto vec_a = reinterpret_cast<const float *>(ina.ptr()); + auto matrix_b = reinterpret_cast<const float *>(inb.ptr()) + x; #if __arm__ - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a))); - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b))); - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + in_b_stride))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + in_b_stride))); #endif /* __arm__ */ - auto vec_a_end_addr = vec_a + num_elems_vec_a; - for(; vec_a <= (vec_a_end_addr - 4);) - { - float32x2_t a0l = vld1_f32(vec_a); + auto vec_a_end_addr = vec_a + num_elems_vec_a; + for (; vec_a <= (vec_a_end_addr - 4);) + { + float32x2_t a0l = vld1_f32(vec_a); - float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride); - float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride); - float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride); - float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride); + float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride); + float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride); + float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride); + float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride); - float32x4_t b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride); - float32x4_t b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride); - float32x4_t b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride); - float32x4_t b13 = vld1q_f32(matrix_b + 12 + 1 * in_b_stride); + float32x4_t b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride); + float32x4_t b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride); + float32x4_t b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride); + float32x4_t b13 = vld1q_f32(matrix_b + 12 + 1 * in_b_stride); #if __arm__ - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a))); - asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 1 * in_b_stride))); - asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 2 * in_b_stride))); - asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 3 * in_b_stride))); - asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 4 * in_b_stride))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a))); + asm volatile( + "PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 1 * in_b_stride))); + asm volatile( + "PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 2 * in_b_stride))); + asm volatile( + "PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 3 * in_b_stride))); + asm volatile( + "PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 4 * 
in_b_stride))); #endif /* __arm__ */ - acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0); - acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0); - acc2 = vmlaq_lane_f32(acc2, b02, a0l, 0); - acc3 = vmlaq_lane_f32(acc3, b03, a0l, 0); + acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0); + acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0); + acc2 = vmlaq_lane_f32(acc2, b02, a0l, 0); + acc3 = vmlaq_lane_f32(acc3, b03, a0l, 0); - acc0 = vmlaq_lane_f32(acc0, b10, a0l, 1); - acc1 = vmlaq_lane_f32(acc1, b11, a0l, 1); - acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1); - acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1); + acc0 = vmlaq_lane_f32(acc0, b10, a0l, 1); + acc1 = vmlaq_lane_f32(acc1, b11, a0l, 1); + acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1); + acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1); - vec_a += 2; - matrix_b += 2 * in_b_stride; + vec_a += 2; + matrix_b += 2 * in_b_stride; - a0l = vld1_f32(vec_a); + a0l = vld1_f32(vec_a); - b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride); - b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride); - b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride); - b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride); + b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride); + b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride); + b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride); + b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride); - b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride); - b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride); - b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride); - b13 = vld1q_f32(matrix_b + 12 + 1 * in_b_stride); + b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride); + b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride); + b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride); + b13 = vld1q_f32(matrix_b + 12 + 1 * in_b_stride); - acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0); - acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0); - acc2 = vmlaq_lane_f32(acc2, b02, a0l, 0); - acc3 = vmlaq_lane_f32(acc3, b03, a0l, 0); + acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0); + acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0); + acc2 = vmlaq_lane_f32(acc2, b02, a0l, 0); + acc3 = vmlaq_lane_f32(acc3, b03, a0l, 0); - acc0 = vmlaq_lane_f32(acc0, b10, a0l, 1); - acc1 = vmlaq_lane_f32(acc1, b11, a0l, 1); - acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1); - acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1); + acc0 = vmlaq_lane_f32(acc0, b10, a0l, 1); + acc1 = vmlaq_lane_f32(acc1, b11, a0l, 1); + acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1); + acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1); - vec_a += 2; - matrix_b += 2 * in_b_stride; - } + vec_a += 2; + matrix_b += 2 * in_b_stride; + } - for(; vec_a < vec_a_end_addr; ++vec_a) - { - const float a0 = *vec_a; + for (; vec_a < vec_a_end_addr; ++vec_a) + { + const float a0 = *vec_a; - const float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride); - const float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride); - const float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride); - const float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride); + const float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride); + const float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride); + const float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride); + const float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride); - acc0 = vmlaq_n_f32(acc0, b00, a0); - acc1 = vmlaq_n_f32(acc1, b01, a0); - acc2 = vmlaq_n_f32(acc2, b02, a0); - acc3 = vmlaq_n_f32(acc3, b03, a0); + acc0 = vmlaq_n_f32(acc0, b00, a0); + acc1 = vmlaq_n_f32(acc1, b01, a0); + acc2 = vmlaq_n_f32(acc2, b02, a0); + acc3 = vmlaq_n_f32(acc3, b03, a0); - matrix_b += in_b_stride; - } + matrix_b += 
in_b_stride; + } - // Multiply by the weight of matrix product (alpha) - if(multiply_alpha) - { - acc0 = vmulq_f32(acc0, alpha_f32); - acc1 = vmulq_f32(acc1, alpha_f32); - acc2 = vmulq_f32(acc2, alpha_f32); - acc3 = vmulq_f32(acc3, alpha_f32); - } + // Multiply by the weight of matrix product (alpha) + if (multiply_alpha) + { + acc0 = vmulq_f32(acc0, alpha_f32); + acc1 = vmulq_f32(acc1, alpha_f32); + acc2 = vmulq_f32(acc2, alpha_f32); + acc3 = vmulq_f32(acc3, alpha_f32); + } - const auto vec_out = reinterpret_cast<float *>(out.ptr()) + x; + const auto vec_out = reinterpret_cast<float *>(out.ptr()) + x; - vst1q_f32(vec_out + 0, acc0); - vst1q_f32(vec_out + 4, acc1); - vst1q_f32(vec_out + 8, acc2); - vst1q_f32(vec_out + 12, acc3); - } + vst1q_f32(vec_out + 0, acc0); + vst1q_f32(vec_out + 4, acc1); + vst1q_f32(vec_out + 8, acc2); + vst1q_f32(vec_out + 12, acc3); + } - // Left-over loop - for(; x < window_end_x; ++x) - { - if(x > width_matrix_b) + // Left-over loop + for (; x < window_end_x; ++x) { - return; - } + if (x > width_matrix_b) + { + return; + } - float32x4_t vacc = vdupq_n_f32(0.f); + float32x4_t vacc = vdupq_n_f32(0.f); - auto vec_a = reinterpret_cast<const float *>(ina.ptr()); - auto matrix_b = reinterpret_cast<const float *>(inb.ptr()) + x; + auto vec_a = reinterpret_cast<const float *>(ina.ptr()); + auto matrix_b = reinterpret_cast<const float *>(inb.ptr()) + x; #if __arm__ - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a))); - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b))); - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + in_b_stride))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + in_b_stride))); #endif /* __arm__ */ - auto vec_a_end_addr = vec_a + num_elems_vec_a; - for(; vec_a <= (vec_a_end_addr - 4); vec_a += 4) - { - const float32x4_t a0l = vld1q_f32(vec_a); - - const float32x4_t b_col = + auto vec_a_end_addr = vec_a + num_elems_vec_a; + for (; vec_a <= (vec_a_end_addr - 4); vec_a += 4) { - *(matrix_b + 0 * in_b_stride), - *(matrix_b + 1 * in_b_stride), - *(matrix_b + 2 * in_b_stride), - *(matrix_b + 3 * in_b_stride), - }; + const float32x4_t a0l = vld1q_f32(vec_a); + + const float32x4_t b_col = { + *(matrix_b + 0 * in_b_stride), + *(matrix_b + 1 * in_b_stride), + *(matrix_b + 2 * in_b_stride), + *(matrix_b + 3 * in_b_stride), + }; #if __arm__ - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a))); - asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 1 * in_b_stride))); - asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 2 * in_b_stride))); - asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 3 * in_b_stride))); - asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 4 * in_b_stride))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a))); + asm volatile( + "PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 1 * in_b_stride))); + asm volatile( + "PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 2 * in_b_stride))); + asm volatile( + "PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 3 * in_b_stride))); + asm volatile( + "PLD 
[%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 4 * in_b_stride))); #endif /* __arm__ */ - vacc = vmlaq_f32(vacc, b_col, a0l); + vacc = vmlaq_f32(vacc, b_col, a0l); - matrix_b += 4 * in_b_stride; - } + matrix_b += 4 * in_b_stride; + } - float acc = vgetq_lane_f32(vacc, 0) + vgetq_lane_f32(vacc, 1) + vgetq_lane_f32(vacc, 2) + vgetq_lane_f32(vacc, 3); + float acc = vgetq_lane_f32(vacc, 0) + vgetq_lane_f32(vacc, 1) + vgetq_lane_f32(vacc, 2) + + vgetq_lane_f32(vacc, 3); - for(; vec_a < vec_a_end_addr; ++vec_a) - { - const float a0 = *vec_a; + for (; vec_a < vec_a_end_addr; ++vec_a) + { + const float a0 = *vec_a; - const float b00 = *matrix_b; + const float b00 = *matrix_b; - acc += b00 * a0; + acc += b00 * a0; - matrix_b += in_b_stride; - } + matrix_b += in_b_stride; + } - // Multiply by the weight of matrix product (alpha) - if(multiply_alpha) - { - acc *= alpha; - } + // Multiply by the weight of matrix product (alpha) + if (multiply_alpha) + { + acc *= alpha; + } - const auto vec_out = reinterpret_cast<float *>(out.ptr()) + x; + const auto vec_out = reinterpret_cast<float *>(out.ptr()) + x; - *vec_out = acc; - } - }, - ina, inb, out); + *vec_out = acc; + } + }, + ina, inb, out); } -void matrix_matrix_multiply_f32(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha) +void matrix_matrix_multiply_f32( + const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha) { ARM_COMPUTE_UNUSED(info); - const int out_width = static_cast<int>(dst->info()->dimension(0)); - const int out_height = static_cast<int>(dst->info()->dimension(1)); - const size_t in_b_stride = rhs->info()->strides_in_bytes()[1] / data_size_from_type(rhs->info()->data_type()); - const size_t out_stride1 = dst->info()->strides_in_bytes()[1] / data_size_from_type(dst->info()->data_type()); - const size_t out_stride2 = out_stride1 * 2; - const size_t out_stride3 = out_stride1 * 3; + const int out_width = static_cast<int>(dst->info()->dimension(0)); + const int out_height = static_cast<int>(dst->info()->dimension(1)); + const size_t in_b_stride = rhs->info()->strides_in_bytes()[1] / data_size_from_type(rhs->info()->data_type()); + const size_t out_stride1 = dst->info()->strides_in_bytes()[1] / data_size_from_type(dst->info()->data_type()); + const size_t out_stride2 = out_stride1 * 2; + const size_t out_stride3 = out_stride1 * 3; const int num_elems_matrix_b_x = rhs->info()->dimension(0); // Set step_x and step_y for matrix A. 
Scale by a factor of 4 the Y range as the input interleaved matrix A has 4 times less the rows of the dst matrix @@ -282,7 +296,7 @@ void matrix_matrix_multiply_f32(const ITensor *lhs, const ITensor *rhs, ITensor Window win_b; // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 // This scenario can happen when the the matrix multiplication is used to perform a convolution operation - if(rhs->info()->num_dimensions() >= 3) + if (rhs->info()->num_dimensions() >= 3) { win_b = window; } @@ -302,338 +316,340 @@ void matrix_matrix_multiply_f32(const ITensor *lhs, const ITensor *rhs, ITensor // The implementation assumes that the matrix A and Matrix B have been reshaped respectively with CpuGemmInterleave4x4 and CpuGemmTranspose1xW // The reshaping of the matrices helps to have a cache friendly implementation and helps to avoid the data re-arrangements needed for computing 16x4 elements per iteration // All the values needed for computing a single 4x4 block will be read from consecutive memory positions - execute_window_loop(window, [&](const Coordinates & id) - { - auto mtx_a0 = reinterpret_cast<const float *>(ina.ptr()); - auto mtx_b0 = reinterpret_cast<const float *>(inb.ptr()); - auto mtx_b1 = mtx_b0 + in_b_stride; + execute_window_loop( + window, + [&](const Coordinates &id) + { + auto mtx_a0 = reinterpret_cast<const float *>(ina.ptr()); + auto mtx_b0 = reinterpret_cast<const float *>(inb.ptr()); + auto mtx_b1 = mtx_b0 + in_b_stride; - float32x4_t acc00 = vdupq_n_f32(0.f); - float32x4_t acc10 = vdupq_n_f32(0.f); - float32x4_t acc20 = vdupq_n_f32(0.f); - float32x4_t acc30 = vdupq_n_f32(0.f); + float32x4_t acc00 = vdupq_n_f32(0.f); + float32x4_t acc10 = vdupq_n_f32(0.f); + float32x4_t acc20 = vdupq_n_f32(0.f); + float32x4_t acc30 = vdupq_n_f32(0.f); - float32x4_t acc01 = vdupq_n_f32(0.f); - float32x4_t acc11 = vdupq_n_f32(0.f); - float32x4_t acc21 = vdupq_n_f32(0.f); - float32x4_t acc31 = vdupq_n_f32(0.f); + float32x4_t acc01 = vdupq_n_f32(0.f); + float32x4_t acc11 = vdupq_n_f32(0.f); + float32x4_t acc21 = vdupq_n_f32(0.f); + float32x4_t acc31 = vdupq_n_f32(0.f); #if __arm__ - asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0))); - asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0))); - asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1))); + asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0))); + asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0))); + asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1))); #endif /* __arm__ */ - auto mtx_b0_end_addr = mtx_b0 + num_elems_matrix_b_x; - for(; mtx_b0 <= (mtx_b0_end_addr - 32);) - { - float32x4_t a0 = vld1q_dup_f32(mtx_a0 + 0); - float32x4_t a1 = vld1q_dup_f32(mtx_a0 + 1); - float32x4_t a2 = vld1q_dup_f32(mtx_a0 + 2); - float32x4_t a3 = vld1q_dup_f32(mtx_a0 + 3); + auto mtx_b0_end_addr = mtx_b0 + num_elems_matrix_b_x; + for (; mtx_b0 <= (mtx_b0_end_addr - 32);) + { + float32x4_t a0 = vld1q_dup_f32(mtx_a0 + 0); + float32x4_t a1 = vld1q_dup_f32(mtx_a0 + 1); + float32x4_t a2 = vld1q_dup_f32(mtx_a0 + 2); + float32x4_t a3 = vld1q_dup_f32(mtx_a0 + 3); - float32x4_t b00 = vld1q_f32(mtx_b0); - float32x4_t b10 = vld1q_f32(mtx_b1); - float32x4_t b01 = vld1q_f32(mtx_b0 + 4); - float32x4_t b11 = vld1q_f32(mtx_b1 + 4); + float32x4_t b00 = vld1q_f32(mtx_b0); + float32x4_t b10 = vld1q_f32(mtx_b1); + float32x4_t b01 = vld1q_f32(mtx_b0 + 4); + 
float32x4_t b11 = vld1q_f32(mtx_b1 + 4); #if __arm__ - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0))); - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0))); - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1))); #endif /* __arm__ */ - // 4x4 block 0 - acc00 = vmlaq_f32(acc00, b00, a0); - acc10 = vmlaq_f32(acc10, b00, a1); - acc20 = vmlaq_f32(acc20, b00, a2); - acc30 = vmlaq_f32(acc30, b00, a3); - - float32x4_t a4 = vld1q_dup_f32(mtx_a0 + 4); - float32x4_t a5 = vld1q_dup_f32(mtx_a0 + 5); - float32x4_t a6 = vld1q_dup_f32(mtx_a0 + 6); - float32x4_t a7 = vld1q_dup_f32(mtx_a0 + 7); - - // 4x4 block 1 - acc01 = vmlaq_f32(acc01, b10, a0); - acc11 = vmlaq_f32(acc11, b10, a1); - acc21 = vmlaq_f32(acc21, b10, a2); - acc31 = vmlaq_f32(acc31, b10, a3); - - // 4x4 block 0 - acc00 = vmlaq_f32(acc00, b01, a4); - acc10 = vmlaq_f32(acc10, b01, a5); - acc20 = vmlaq_f32(acc20, b01, a6); - acc30 = vmlaq_f32(acc30, b01, a7); - - // 4x4 block 1 - acc01 = vmlaq_f32(acc01, b11, a4); - acc11 = vmlaq_f32(acc11, b11, a5); - acc21 = vmlaq_f32(acc21, b11, a6); - acc31 = vmlaq_f32(acc31, b11, a7); - - mtx_a0 += 8; - mtx_b0 += 8; - mtx_b1 += 8; - - a0 = vld1q_dup_f32(mtx_a0 + 0); - a1 = vld1q_dup_f32(mtx_a0 + 1); - a2 = vld1q_dup_f32(mtx_a0 + 2); - a3 = vld1q_dup_f32(mtx_a0 + 3); - - b00 = vld1q_f32(mtx_b0); - b10 = vld1q_f32(mtx_b1); - b01 = vld1q_f32(mtx_b0 + 4); - b11 = vld1q_f32(mtx_b1 + 4); - - // 4x4 block 0 - acc00 = vmlaq_f32(acc00, b00, a0); - acc10 = vmlaq_f32(acc10, b00, a1); - acc20 = vmlaq_f32(acc20, b00, a2); - acc30 = vmlaq_f32(acc30, b00, a3); - - a4 = vld1q_dup_f32(mtx_a0 + 4); - a5 = vld1q_dup_f32(mtx_a0 + 5); - a6 = vld1q_dup_f32(mtx_a0 + 6); - a7 = vld1q_dup_f32(mtx_a0 + 7); - - // 4x4 block 1 - acc01 = vmlaq_f32(acc01, b10, a0); - acc11 = vmlaq_f32(acc11, b10, a1); - acc21 = vmlaq_f32(acc21, b10, a2); - acc31 = vmlaq_f32(acc31, b10, a3); - - // 4x4 block 0 - acc00 = vmlaq_f32(acc00, b01, a4); - acc10 = vmlaq_f32(acc10, b01, a5); - acc20 = vmlaq_f32(acc20, b01, a6); - acc30 = vmlaq_f32(acc30, b01, a7); - - // 4x4 block 1 - acc01 = vmlaq_f32(acc01, b11, a4); - acc11 = vmlaq_f32(acc11, b11, a5); - acc21 = vmlaq_f32(acc21, b11, a6); - acc31 = vmlaq_f32(acc31, b11, a7); - - mtx_a0 += 8; - mtx_b0 += 8; - mtx_b1 += 8; - - a0 = vld1q_dup_f32(mtx_a0 + 0); - a1 = vld1q_dup_f32(mtx_a0 + 1); - a2 = vld1q_dup_f32(mtx_a0 + 2); - a3 = vld1q_dup_f32(mtx_a0 + 3); - b00 = vld1q_f32(mtx_b0); - b10 = vld1q_f32(mtx_b1); - b01 = vld1q_f32(mtx_b0 + 4); - b11 = vld1q_f32(mtx_b1 + 4); + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b00, a0); + acc10 = vmlaq_f32(acc10, b00, a1); + acc20 = vmlaq_f32(acc20, b00, a2); + acc30 = vmlaq_f32(acc30, b00, a3); + + float32x4_t a4 = vld1q_dup_f32(mtx_a0 + 4); + float32x4_t a5 = vld1q_dup_f32(mtx_a0 + 5); + float32x4_t a6 = vld1q_dup_f32(mtx_a0 + 6); + float32x4_t a7 = vld1q_dup_f32(mtx_a0 + 7); + + // 4x4 block 1 + acc01 = vmlaq_f32(acc01, b10, a0); + acc11 = vmlaq_f32(acc11, b10, a1); + acc21 = vmlaq_f32(acc21, b10, a2); + acc31 = vmlaq_f32(acc31, b10, a3); + + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b01, a4); + acc10 = vmlaq_f32(acc10, b01, a5); + acc20 = vmlaq_f32(acc20, b01, a6); + acc30 = vmlaq_f32(acc30, b01, a7); + + // 4x4 block 1 + 
acc01 = vmlaq_f32(acc01, b11, a4); + acc11 = vmlaq_f32(acc11, b11, a5); + acc21 = vmlaq_f32(acc21, b11, a6); + acc31 = vmlaq_f32(acc31, b11, a7); + + mtx_a0 += 8; + mtx_b0 += 8; + mtx_b1 += 8; + + a0 = vld1q_dup_f32(mtx_a0 + 0); + a1 = vld1q_dup_f32(mtx_a0 + 1); + a2 = vld1q_dup_f32(mtx_a0 + 2); + a3 = vld1q_dup_f32(mtx_a0 + 3); + + b00 = vld1q_f32(mtx_b0); + b10 = vld1q_f32(mtx_b1); + b01 = vld1q_f32(mtx_b0 + 4); + b11 = vld1q_f32(mtx_b1 + 4); + + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b00, a0); + acc10 = vmlaq_f32(acc10, b00, a1); + acc20 = vmlaq_f32(acc20, b00, a2); + acc30 = vmlaq_f32(acc30, b00, a3); + + a4 = vld1q_dup_f32(mtx_a0 + 4); + a5 = vld1q_dup_f32(mtx_a0 + 5); + a6 = vld1q_dup_f32(mtx_a0 + 6); + a7 = vld1q_dup_f32(mtx_a0 + 7); + + // 4x4 block 1 + acc01 = vmlaq_f32(acc01, b10, a0); + acc11 = vmlaq_f32(acc11, b10, a1); + acc21 = vmlaq_f32(acc21, b10, a2); + acc31 = vmlaq_f32(acc31, b10, a3); + + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b01, a4); + acc10 = vmlaq_f32(acc10, b01, a5); + acc20 = vmlaq_f32(acc20, b01, a6); + acc30 = vmlaq_f32(acc30, b01, a7); + + // 4x4 block 1 + acc01 = vmlaq_f32(acc01, b11, a4); + acc11 = vmlaq_f32(acc11, b11, a5); + acc21 = vmlaq_f32(acc21, b11, a6); + acc31 = vmlaq_f32(acc31, b11, a7); + + mtx_a0 += 8; + mtx_b0 += 8; + mtx_b1 += 8; + + a0 = vld1q_dup_f32(mtx_a0 + 0); + a1 = vld1q_dup_f32(mtx_a0 + 1); + a2 = vld1q_dup_f32(mtx_a0 + 2); + a3 = vld1q_dup_f32(mtx_a0 + 3); + b00 = vld1q_f32(mtx_b0); + b10 = vld1q_f32(mtx_b1); + b01 = vld1q_f32(mtx_b0 + 4); + b11 = vld1q_f32(mtx_b1 + 4); #if __arm__ - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0))); - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0))); - asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1))); #endif /* __arm__ */ - // 4x4 block 0 - acc00 = vmlaq_f32(acc00, b00, a0); - acc10 = vmlaq_f32(acc10, b00, a1); - acc20 = vmlaq_f32(acc20, b00, a2); - acc30 = vmlaq_f32(acc30, b00, a3); - - a4 = vld1q_dup_f32(mtx_a0 + 4); - a5 = vld1q_dup_f32(mtx_a0 + 5); - a6 = vld1q_dup_f32(mtx_a0 + 6); - a7 = vld1q_dup_f32(mtx_a0 + 7); - - // 4x4 block 1 - acc01 = vmlaq_f32(acc01, b10, a0); - acc11 = vmlaq_f32(acc11, b10, a1); - acc21 = vmlaq_f32(acc21, b10, a2); - acc31 = vmlaq_f32(acc31, b10, a3); - - // 4x4 block 0 - acc00 = vmlaq_f32(acc00, b01, a4); - acc10 = vmlaq_f32(acc10, b01, a5); - acc20 = vmlaq_f32(acc20, b01, a6); - acc30 = vmlaq_f32(acc30, b01, a7); - - // 4x4 block 1 - acc01 = vmlaq_f32(acc01, b11, a4); - acc11 = vmlaq_f32(acc11, b11, a5); - acc21 = vmlaq_f32(acc21, b11, a6); - acc31 = vmlaq_f32(acc31, b11, a7); - - mtx_a0 += 8; - mtx_b0 += 8; - mtx_b1 += 8; - - a0 = vld1q_dup_f32(mtx_a0 + 0); - a1 = vld1q_dup_f32(mtx_a0 + 1); - a2 = vld1q_dup_f32(mtx_a0 + 2); - a3 = vld1q_dup_f32(mtx_a0 + 3); - b00 = vld1q_f32(mtx_b0); - b10 = vld1q_f32(mtx_b1); - b01 = vld1q_f32(mtx_b0 + 4); - b11 = vld1q_f32(mtx_b1 + 4); - - // 4x4 block 0 - acc00 = vmlaq_f32(acc00, b00, a0); - acc10 = vmlaq_f32(acc10, b00, a1); - acc20 = vmlaq_f32(acc20, b00, a2); - acc30 = vmlaq_f32(acc30, b00, a3); - - a4 = vld1q_dup_f32(mtx_a0 + 4); - a5 = vld1q_dup_f32(mtx_a0 + 5); - a6 = vld1q_dup_f32(mtx_a0 + 6); - a7 = vld1q_dup_f32(mtx_a0 + 7); - - // 4x4 block 1 - acc01 = vmlaq_f32(acc01, 
b10, a0); - acc11 = vmlaq_f32(acc11, b10, a1); - acc21 = vmlaq_f32(acc21, b10, a2); - acc31 = vmlaq_f32(acc31, b10, a3); - - // 4x4 block 0 - acc00 = vmlaq_f32(acc00, b01, a4); - acc10 = vmlaq_f32(acc10, b01, a5); - acc20 = vmlaq_f32(acc20, b01, a6); - acc30 = vmlaq_f32(acc30, b01, a7); - - // 4x4 block 1 - acc01 = vmlaq_f32(acc01, b11, a4); - acc11 = vmlaq_f32(acc11, b11, a5); - acc21 = vmlaq_f32(acc21, b11, a6); - acc31 = vmlaq_f32(acc31, b11, a7); - - mtx_a0 += 8; - mtx_b0 += 8; - mtx_b1 += 8; - } - - for(; mtx_b0 < mtx_b0_end_addr;) - { - float32x4_t a0 = vld1q_dup_f32(mtx_a0 + 0); - float32x4_t a1 = vld1q_dup_f32(mtx_a0 + 1); - float32x4_t a2 = vld1q_dup_f32(mtx_a0 + 2); - float32x4_t a3 = vld1q_dup_f32(mtx_a0 + 3); - float32x4_t b00 = vld1q_f32(mtx_b0); - float32x4_t b10 = vld1q_f32(mtx_b1); + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b00, a0); + acc10 = vmlaq_f32(acc10, b00, a1); + acc20 = vmlaq_f32(acc20, b00, a2); + acc30 = vmlaq_f32(acc30, b00, a3); + + a4 = vld1q_dup_f32(mtx_a0 + 4); + a5 = vld1q_dup_f32(mtx_a0 + 5); + a6 = vld1q_dup_f32(mtx_a0 + 6); + a7 = vld1q_dup_f32(mtx_a0 + 7); + + // 4x4 block 1 + acc01 = vmlaq_f32(acc01, b10, a0); + acc11 = vmlaq_f32(acc11, b10, a1); + acc21 = vmlaq_f32(acc21, b10, a2); + acc31 = vmlaq_f32(acc31, b10, a3); + + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b01, a4); + acc10 = vmlaq_f32(acc10, b01, a5); + acc20 = vmlaq_f32(acc20, b01, a6); + acc30 = vmlaq_f32(acc30, b01, a7); + + // 4x4 block 1 + acc01 = vmlaq_f32(acc01, b11, a4); + acc11 = vmlaq_f32(acc11, b11, a5); + acc21 = vmlaq_f32(acc21, b11, a6); + acc31 = vmlaq_f32(acc31, b11, a7); + + mtx_a0 += 8; + mtx_b0 += 8; + mtx_b1 += 8; + + a0 = vld1q_dup_f32(mtx_a0 + 0); + a1 = vld1q_dup_f32(mtx_a0 + 1); + a2 = vld1q_dup_f32(mtx_a0 + 2); + a3 = vld1q_dup_f32(mtx_a0 + 3); + b00 = vld1q_f32(mtx_b0); + b10 = vld1q_f32(mtx_b1); + b01 = vld1q_f32(mtx_b0 + 4); + b11 = vld1q_f32(mtx_b1 + 4); + + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b00, a0); + acc10 = vmlaq_f32(acc10, b00, a1); + acc20 = vmlaq_f32(acc20, b00, a2); + acc30 = vmlaq_f32(acc30, b00, a3); + + a4 = vld1q_dup_f32(mtx_a0 + 4); + a5 = vld1q_dup_f32(mtx_a0 + 5); + a6 = vld1q_dup_f32(mtx_a0 + 6); + a7 = vld1q_dup_f32(mtx_a0 + 7); + + // 4x4 block 1 + acc01 = vmlaq_f32(acc01, b10, a0); + acc11 = vmlaq_f32(acc11, b10, a1); + acc21 = vmlaq_f32(acc21, b10, a2); + acc31 = vmlaq_f32(acc31, b10, a3); + + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b01, a4); + acc10 = vmlaq_f32(acc10, b01, a5); + acc20 = vmlaq_f32(acc20, b01, a6); + acc30 = vmlaq_f32(acc30, b01, a7); + + // 4x4 block 1 + acc01 = vmlaq_f32(acc01, b11, a4); + acc11 = vmlaq_f32(acc11, b11, a5); + acc21 = vmlaq_f32(acc21, b11, a6); + acc31 = vmlaq_f32(acc31, b11, a7); + + mtx_a0 += 8; + mtx_b0 += 8; + mtx_b1 += 8; + } + + for (; mtx_b0 < mtx_b0_end_addr;) + { + float32x4_t a0 = vld1q_dup_f32(mtx_a0 + 0); + float32x4_t a1 = vld1q_dup_f32(mtx_a0 + 1); + float32x4_t a2 = vld1q_dup_f32(mtx_a0 + 2); + float32x4_t a3 = vld1q_dup_f32(mtx_a0 + 3); + float32x4_t b00 = vld1q_f32(mtx_b0); + float32x4_t b10 = vld1q_f32(mtx_b1); #if __arm__ - asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0))); - asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0))); - asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1))); + asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0))); + asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0))); + asm volatile("PLD [%0, #128*2]" 
::"r"(reinterpret_cast<const uint8_t *>(mtx_b1))); #endif /* __arm__ */ - // 4x4 block 0 - acc00 = vmlaq_f32(acc00, b00, a0); - acc10 = vmlaq_f32(acc10, b00, a1); - acc20 = vmlaq_f32(acc20, b00, a2); - acc30 = vmlaq_f32(acc30, b00, a3); - - // 4x4 block 1 - acc01 = vmlaq_f32(acc01, b10, a0); - acc11 = vmlaq_f32(acc11, b10, a1); - acc21 = vmlaq_f32(acc21, b10, a2); - acc31 = vmlaq_f32(acc31, b10, a3); - - mtx_a0 += 4; - mtx_b0 += 4; - mtx_b1 += 4; - } - - // Multiply by the weight of matrix product (alpha) - if(multiply_alpha) - { - acc00 = vmulq_f32(acc00, alpha_f32); - acc10 = vmulq_f32(acc10, alpha_f32); - acc20 = vmulq_f32(acc20, alpha_f32); - acc30 = vmulq_f32(acc30, alpha_f32); - acc01 = vmulq_f32(acc01, alpha_f32); - acc11 = vmulq_f32(acc11, alpha_f32); - acc21 = vmulq_f32(acc21, alpha_f32); - acc31 = vmulq_f32(acc31, alpha_f32); - } - - const auto mtx_out0 = reinterpret_cast<float *>(out.ptr()); - const auto mtx_out1 = mtx_out0 + 4; - - if(id.x() < (out_width - 8)) - { - vst1q_f32(mtx_out0, acc00); - vst1q_f32(mtx_out1, acc01); - if(id.y() + 1 < out_height) + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b00, a0); + acc10 = vmlaq_f32(acc10, b00, a1); + acc20 = vmlaq_f32(acc20, b00, a2); + acc30 = vmlaq_f32(acc30, b00, a3); + + // 4x4 block 1 + acc01 = vmlaq_f32(acc01, b10, a0); + acc11 = vmlaq_f32(acc11, b10, a1); + acc21 = vmlaq_f32(acc21, b10, a2); + acc31 = vmlaq_f32(acc31, b10, a3); + + mtx_a0 += 4; + mtx_b0 += 4; + mtx_b1 += 4; + } + + // Multiply by the weight of matrix product (alpha) + if (multiply_alpha) + { + acc00 = vmulq_f32(acc00, alpha_f32); + acc10 = vmulq_f32(acc10, alpha_f32); + acc20 = vmulq_f32(acc20, alpha_f32); + acc30 = vmulq_f32(acc30, alpha_f32); + acc01 = vmulq_f32(acc01, alpha_f32); + acc11 = vmulq_f32(acc11, alpha_f32); + acc21 = vmulq_f32(acc21, alpha_f32); + acc31 = vmulq_f32(acc31, alpha_f32); + } + + const auto mtx_out0 = reinterpret_cast<float *>(out.ptr()); + const auto mtx_out1 = mtx_out0 + 4; + + if (id.x() < (out_width - 8)) { - vst1q_f32(mtx_out0 + out_stride1, acc10); - vst1q_f32(mtx_out1 + out_stride1, acc11); - if(id.y() + 2 < out_height) + vst1q_f32(mtx_out0, acc00); + vst1q_f32(mtx_out1, acc01); + if (id.y() + 1 < out_height) { - vst1q_f32(mtx_out0 + out_stride2, acc20); - vst1q_f32(mtx_out1 + out_stride2, acc21); - if(id.y() + 3 < out_height) + vst1q_f32(mtx_out0 + out_stride1, acc10); + vst1q_f32(mtx_out1 + out_stride1, acc11); + if (id.y() + 2 < out_height) { - vst1q_f32(mtx_out0 + out_stride3, acc30); - vst1q_f32(mtx_out1 + out_stride3, acc31); + vst1q_f32(mtx_out0 + out_stride2, acc20); + vst1q_f32(mtx_out1 + out_stride2, acc21); + if (id.y() + 3 < out_height) + { + vst1q_f32(mtx_out0 + out_stride3, acc30); + vst1q_f32(mtx_out1 + out_stride3, acc31); + } } } } - } - else if(id.x() < (out_width - 4)) - { - vst1q_f32(mtx_out0, acc00); - if(id.y() + 1 < out_height) + else if (id.x() < (out_width - 4)) { - vst1q_f32(mtx_out0 + out_stride1, acc10); - if(id.y() + 2 < out_height) + vst1q_f32(mtx_out0, acc00); + if (id.y() + 1 < out_height) { - vst1q_f32(mtx_out0 + out_stride2, acc20); - if(id.y() + 3 < out_height) + vst1q_f32(mtx_out0 + out_stride1, acc10); + if (id.y() + 2 < out_height) { - vst1q_f32(mtx_out0 + out_stride3, acc30); + vst1q_f32(mtx_out0 + out_stride2, acc20); + if (id.y() + 3 < out_height) + { + vst1q_f32(mtx_out0 + out_stride3, acc30); + } } } - } - // Left-over columns - const int columns_left = out_width - id.x() - 4; - for(auto x = 0; x < columns_left; ++x) - { - *(mtx_out1 + x) = acc01[x]; - if(id.y() + 1 < out_height) + // 
Left-over columns + const int columns_left = out_width - id.x() - 4; + for (auto x = 0; x < columns_left; ++x) { - *(mtx_out1 + x + out_stride1) = acc11[x]; - if(id.y() + 2 < out_height) + *(mtx_out1 + x) = acc01[x]; + if (id.y() + 1 < out_height) { - *(mtx_out1 + x + out_stride2) = acc21[x]; - if(id.y() + 3 < out_height) + *(mtx_out1 + x + out_stride1) = acc11[x]; + if (id.y() + 2 < out_height) { - *(mtx_out1 + x + out_stride3) = acc31[x]; + *(mtx_out1 + x + out_stride2) = acc21[x]; + if (id.y() + 3 < out_height) + { + *(mtx_out1 + x + out_stride3) = acc31[x]; + } } } } } - } - else - { - // Left-over columns - const int columns_left = out_width - id.x(); - for(int x = 0; x < columns_left; ++x) + else { - *(mtx_out0 + x) = acc00[x]; - if(id.y() + 1 < out_height) + // Left-over columns + const int columns_left = out_width - id.x(); + for (int x = 0; x < columns_left; ++x) { - *(mtx_out0 + x + out_stride1) = acc10[x]; - if(id.y() + 2 < out_height) + *(mtx_out0 + x) = acc00[x]; + if (id.y() + 1 < out_height) { - *(mtx_out0 + x + out_stride2) = acc20[x]; - if(id.y() + 3 < out_height) + *(mtx_out0 + x + out_stride1) = acc10[x]; + if (id.y() + 2 < out_height) { - *(mtx_out0 + x + out_stride3) = acc30[x]; + *(mtx_out0 + x + out_stride2) = acc20[x]; + if (id.y() + 3 < out_height) + { + *(mtx_out0 + x + out_stride3) = acc30[x]; + } } } } } - } - }, - ina, inb, out); + }, + ina, inb, out); } } // namespace cpu diff --git a/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.h b/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.h index f9f1f247ac..74ea4c2b17 100644 --- a/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.h +++ b/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.h @@ -24,15 +24,18 @@ #ifndef SRC_CORE_KERNELS_GEMMMATRIXMUL_IMPL_H #define SRC_CORE_KERNELS_GEMMMATRIXMUL_IMPL_H #include "arm_compute/core/Helpers.h" + #include "src/core/CPP/Validate.h" namespace arm_compute { namespace cpu { -void vector_matrix_multiply_f32(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha); +void vector_matrix_multiply_f32( + const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha); -void matrix_matrix_multiply_f32(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha); +void matrix_matrix_multiply_f32( + const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/gemm_matrix_mul/list.h b/src/cpu/kernels/gemm_matrix_mul/list.h index 9cdb58ae06..15b23b1d81 100644 --- a/src/cpu/kernels/gemm_matrix_mul/list.h +++ b/src/cpu/kernels/gemm_matrix_mul/list.h @@ -27,8 +27,9 @@ namespace arm_compute { namespace cpu { -#define DECLARE_GEMMMATRIXMUL_KERNEL(func_name) \ - void func_name(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha, const bool is_dst_vector) +#define DECLARE_GEMMMATRIXMUL_KERNEL(func_name) \ + void func_name(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, \ + float alpha, const bool is_dst_vector) DECLARE_GEMMMATRIXMUL_KERNEL(neon_fp32_gemm_matrix_mul); DECLARE_GEMMMATRIXMUL_KERNEL(neon_fp16_gemm_matrix_mul); #undef DECLARE_GEMMMATRIXMUL_KERNEL diff --git a/src/cpu/kernels/genproposals/generic/neon/fp16.cpp b/src/cpu/kernels/genproposals/generic/neon/fp16.cpp index d4e469b691..4ed7e54f1c 
100644 --- a/src/cpu/kernels/genproposals/generic/neon/fp16.cpp +++ b/src/cpu/kernels/genproposals/generic/neon/fp16.cpp @@ -27,10 +27,13 @@ namespace arm_compute { namespace cpu { -void neon_fp16_computeallanchors(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window) +void neon_fp16_computeallanchors(const ITensor *anchors, + ITensor *all_anchors, + ComputeAnchorsInfo anchors_info, + const Window &window) { return compute_all_anchors<float16_t>(anchors, all_anchors, anchors_info, window); } -} +} // namespace cpu } // namespace arm_compute #endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) diff --git a/src/cpu/kernels/genproposals/generic/neon/fp32.cpp b/src/cpu/kernels/genproposals/generic/neon/fp32.cpp index 09aa6ecec4..f15cd63bb2 100644 --- a/src/cpu/kernels/genproposals/generic/neon/fp32.cpp +++ b/src/cpu/kernels/genproposals/generic/neon/fp32.cpp @@ -26,9 +26,12 @@ namespace arm_compute { namespace cpu { -void neon_fp32_computeallanchors(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window) +void neon_fp32_computeallanchors(const ITensor *anchors, + ITensor *all_anchors, + ComputeAnchorsInfo anchors_info, + const Window &window) { return compute_all_anchors<float>(anchors, all_anchors, anchors_info, window); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/genproposals/generic/neon/impl.cpp b/src/cpu/kernels/genproposals/generic/neon/impl.cpp index 9224e32a94..8cb76f3afb 100644 --- a/src/cpu/kernels/genproposals/generic/neon/impl.cpp +++ b/src/cpu/kernels/genproposals/generic/neon/impl.cpp @@ -28,7 +28,10 @@ class ITensor; class Window; namespace cpu { -void compute_all_anchors_qasymm16(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window) +void compute_all_anchors_qasymm16(const ITensor *anchors, + ITensor *all_anchors, + ComputeAnchorsInfo anchors_info, + const Window &window) { Iterator all_anchors_it(all_anchors, window); Iterator anchors_it(all_anchors, window); @@ -39,28 +42,30 @@ void compute_all_anchors_qasymm16(const ITensor *anchors, ITensor *all_anchors, const UniformQuantizationInfo qinfo = anchors->info()->quantization_info().uniform(); - execute_window_loop(window, [&](const Coordinates & id) - { - const size_t anchor_offset = id.y() % num_anchors; + execute_window_loop( + window, + [&](const Coordinates &id) + { + const size_t anchor_offset = id.y() % num_anchors; - const auto out_anchor_ptr = reinterpret_cast<int16_t *>(all_anchors_it.ptr()); - const auto anchor_ptr = reinterpret_cast<int16_t *>(anchors->ptr_to_element(Coordinates(0, anchor_offset))); + const auto out_anchor_ptr = reinterpret_cast<int16_t *>(all_anchors_it.ptr()); + const auto anchor_ptr = reinterpret_cast<int16_t *>(anchors->ptr_to_element(Coordinates(0, anchor_offset))); - const size_t shift_idy = id.y() / num_anchors; - const float shiftx = (shift_idy % feat_width) * stride; - const float shifty = (shift_idy / feat_width) * stride; + const size_t shift_idy = id.y() / num_anchors; + const float shiftx = (shift_idy % feat_width) * stride; + const float shifty = (shift_idy / feat_width) * stride; - const float new_anchor_x1 = dequantize_qsymm16(*anchor_ptr, qinfo.scale) + shiftx; - const float new_anchor_y1 = dequantize_qsymm16(*(1 + anchor_ptr), qinfo.scale) + shifty; - const float new_anchor_x2 = dequantize_qsymm16(*(2 + anchor_ptr), qinfo.scale) + shiftx; - const float new_anchor_y2 = 
dequantize_qsymm16(*(3 + anchor_ptr), qinfo.scale) + shifty; + const float new_anchor_x1 = dequantize_qsymm16(*anchor_ptr, qinfo.scale) + shiftx; + const float new_anchor_y1 = dequantize_qsymm16(*(1 + anchor_ptr), qinfo.scale) + shifty; + const float new_anchor_x2 = dequantize_qsymm16(*(2 + anchor_ptr), qinfo.scale) + shiftx; + const float new_anchor_y2 = dequantize_qsymm16(*(3 + anchor_ptr), qinfo.scale) + shifty; - *out_anchor_ptr = quantize_qsymm16(new_anchor_x1, qinfo.scale); - *(out_anchor_ptr + 1) = quantize_qsymm16(new_anchor_y1, qinfo.scale); - *(out_anchor_ptr + 2) = quantize_qsymm16(new_anchor_x2, qinfo.scale); - *(out_anchor_ptr + 3) = quantize_qsymm16(new_anchor_y2, qinfo.scale); - }, - all_anchors_it); + *out_anchor_ptr = quantize_qsymm16(new_anchor_x1, qinfo.scale); + *(out_anchor_ptr + 1) = quantize_qsymm16(new_anchor_y1, qinfo.scale); + *(out_anchor_ptr + 2) = quantize_qsymm16(new_anchor_x2, qinfo.scale); + *(out_anchor_ptr + 3) = quantize_qsymm16(new_anchor_y2, qinfo.scale); + }, + all_anchors_it); } } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/genproposals/generic/neon/impl.h b/src/cpu/kernels/genproposals/generic/neon/impl.h index da052c9192..3317bcfbe6 100644 --- a/src/cpu/kernels/genproposals/generic/neon/impl.h +++ b/src/cpu/kernels/genproposals/generic/neon/impl.h @@ -26,13 +26,17 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Window.h" + #include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute { namespace cpu { template <typename T> -void compute_all_anchors(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window) +void compute_all_anchors(const ITensor *anchors, + ITensor *all_anchors, + ComputeAnchorsInfo anchors_info, + const Window &window) { Iterator all_anchors_it(all_anchors, window); Iterator anchors_it(all_anchors, window); @@ -41,26 +45,31 @@ void compute_all_anchors(const ITensor *anchors, ITensor *all_anchors, ComputeAn const T stride = 1.f / anchors_info.spatial_scale(); const size_t feat_width = anchors_info.feat_width(); - execute_window_loop(window, [&](const Coordinates & id) - { - const size_t anchor_offset = id.y() % num_anchors; + execute_window_loop( + window, + [&](const Coordinates &id) + { + const size_t anchor_offset = id.y() % num_anchors; - const auto out_anchor_ptr = reinterpret_cast<T *>(all_anchors_it.ptr()); - const auto anchor_ptr = reinterpret_cast<T *>(anchors->ptr_to_element(Coordinates(0, anchor_offset))); + const auto out_anchor_ptr = reinterpret_cast<T *>(all_anchors_it.ptr()); + const auto anchor_ptr = reinterpret_cast<T *>(anchors->ptr_to_element(Coordinates(0, anchor_offset))); - const size_t shift_idy = id.y() / num_anchors; - const T shiftx = (shift_idy % feat_width) * stride; - const T shifty = (shift_idy / feat_width) * stride; + const size_t shift_idy = id.y() / num_anchors; + const T shiftx = (shift_idy % feat_width) * stride; + const T shifty = (shift_idy / feat_width) * stride; - *out_anchor_ptr = *anchor_ptr + shiftx; - *(out_anchor_ptr + 1) = *(1 + anchor_ptr) + shifty; - *(out_anchor_ptr + 2) = *(2 + anchor_ptr) + shiftx; - *(out_anchor_ptr + 3) = *(3 + anchor_ptr) + shifty; - }, - all_anchors_it); + *out_anchor_ptr = *anchor_ptr + shiftx; + *(out_anchor_ptr + 1) = *(1 + anchor_ptr) + shifty; + *(out_anchor_ptr + 2) = *(2 + anchor_ptr) + shiftx; + *(out_anchor_ptr + 3) = *(3 + anchor_ptr) + shifty; + }, + all_anchors_it); } -void compute_all_anchors_qasymm16(const ITensor 
*anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window); +void compute_all_anchors_qasymm16(const ITensor *anchors, + ITensor *all_anchors, + ComputeAnchorsInfo anchors_info, + const Window &window); } // namespace cpu } // namespace arm_compute #endif //define SRC_CORE_SVE_KERNELS_NEGENERATEPROPOSALSLAYERKERNEL_IMPL_H diff --git a/src/cpu/kernels/genproposals/generic/neon/qsymm16.cpp b/src/cpu/kernels/genproposals/generic/neon/qsymm16.cpp index cfb5a41d6e..7182d0b27d 100644 --- a/src/cpu/kernels/genproposals/generic/neon/qsymm16.cpp +++ b/src/cpu/kernels/genproposals/generic/neon/qsymm16.cpp @@ -26,9 +26,12 @@ namespace arm_compute { namespace cpu { -void neon_qu16_computeallanchors(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window) +void neon_qu16_computeallanchors(const ITensor *anchors, + ITensor *all_anchors, + ComputeAnchorsInfo anchors_info, + const Window &window) { return compute_all_anchors_qasymm16(anchors, all_anchors, anchors_info, window); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/instancenorm/generic/neon/fp16.cpp b/src/cpu/kernels/instancenorm/generic/neon/fp16.cpp index 2b7d91b144..44418c0bb9 100644 --- a/src/cpu/kernels/instancenorm/generic/neon/fp16.cpp +++ b/src/cpu/kernels/instancenorm/generic/neon/fp16.cpp @@ -23,6 +23,7 @@ */ #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) #include "arm_compute/core/Helpers.h" + #include "src/core/NEON/wrapper/wrapper.h" #include "src/cpu/kernels/instancenorm/generic/neon/impl.h" @@ -40,7 +41,10 @@ void vector_float_sum_fp16(AccType &result, AccType &result_square, const InputT } template <typename InputType, typename AccType> -InputType vector_float_norm_fp16(const InputType &inputs, const AccType &vec_mean, const AccType &vec_multip, const AccType &vec_beta) +InputType vector_float_norm_fp16(const InputType &inputs, + const AccType &vec_mean, + const AccType &vec_multip, + const AccType &vec_beta) { return wrapper::vadd(wrapper::vmul(wrapper::vsub(inputs, vec_mean), vec_multip), vec_beta); } @@ -52,19 +56,24 @@ inline void vector_float_sum_fp16(float32x4_t &result, float32x4_t &result_squar vector_float_sum_fp16(result, result_square, wrapper::vcvt<float>(wrapper::vgethigh(inputs))); } template <> -inline float16x8_t vector_float_norm_fp16(const float16x8_t &inputs, const float32x4_t &vec_mean, const float32x4_t &vec_multip, const float32x4_t &vec_beta) +inline float16x8_t vector_float_norm_fp16(const float16x8_t &inputs, + const float32x4_t &vec_mean, + const float32x4_t &vec_multip, + const float32x4_t &vec_beta) { - const auto input_low = wrapper::vcvt<float>(wrapper::vgetlow(inputs)); - const auto input_high = wrapper::vcvt<float>(wrapper::vgethigh(inputs)); - const auto result_low = wrapper::vcvt<float16_t>(vector_float_norm_fp16(input_low, vec_mean, vec_multip, vec_beta)); - const auto result_high = wrapper::vcvt<float16_t>(vector_float_norm_fp16(input_high, vec_mean, vec_multip, vec_beta)); - float16x8_t result = wrapper::vcombine(result_low, result_high); + const auto input_low = wrapper::vcvt<float>(wrapper::vgetlow(inputs)); + const auto input_high = wrapper::vcvt<float>(wrapper::vgethigh(inputs)); + const auto result_low = wrapper::vcvt<float16_t>(vector_float_norm_fp16(input_low, vec_mean, vec_multip, vec_beta)); + const auto result_high = + wrapper::vcvt<float16_t>(vector_float_norm_fp16(input_high, vec_mean, vec_multip, vec_beta)); + float16x8_t result = 
wrapper::vcombine(result_low, result_high); return result; } template <typename AccType> -void instance_normalization_nchw_fp16(const ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window) +void instance_normalization_nchw_fp16( + const ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window) { /** SIMD vector tag type. */ using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<float16_t, wrapper::traits::BitWidth::W128>; @@ -78,91 +87,105 @@ void instance_normalization_nchw_fp16(const ITensor *input, ITensor *output, flo const unsigned int elements_plane = input->info()->dimension(0) * output->info()->dimension(1); Iterator input_it(input, win); - execute_window_loop(win, [&](const Coordinates & id) - { - Window win_plane = window; - win_plane.set(Window::DimX, Window::Dimension(0, 1, 1)); - win_plane.set(Window::DimZ, Window::Dimension(id[2], id[2] + 1, 1)); - win_plane.set(3, Window::Dimension(id[3], id[3] + 1, 1)); - - Iterator input_plane_it(input, win_plane); - Iterator output_plane_it(output, win_plane); - - auto sum_h_w = static_cast<AccType>(0.f); - auto sum_squares_h_w = static_cast<AccType>(0.f); - - execute_window_loop(win_plane, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast<const float16_t *>(input_plane_it.ptr()); - - auto vec_sum_h_w = wrapper::vdup_n(static_cast<AccType>(0.f), ExactTagType{}); - auto vec_sum_squares_h_w = wrapper::vdup_n(static_cast<AccType>(0.f), ExactTagType{}); - - // Compute S elements per iteration - int x = window.x().start(); - for(; x <= (window.x().end() - window_step_x); x += window_step_x) - { - auto vec_input_val = wrapper::vloadq(input_ptr + x); - vector_float_sum_fp16(vec_sum_h_w, vec_sum_squares_h_w, vec_input_val); - } - - auto vec2_sum_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_h_w), wrapper::vgetlow(vec_sum_h_w)); - auto vec2_sum_squares_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_squares_h_w), wrapper::vgetlow(vec_sum_squares_h_w)); - - vec2_sum_h_w = wrapper::vpadd(vec2_sum_h_w, vec2_sum_h_w); - vec2_sum_squares_h_w = wrapper::vpadd(vec2_sum_squares_h_w, vec2_sum_squares_h_w); - - sum_h_w += wrapper::vgetlane(vec2_sum_h_w, 0); - sum_squares_h_w += wrapper::vgetlane(vec2_sum_squares_h_w, 0); - - // Compute left-over elements - for(; x < window.x().end(); ++x) - { - const auto value = static_cast<AccType>(*(input_ptr + x)); - sum_h_w += value; - sum_squares_h_w += value * value; - } - }, - input_plane_it, output_plane_it); - - const auto mean_h_w = sum_h_w / elements_plane; - const auto var_h_w = sum_squares_h_w / elements_plane - mean_h_w * mean_h_w; - - const auto multip_h_w = gamma / std::sqrt(var_h_w + epsilon); - const auto vec_mean_h_w = wrapper::vdup_n(static_cast<AccType>(mean_h_w), ExactTagType{}); - const auto vec_multip_h_w = wrapper::vdup_n(static_cast<AccType>(multip_h_w), ExactTagType{}); - const auto vec_beta = wrapper::vdup_n(static_cast<AccType>(beta), ExactTagType{}); - - execute_window_loop(win_plane, [&](const Coordinates &) + execute_window_loop( + win, + [&](const Coordinates &id) { - auto input_ptr = reinterpret_cast<const float16_t *>(input_plane_it.ptr()); - auto output_ptr = reinterpret_cast<float16_t *>(output_plane_it.ptr()); - - // Compute S elements per iteration - int x = window.x().start(); - for(; x <= (window.x().end() - window_step_x); x += window_step_x) - { - const auto vec_val = wrapper::vloadq(input_ptr + x); - const auto normalized_vec = vector_float_norm_fp16(vec_val, 
vec_mean_h_w, vec_multip_h_w, vec_beta); - wrapper::vstore(output_ptr + x, normalized_vec); - } - - // Compute left-over elements - for(; x < window.x().end(); ++x) - { - const auto val = static_cast<AccType>(*(input_ptr + x)); - *(output_ptr + x) = static_cast<float16_t>((val - mean_h_w) * multip_h_w + beta); - } + Window win_plane = window; + win_plane.set(Window::DimX, Window::Dimension(0, 1, 1)); + win_plane.set(Window::DimZ, Window::Dimension(id[2], id[2] + 1, 1)); + win_plane.set(3, Window::Dimension(id[3], id[3] + 1, 1)); + + Iterator input_plane_it(input, win_plane); + Iterator output_plane_it(output, win_plane); + + auto sum_h_w = static_cast<AccType>(0.f); + auto sum_squares_h_w = static_cast<AccType>(0.f); + + execute_window_loop( + win_plane, + [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast<const float16_t *>(input_plane_it.ptr()); + + auto vec_sum_h_w = wrapper::vdup_n(static_cast<AccType>(0.f), ExactTagType{}); + auto vec_sum_squares_h_w = wrapper::vdup_n(static_cast<AccType>(0.f), ExactTagType{}); + + // Compute S elements per iteration + int x = window.x().start(); + for (; x <= (window.x().end() - window_step_x); x += window_step_x) + { + auto vec_input_val = wrapper::vloadq(input_ptr + x); + vector_float_sum_fp16(vec_sum_h_w, vec_sum_squares_h_w, vec_input_val); + } + + auto vec2_sum_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_h_w), wrapper::vgetlow(vec_sum_h_w)); + auto vec2_sum_squares_h_w = + wrapper::vpadd(wrapper::vgethigh(vec_sum_squares_h_w), wrapper::vgetlow(vec_sum_squares_h_w)); + + vec2_sum_h_w = wrapper::vpadd(vec2_sum_h_w, vec2_sum_h_w); + vec2_sum_squares_h_w = wrapper::vpadd(vec2_sum_squares_h_w, vec2_sum_squares_h_w); + + sum_h_w += wrapper::vgetlane(vec2_sum_h_w, 0); + sum_squares_h_w += wrapper::vgetlane(vec2_sum_squares_h_w, 0); + + // Compute left-over elements + for (; x < window.x().end(); ++x) + { + const auto value = static_cast<AccType>(*(input_ptr + x)); + sum_h_w += value; + sum_squares_h_w += value * value; + } + }, + input_plane_it, output_plane_it); + + const auto mean_h_w = sum_h_w / elements_plane; + const auto var_h_w = sum_squares_h_w / elements_plane - mean_h_w * mean_h_w; + + const auto multip_h_w = gamma / std::sqrt(var_h_w + epsilon); + const auto vec_mean_h_w = wrapper::vdup_n(static_cast<AccType>(mean_h_w), ExactTagType{}); + const auto vec_multip_h_w = wrapper::vdup_n(static_cast<AccType>(multip_h_w), ExactTagType{}); + const auto vec_beta = wrapper::vdup_n(static_cast<AccType>(beta), ExactTagType{}); + + execute_window_loop( + win_plane, + [&](const Coordinates &) + { + auto input_ptr = reinterpret_cast<const float16_t *>(input_plane_it.ptr()); + auto output_ptr = reinterpret_cast<float16_t *>(output_plane_it.ptr()); + + // Compute S elements per iteration + int x = window.x().start(); + for (; x <= (window.x().end() - window_step_x); x += window_step_x) + { + const auto vec_val = wrapper::vloadq(input_ptr + x); + const auto normalized_vec = + vector_float_norm_fp16(vec_val, vec_mean_h_w, vec_multip_h_w, vec_beta); + wrapper::vstore(output_ptr + x, normalized_vec); + } + + // Compute left-over elements + for (; x < window.x().end(); ++x) + { + const auto val = static_cast<AccType>(*(input_ptr + x)); + *(output_ptr + x) = static_cast<float16_t>((val - mean_h_w) * multip_h_w + beta); + } + }, + input_plane_it, output_plane_it); }, - input_plane_it, output_plane_it); - }, - input_it); -} + input_it); } - -void neon_fp16_instancenorm(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, 
bool use_mixed_precision, const Window &window) +} // namespace + +void neon_fp16_instancenorm(ITensor *input, + ITensor *output, + float gamma, + float beta, + float epsilon, + bool use_mixed_precision, + const Window &window) { - if(use_mixed_precision) + if (use_mixed_precision) { return instance_normalization_nchw_fp16<float>(input, output, gamma, beta, epsilon, window); } diff --git a/src/cpu/kernels/instancenorm/generic/neon/fp32.cpp b/src/cpu/kernels/instancenorm/generic/neon/fp32.cpp index 061dd9585c..e1ca05518d 100644 --- a/src/cpu/kernels/instancenorm/generic/neon/fp32.cpp +++ b/src/cpu/kernels/instancenorm/generic/neon/fp32.cpp @@ -26,7 +26,13 @@ namespace arm_compute { namespace cpu { -void neon_fp32_instancenorm(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision, const Window &window) +void neon_fp32_instancenorm(ITensor *input, + ITensor *output, + float gamma, + float beta, + float epsilon, + bool use_mixed_precision, + const Window &window) { ARM_COMPUTE_UNUSED(use_mixed_precision); return instance_normalization_nchw<float>(input, output, gamma, beta, epsilon, window); diff --git a/src/cpu/kernels/instancenorm/generic/neon/impl.cpp b/src/cpu/kernels/instancenorm/generic/neon/impl.cpp index 483b6f568b..515079e1b5 100644 --- a/src/cpu/kernels/instancenorm/generic/neon/impl.cpp +++ b/src/cpu/kernels/instancenorm/generic/neon/impl.cpp @@ -22,6 +22,7 @@ * SOFTWARE. */ #include "src/cpu/kernels/instancenorm/generic/neon/impl.h" + #include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute @@ -38,13 +39,15 @@ void vector_float_sum(AccType &result, AccType &result_square, const InputType & } template <typename InputType, typename AccType> -InputType vector_float_norm(const InputType &inputs, const AccType &vec_mean, const AccType &vec_multip, const AccType &vec_beta) +InputType +vector_float_norm(const InputType &inputs, const AccType &vec_mean, const AccType &vec_multip, const AccType &vec_beta) { return wrapper::vadd(wrapper::vmul(wrapper::vsub(inputs, vec_mean), vec_multip), vec_beta); } template <typename T, typename AccType> -void instance_normalization_nchw(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window) +void instance_normalization_nchw( + ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window) { /** SIMD vector tag type. 
*/ using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>; @@ -58,88 +61,96 @@ void instance_normalization_nchw(ITensor *input, ITensor *output, float gamma, f const unsigned int elements_plane = input->info()->dimension(0) * output->info()->dimension(1); Iterator input_it(input, win); - execute_window_loop(win, [&](const Coordinates & id) - { - Window win_plane = window; - win_plane.set(Window::DimX, Window::Dimension(0, 1, 1)); - win_plane.set(Window::DimZ, Window::Dimension(id[2], id[2] + 1, 1)); - win_plane.set(3, Window::Dimension(id[3], id[3] + 1, 1)); - - Iterator input_plane_it(input, win_plane); - Iterator output_plane_it(output, win_plane); - - auto sum_h_w = static_cast<AccType>(0.f); - auto sum_squares_h_w = static_cast<AccType>(0.f); - - execute_window_loop(win_plane, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast<const T *>(input_plane_it.ptr()); - - auto vec_sum_h_w = wrapper::vdup_n(static_cast<AccType>(0.f), ExactTagType{}); - auto vec_sum_squares_h_w = wrapper::vdup_n(static_cast<AccType>(0.f), ExactTagType{}); - - // Compute S elements per iteration - int x = window.x().start(); - for(; x <= (window.x().end() - window_step_x); x += window_step_x) - { - auto vec_input_val = wrapper::vloadq(input_ptr + x); - vector_float_sum(vec_sum_h_w, vec_sum_squares_h_w, vec_input_val); - } - - auto vec2_sum_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_h_w), wrapper::vgetlow(vec_sum_h_w)); - auto vec2_sum_squares_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_squares_h_w), wrapper::vgetlow(vec_sum_squares_h_w)); - - vec2_sum_h_w = wrapper::vpadd(vec2_sum_h_w, vec2_sum_h_w); - vec2_sum_squares_h_w = wrapper::vpadd(vec2_sum_squares_h_w, vec2_sum_squares_h_w); - - sum_h_w += wrapper::vgetlane(vec2_sum_h_w, 0); - sum_squares_h_w += wrapper::vgetlane(vec2_sum_squares_h_w, 0); - - // Compute left-over elements - for(; x < window.x().end(); ++x) - { - const auto value = static_cast<AccType>(*(input_ptr + x)); - sum_h_w += value; - sum_squares_h_w += value * value; - } - }, - input_plane_it, output_plane_it); - - const auto mean_h_w = sum_h_w / elements_plane; - const auto var_h_w = sum_squares_h_w / elements_plane - mean_h_w * mean_h_w; - - const auto multip_h_w = gamma / std::sqrt(var_h_w + epsilon); - const auto vec_mean_h_w = wrapper::vdup_n(static_cast<AccType>(mean_h_w), ExactTagType{}); - const auto vec_multip_h_w = wrapper::vdup_n(static_cast<AccType>(multip_h_w), ExactTagType{}); - const auto vec_beta = wrapper::vdup_n(static_cast<AccType>(beta), ExactTagType{}); - - execute_window_loop(win_plane, [&](const Coordinates &) + execute_window_loop( + win, + [&](const Coordinates &id) { - auto input_ptr = reinterpret_cast<T *>(input_plane_it.ptr()); - auto output_ptr = reinterpret_cast<T *>(output_plane_it.ptr()); - - // Compute S elements per iteration - int x = window.x().start(); - //auto vec_val = wrapper::vdup_n(static_cast<T>(0.0f), ExactTagType{}); - for(; x <= (window.x().end() - window_step_x); x += window_step_x) - { - const auto vec_val = wrapper::vloadq(input_ptr + x); - const auto normalized_vec = vector_float_norm(vec_val, vec_mean_h_w, vec_multip_h_w, vec_beta); - wrapper::vstore(output_ptr + x, normalized_vec); - } - - // Compute left-over elements - for(; x < window.x().end(); ++x) - { - const auto val = static_cast<AccType>(*(input_ptr + x)); - *(output_ptr + x) = static_cast<T>((val - mean_h_w) * multip_h_w + beta); - } + Window win_plane = window; + win_plane.set(Window::DimX, 
Window::Dimension(0, 1, 1)); + win_plane.set(Window::DimZ, Window::Dimension(id[2], id[2] + 1, 1)); + win_plane.set(3, Window::Dimension(id[3], id[3] + 1, 1)); + + Iterator input_plane_it(input, win_plane); + Iterator output_plane_it(output, win_plane); + + auto sum_h_w = static_cast<AccType>(0.f); + auto sum_squares_h_w = static_cast<AccType>(0.f); + + execute_window_loop( + win_plane, + [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast<const T *>(input_plane_it.ptr()); + + auto vec_sum_h_w = wrapper::vdup_n(static_cast<AccType>(0.f), ExactTagType{}); + auto vec_sum_squares_h_w = wrapper::vdup_n(static_cast<AccType>(0.f), ExactTagType{}); + + // Compute S elements per iteration + int x = window.x().start(); + for (; x <= (window.x().end() - window_step_x); x += window_step_x) + { + auto vec_input_val = wrapper::vloadq(input_ptr + x); + vector_float_sum(vec_sum_h_w, vec_sum_squares_h_w, vec_input_val); + } + + auto vec2_sum_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_h_w), wrapper::vgetlow(vec_sum_h_w)); + auto vec2_sum_squares_h_w = + wrapper::vpadd(wrapper::vgethigh(vec_sum_squares_h_w), wrapper::vgetlow(vec_sum_squares_h_w)); + + vec2_sum_h_w = wrapper::vpadd(vec2_sum_h_w, vec2_sum_h_w); + vec2_sum_squares_h_w = wrapper::vpadd(vec2_sum_squares_h_w, vec2_sum_squares_h_w); + + sum_h_w += wrapper::vgetlane(vec2_sum_h_w, 0); + sum_squares_h_w += wrapper::vgetlane(vec2_sum_squares_h_w, 0); + + // Compute left-over elements + for (; x < window.x().end(); ++x) + { + const auto value = static_cast<AccType>(*(input_ptr + x)); + sum_h_w += value; + sum_squares_h_w += value * value; + } + }, + input_plane_it, output_plane_it); + + const auto mean_h_w = sum_h_w / elements_plane; + const auto var_h_w = sum_squares_h_w / elements_plane - mean_h_w * mean_h_w; + + const auto multip_h_w = gamma / std::sqrt(var_h_w + epsilon); + const auto vec_mean_h_w = wrapper::vdup_n(static_cast<AccType>(mean_h_w), ExactTagType{}); + const auto vec_multip_h_w = wrapper::vdup_n(static_cast<AccType>(multip_h_w), ExactTagType{}); + const auto vec_beta = wrapper::vdup_n(static_cast<AccType>(beta), ExactTagType{}); + + execute_window_loop( + win_plane, + [&](const Coordinates &) + { + auto input_ptr = reinterpret_cast<T *>(input_plane_it.ptr()); + auto output_ptr = reinterpret_cast<T *>(output_plane_it.ptr()); + + // Compute S elements per iteration + int x = window.x().start(); + //auto vec_val = wrapper::vdup_n(static_cast<T>(0.0f), ExactTagType{}); + for (; x <= (window.x().end() - window_step_x); x += window_step_x) + { + const auto vec_val = wrapper::vloadq(input_ptr + x); + const auto normalized_vec = vector_float_norm(vec_val, vec_mean_h_w, vec_multip_h_w, vec_beta); + wrapper::vstore(output_ptr + x, normalized_vec); + } + + // Compute left-over elements + for (; x < window.x().end(); ++x) + { + const auto val = static_cast<AccType>(*(input_ptr + x)); + *(output_ptr + x) = static_cast<T>((val - mean_h_w) * multip_h_w + beta); + } + }, + input_plane_it, output_plane_it); }, - input_plane_it, output_plane_it); - }, - input_it); + input_it); } -template void instance_normalization_nchw<float>(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window); +template void instance_normalization_nchw<float>( + ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/instancenorm/generic/neon/impl.h b/src/cpu/kernels/instancenorm/generic/neon/impl.h index 
0ddfcdd5ba..e1cc7487f7 100644 --- a/src/cpu/kernels/instancenorm/generic/neon/impl.h +++ b/src/cpu/kernels/instancenorm/generic/neon/impl.h @@ -32,13 +32,15 @@ namespace arm_compute namespace cpu { template <typename T, typename AccType = T> -void instance_normalization_nchw(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window); +void instance_normalization_nchw( + ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window); template <typename InputType, typename AccType = InputType> void vector_float_sum(AccType &result, AccType &result_square, const InputType &inputs); template <typename InputType, typename AccType = InputType> -InputType vector_float_norm(const InputType &inputs, const AccType &vec_mean, const AccType &vec_multip, const AccType &vec_beta); +InputType +vector_float_norm(const InputType &inputs, const AccType &vec_mean, const AccType &vec_multip, const AccType &vec_beta); } // namespace cpu } // namespace arm_compute #endif //define SRC_CORE_SVE_KERNELS_INSTANCENORM_IMPL_H diff --git a/src/cpu/kernels/instancenorm/list.h b/src/cpu/kernels/instancenorm/list.h index 54f1d3213f..51b496c41d 100644 --- a/src/cpu/kernels/instancenorm/list.h +++ b/src/cpu/kernels/instancenorm/list.h @@ -27,8 +27,9 @@ namespace arm_compute { namespace cpu { -#define DECLARE_INSTANCENORM_KERNEL(func_name) \ - void func_name(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision, const Window &window) +#define DECLARE_INSTANCENORM_KERNEL(func_name) \ + void func_name(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision, \ + const Window &window) DECLARE_INSTANCENORM_KERNEL(neon_fp32_instancenorm); DECLARE_INSTANCENORM_KERNEL(neon_fp16_instancenorm); #undef DECLARE_INSTANCENORM_KERNEL diff --git a/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp b/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp index b503a8b734..32d9ca4eac 100644 --- a/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp +++ b/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp @@ -24,18 +24,17 @@ #include "src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" + #include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" -#include "src/core/utils/AssemblyUtils.h" - #include "src/core/NEON/kernels/assembly/depthwise.hpp" +#include "src/core/utils/AssemblyUtils.h" #include "depthwise_common.hpp" - #include <arm_neon.h> namespace arm_compute @@ -54,9 +53,13 @@ constexpr unsigned int idx_channels = 0; constexpr unsigned int idx_batches = 3; template <typename TSrc, typename TWeights, typename TDst> -void create_arm_dwc(const ITensorInfo *src, const ITensorInfo *weights, ITensorInfo *dst, - const ConvolutionInfo &info, const CPUInfo &cpu_info, - std::unique_ptr<arm_conv::depthwise::IDepthwiseCommon> &kernel, std::string &_name) +void create_arm_dwc(const ITensorInfo *src, + const ITensorInfo *weights, + ITensorInfo *dst, + const ConvolutionInfo &info, + const CPUInfo &cpu_info, + std::unique_ptr<arm_conv::depthwise::IDepthwiseCommon> &kernel, + std::string &_name) { unsigned int stride_cols{}; 
unsigned int stride_rows{}; @@ -79,13 +82,13 @@ void create_arm_dwc(const ITensorInfo *src, const ITensorInfo *weights, ITensorI const arm_gemm::Activation activation = assembly_utils::map_to_arm_gemm_activation(info.act_info); - arm_conv::depthwise::DepthwiseArgs args(&cpu_info, kernel_rows, kernel_cols, stride_rows, stride_cols, dilation_rows, dilation_cols, - n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, info.depth_multiplier, - padding, activation, nullptr); + arm_conv::depthwise::DepthwiseArgs args(&cpu_info, kernel_rows, kernel_cols, stride_rows, stride_cols, + dilation_rows, dilation_cols, n_batches, src_rows, src_cols, n_channels, + dst_rows, dst_cols, info.depth_multiplier, padding, activation, nullptr); // Configure assembly pooling kernel auto dwc_kernel_asm = arm_conv::depthwise::depthwise<TSrc, TWeights, TDst>(args); - if(dwc_kernel_asm == nullptr) + if (dwc_kernel_asm == nullptr) { // Configuration not supported: Leave function unconfigured: return; @@ -96,11 +99,16 @@ void create_arm_dwc(const ITensorInfo *src, const ITensorInfo *weights, ITensorI } template <typename TSrc, typename TWeights, typename TDst> -void create_arm_dwc_quant(const ITensorInfo *src, const ITensorInfo *weights, ITensorInfo *dst, - const ConvolutionInfo &info, const CPUInfo &cpu_info, +void create_arm_dwc_quant(const ITensorInfo *src, + const ITensorInfo *weights, + ITensorInfo *dst, + const ConvolutionInfo &info, + const CPUInfo &cpu_info, std::unique_ptr<arm_conv::depthwise::IDepthwiseCommon> &kernel, - std::vector<int32_t> &multipliers, std::vector<int32_t> &right_shifts, std::vector<int32_t> &left_shifts, - std::string &_name) + std::vector<int32_t> &multipliers, + std::vector<int32_t> &right_shifts, + std::vector<int32_t> &left_shifts, + std::string &_name) { unsigned int stride_cols{}; unsigned int stride_rows{}; @@ -123,9 +131,9 @@ void create_arm_dwc_quant(const ITensorInfo *src, const ITensorInfo *weights, IT const arm_gemm::Activation activation = assembly_utils::map_to_arm_gemm_activation(info.act_info); - arm_conv::depthwise::DepthwiseArgs args(&cpu_info, kernel_rows, kernel_cols, stride_rows, stride_cols, dilation_rows, dilation_cols, - n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, info.depth_multiplier, - padding, activation, nullptr); + arm_conv::depthwise::DepthwiseArgs args(&cpu_info, kernel_rows, kernel_cols, stride_rows, stride_cols, + dilation_rows, dilation_cols, n_batches, src_rows, src_cols, n_channels, + dst_rows, dst_cols, info.depth_multiplier, padding, activation, nullptr); const auto src_qinfo = src->quantization_info().uniform(); const auto weights_qinfo = weights->quantization_info(); @@ -135,64 +143,50 @@ void create_arm_dwc_quant(const ITensorInfo *src, const ITensorInfo *weights, IT multipliers.resize(num_filters); std::vector<int32_t> dst_shifts(num_filters); - quantization::compute_quantized_multipliers_and_shifts(src, - weights, - dst, - multipliers.data(), - dst_shifts.data()); + quantization::compute_quantized_multipliers_and_shifts(src, weights, dst, multipliers.data(), dst_shifts.data()); // Quantize activation bounds int32_t min_activation = std::numeric_limits<TSrc>::lowest(); int32_t max_activation = std::numeric_limits<TSrc>::max(); - if(info.act_info.enabled()) + if (info.act_info.enabled()) { - std::tie(min_activation, max_activation) = get_quantized_activation_min_max(info.act_info, src->data_type(), dst_qinfo); + std::tie(min_activation, max_activation) = + get_quantized_activation_min_max(info.act_info, src->data_type(), 
dst_qinfo); } // Set quantization parameters for assembly kernels arm_gemm::Requantize32 requant_args{}; - if(is_data_type_quantized_per_channel(weights->data_type())) + if (is_data_type_quantized_per_channel(weights->data_type())) { left_shifts.resize(num_filters); right_shifts.resize(num_filters); bool need_left_shift = false; // Select more optimized path if left shift is not needed - for(unsigned int i = 0; i < num_filters; ++i) + for (unsigned int i = 0; i < num_filters; ++i) { left_shifts[i] = std::max(-dst_shifts[i], static_cast<int32_t>(0)); right_shifts[i] = std::min(-dst_shifts[i], static_cast<int32_t>(0)); - if(dst_shifts[i] < 0 && !need_left_shift) + if (dst_shifts[i] < 0 && !need_left_shift) { need_left_shift = true; } } - requant_args = arm_gemm::Requantize32(nullptr, - 0, - src_qinfo.offset, - weights_qinfo.uniform().offset, - dst_qinfo.offset, - (need_left_shift) ? left_shifts.data() : nullptr, - right_shifts.data(), - multipliers.data(), - static_cast<TSrc>(min_activation), - static_cast<TSrc>(max_activation)); + requant_args = arm_gemm::Requantize32(nullptr, 0, src_qinfo.offset, weights_qinfo.uniform().offset, + dst_qinfo.offset, (need_left_shift) ? left_shifts.data() : nullptr, + right_shifts.data(), multipliers.data(), + static_cast<TSrc>(min_activation), static_cast<TSrc>(max_activation)); } else { - requant_args = arm_gemm::Requantize32(nullptr, - 0, - src_qinfo.offset, - weights_qinfo.uniform().offset, - dst_qinfo.offset, - -dst_shifts[0], - multipliers[0], - static_cast<TSrc>(min_activation), - static_cast<TSrc>(max_activation)); + requant_args = arm_gemm::Requantize32(nullptr, 0, src_qinfo.offset, weights_qinfo.uniform().offset, + dst_qinfo.offset, -dst_shifts[0], multipliers[0], + static_cast<TSrc>(min_activation), static_cast<TSrc>(max_activation)); } // Configure assembly pooling kernel with requantization - auto dwc_kernel_asm = arm_conv::depthwise::depthwise<TSrc, TWeights, TDst, arm_gemm::Requantize32>(args, requant_args); - if(dwc_kernel_asm == nullptr) + auto dwc_kernel_asm = + arm_conv::depthwise::depthwise<TSrc, TWeights, TDst, arm_gemm::Requantize32>(args, requant_args); + if (dwc_kernel_asm == nullptr) { // Configuration not supported: Leave function unconfigured: return; @@ -203,18 +197,18 @@ void create_arm_dwc_quant(const ITensorInfo *src, const ITensorInfo *weights, IT } // namespace CpuDepthwiseConv2dAssemblyWrapperKernel::CpuDepthwiseConv2dAssemblyWrapperKernel() - : _kernel_asm(nullptr), - _multipliers(), - _left_shifts(), - _right_shifts(), - _name() + : _kernel_asm(nullptr), _multipliers(), _left_shifts(), _right_shifts(), _name() { } CpuDepthwiseConv2dAssemblyWrapperKernel::~CpuDepthwiseConv2dAssemblyWrapperKernel() = default; -void CpuDepthwiseConv2dAssemblyWrapperKernel::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *, ITensorInfo *dst, - const ConvolutionInfo &info, const CPUInfo &cpu_info) +void CpuDepthwiseConv2dAssemblyWrapperKernel::configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *, + ITensorInfo *dst, + const ConvolutionInfo &info, + const CPUInfo &cpu_info) { ARM_COMPUTE_UNUSED(cpu_info); ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); @@ -225,24 +219,30 @@ void CpuDepthwiseConv2dAssemblyWrapperKernel::configure(const ITensorInfo *src, _name = "CpuDepthwiseConv2dAssemblyWrapperKernel"; std::string asm_kernel_name(""); #if defined(__aarch64__) - switch(src->data_type()) + switch (src->data_type()) { case DataType::QASYMM8: - 
if(is_data_type_quantized_per_channel(weights->data_type())) + if (is_data_type_quantized_per_channel(weights->data_type())) { - create_arm_dwc_quant<uint8_t, int8_t, uint8_t>(src, weights, dst, info, cpu_info, _kernel_asm, _multipliers, _right_shifts, _left_shifts, asm_kernel_name); + create_arm_dwc_quant<uint8_t, int8_t, uint8_t>(src, weights, dst, info, cpu_info, _kernel_asm, + _multipliers, _right_shifts, _left_shifts, + asm_kernel_name); } else { - create_arm_dwc_quant<uint8_t, uint8_t, uint8_t>(src, weights, dst, info, cpu_info, _kernel_asm, _multipliers, _right_shifts, _left_shifts, asm_kernel_name); + create_arm_dwc_quant<uint8_t, uint8_t, uint8_t>(src, weights, dst, info, cpu_info, _kernel_asm, + _multipliers, _right_shifts, _left_shifts, + asm_kernel_name); } break; case DataType::QASYMM8_SIGNED: - create_arm_dwc_quant<int8_t, int8_t, int8_t>(src, weights, dst, info, cpu_info, _kernel_asm, _multipliers, _right_shifts, _left_shifts, asm_kernel_name); + create_arm_dwc_quant<int8_t, int8_t, int8_t>(src, weights, dst, info, cpu_info, _kernel_asm, _multipliers, + _right_shifts, _left_shifts, asm_kernel_name); break; #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) case DataType::F16: - create_arm_dwc<float16_t, float16_t, float16_t>(src, weights, dst, info, cpu_info, _kernel_asm, asm_kernel_name); + create_arm_dwc<float16_t, float16_t, float16_t>(src, weights, dst, info, cpu_info, _kernel_asm, + asm_kernel_name); break; #endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) case DataType::F32: @@ -255,13 +255,17 @@ void CpuDepthwiseConv2dAssemblyWrapperKernel::configure(const ITensorInfo *src, Window win = calculate_max_window(*dst, Steps()); ICpuKernel::configure(win); - if(_kernel_asm != nullptr) + if (_kernel_asm != nullptr) { _name += "/" + asm_kernel_name; } } -Status CpuDepthwiseConv2dAssemblyWrapperKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const ConvolutionInfo &info) +Status CpuDepthwiseConv2dAssemblyWrapperKernel::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *bias, + const ITensorInfo *dst, + const ConvolutionInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); @@ -269,10 +273,12 @@ Status CpuDepthwiseConv2dAssemblyWrapperKernel::validate(const ITensorInfo *src, ARM_COMPUTE_RETURN_ERROR_MSG("32-bit is not supported by assembly kernels"); #endif // !defined(__aarch64__) ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_layout() != DataLayout::NHWC, "Only NHWC is supported by assembly kernels"); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_layout() != DataLayout::NHWC, + "Only NHWC is supported by assembly kernels"); - if(is_data_type_quantized_per_channel(weights->data_type())) + if (is_data_type_quantized_per_channel(weights->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->quantization_info().scale().size()); @@ -282,12 +288,12 @@ Status CpuDepthwiseConv2dAssemblyWrapperKernel::validate(const ITensorInfo *src, ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); } - if(bias != nullptr) + if (bias 
!= nullptr) { ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != weights->dimension(0)); - if(is_data_type_quantized(src->data_type())) + if (is_data_type_quantized(src->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); } @@ -297,7 +303,7 @@ Status CpuDepthwiseConv2dAssemblyWrapperKernel::validate(const ITensorInfo *src, } } - if(dst->total_size() > 0) + if (dst->total_size() > 0) { const TensorShape dst_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape); @@ -305,17 +311,15 @@ Status CpuDepthwiseConv2dAssemblyWrapperKernel::validate(const ITensorInfo *src, } // Assembly kernels cannot work with padding greater than the kernel. - const auto &padding = info.pad_stride_info; - const auto &dilation = info.dilation; + const auto &padding = info.pad_stride_info; + const auto &dilation = info.dilation; const auto &wei_shape = weights->tensor_shape(); const auto dilated_wei_w = wei_shape[1] + (wei_shape[1] - 1) * (dilation.x() - 1); const auto dilated_wei_h = wei_shape[2] + (wei_shape[2] - 1) * (dilation.y() - 1); - ARM_COMPUTE_RETURN_ERROR_ON( - padding.pad_left() >= dilated_wei_w || padding.pad_right() >= dilated_wei_w || - padding.pad_top() >= dilated_wei_h || padding.pad_bottom() >= dilated_wei_h - ); + ARM_COMPUTE_RETURN_ERROR_ON(padding.pad_left() >= dilated_wei_w || padding.pad_right() >= dilated_wei_w || + padding.pad_top() >= dilated_wei_h || padding.pad_bottom() >= dilated_wei_h); return Status{}; } @@ -351,13 +355,12 @@ void CpuDepthwiseConv2dAssemblyWrapperKernel::run_op(ITensorPack &tensors, const const size_t ld_dst_row = ld_dst_col * (dst_shape[1] + dst_padding.top + dst_padding.bottom); const size_t ld_dst_batch = ld_dst_row * dst_shape[2]; - _kernel_asm->execute(src_ptr, ld_src_col, ld_src_row, ld_src_batch, - parameters_ptr, - dst_ptr, ld_dst_col, ld_dst_row, ld_dst_batch, - working_space, info.thread_id, info.num_threads); + _kernel_asm->execute(src_ptr, ld_src_col, ld_src_row, ld_src_batch, parameters_ptr, dst_ptr, ld_dst_col, ld_dst_row, + ld_dst_batch, working_space, info.thread_id, info.num_threads); } -void CpuDepthwiseConv2dAssemblyWrapperKernel::pack_parameters(void *parameters_ptr, void *bias_ptr, void *weights_ptr, size_t ld_weights_col, size_t ld_weight_row) +void CpuDepthwiseConv2dAssemblyWrapperKernel::pack_parameters( + void *parameters_ptr, void *bias_ptr, void *weights_ptr, size_t ld_weights_col, size_t ld_weight_row) { _kernel_asm->pack_parameters(parameters_ptr, bias_ptr, weights_ptr, ld_weights_col, ld_weight_row); } diff --git a/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h b/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h index f61cb1b09c..fadaefb999 100644 --- a/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h +++ b/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_ASSEMBLY_WRAPPER_KERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuKernel.h" #include "src/cpu/kernels/CpuKernelSelectionTypes.h" @@ -35,8 +36,8 @@ namespace depthwise { // Forward declarations class IDepthwiseCommon; -} // depthwise -} // arm_conv +} // namespace depthwise +} // namespace arm_conv namespace arm_compute { @@ -66,7 +67,12 @@ public: * @param[in] info Depthwise convolution layer 
meta-data. * @param[in] cpu_info CPU information needed to select the most appropriate kernel. */ - void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const ConvolutionInfo &info, const CPUInfo &cpu_info); + void configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *bias, + ITensorInfo *dst, + const ConvolutionInfo &info, + const CPUInfo &cpu_info); /** Indicates whether or not this function can be used to process the given parameters. * @@ -74,10 +80,14 @@ public: * * @return a status. */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const ConvolutionInfo &info); + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *bias, + const ITensorInfo *dst, + const ConvolutionInfo &info); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; /** Pack bias and weights in a storage space for the assembly kernel @@ -88,7 +98,8 @@ public: * @param[in] ld_weights_col Columns displacement for the weights tensor. * @param[in] ld_weights_row Rows displacement for the weights tensor. */ - void pack_parameters(void *parameters_ptr, void *bias_ptr, void *weights_ptr, size_t ld_weights_col, size_t ld_weights_row); + void pack_parameters( + void *parameters_ptr, void *bias_ptr, void *weights_ptr, size_t ld_weights_col, size_t ld_weights_row); /** Get the amount of storage space required for the rearranged weights and bias. * diff --git a/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp b/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp index 10ff4183c0..a161c800fd 100644 --- a/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp +++ b/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp @@ -22,14 +22,16 @@ * SOFTWARE. 
*/ #include "src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h" + #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" + #include "src/core/CPP/Validate.h" -#include "src/core/NEON/INEKernel.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/INEKernel.h" #include <arm_neon.h> @@ -41,7 +43,10 @@ namespace kernels { using namespace arm_compute::misc::shape_calculator; -void CpuPool2dAssemblyWrapperKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info) +void CpuPool2dAssemblyWrapperKernel::configure(const ITensorInfo *src, + ITensorInfo *dst, + const PoolingLayerInfo &info, + const CPUInfo &cpu_info) { ARM_COMPUTE_UNUSED(cpu_info); ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); @@ -52,10 +57,10 @@ void CpuPool2dAssemblyWrapperKernel::configure(const ITensorInfo *src, ITensorIn #if defined(__aarch64__) const bool requantize = src->quantization_info() != dst->quantization_info(); - switch(src->data_type()) + switch (src->data_type()) { case DataType::QASYMM8: - if(requantize) + if (requantize) { create_arm_pooling_requant<uint8_t, uint8_t>(src, dst, info, cpu_info); } @@ -65,7 +70,7 @@ void CpuPool2dAssemblyWrapperKernel::configure(const ITensorInfo *src, ITensorIn } break; case DataType::QASYMM8_SIGNED: - if(requantize) + if (requantize) { create_arm_pooling_requant<int8_t, int8_t>(src, dst, info, cpu_info); } @@ -91,7 +96,8 @@ void CpuPool2dAssemblyWrapperKernel::configure(const ITensorInfo *src, ITensorIn INEKernel::configure(win); } -Status CpuPool2dAssemblyWrapperKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info) +Status +CpuPool2dAssemblyWrapperKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); @@ -99,43 +105,52 @@ Status CpuPool2dAssemblyWrapperKernel::validate(const ITensorInfo *src, const IT ARM_COMPUTE_RETURN_ERROR_MSG("32-bit is not supported by assembly kernels"); #endif /* __aarch64__ */ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((src->data_layout() != DataLayout::NHWC) || (info.data_layout != DataLayout::NHWC), "Only NHWC is supported by assembly kernels"); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((src->data_layout() != DataLayout::NHWC) || (info.data_layout != DataLayout::NHWC), + "Only NHWC is supported by assembly kernels"); ARM_COMPUTE_RETURN_ERROR_ON_MSG((info.pool_type != PoolingType::AVG) && (info.pool_type != PoolingType::MAX), "Only AVG and MAX pooling are supported by assembly kernels"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_pool_region_entirely_outside_input(info), "Pooling region that is entirely outside input tensor is unsupported by assembly kernels"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + is_pool_region_entirely_outside_input(info), + "Pooling region that is entirely outside input tensor is unsupported by assembly kernels"); - if(dst->total_size() > 0) + if (dst->total_size() > 0) { 
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); const auto src_qinfo = src->quantization_info().uniform(); const auto dst_qinfo = dst->quantization_info().uniform(); - if(src_qinfo != dst_qinfo) + if (src_qinfo != dst_qinfo) { const float multiplier = src_qinfo.scale / dst_qinfo.scale; int32_t dst_multiplier{}; int32_t dst_shift{}; - ARM_COMPUTE_RETURN_ERROR_ON(quantization::calculate_quantized_multiplier(multiplier, &dst_multiplier, &dst_shift)); + ARM_COMPUTE_RETURN_ERROR_ON( + quantization::calculate_quantized_multiplier(multiplier, &dst_multiplier, &dst_shift)); } else { - if(src->data_type() == DataType::QASYMM8) + if (src->data_type() == DataType::QASYMM8) { const bool has_padding = info.pad_stride_info.has_padding(); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!info.exclude_padding && has_padding, "Assembly kernels do not support padding for QASYMM8 with same src/dst quantization info"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + !info.exclude_padding && has_padding, + "Assembly kernels do not support padding for QASYMM8 with same src/dst quantization info"); } } } else { - if(src->data_type() == DataType::QASYMM8) + if (src->data_type() == DataType::QASYMM8) { // If dst is not configured, the quantization info are the same const bool has_padding = info.pad_stride_info.has_padding(); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!info.exclude_padding && has_padding, "Assembly kernels do not support padding for QASYMM8 with same src/dst quantization info"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + !info.exclude_padding && has_padding, + "Assembly kernels do not support padding for QASYMM8 with same src/dst quantization info"); } } return Status{}; @@ -154,9 +169,10 @@ void CpuPool2dAssemblyWrapperKernel::run_op(ITensorPack &tensors, const Window & ITensor *dst = tensors.get_tensor(TensorType::ACL_DST); ITensor *workspace = tensors.get_tensor(TensorType::ACL_INT_0); - const auto in_ptr = src->buffer() + src->info()->offset_first_element_in_bytes(); - auto out_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes(); - auto working_space = (workspace == nullptr) ? nullptr : workspace->buffer() + workspace->info()->offset_first_element_in_bytes(); + const auto in_ptr = src->buffer() + src->info()->offset_first_element_in_bytes(); + auto out_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes(); + auto working_space = + (workspace == nullptr) ? 
nullptr : workspace->buffer() + workspace->info()->offset_first_element_in_bytes(); const auto src_shape = src->info()->tensor_shape(); const auto dst_shape = dst->info()->tensor_shape(); @@ -170,8 +186,7 @@ void CpuPool2dAssemblyWrapperKernel::run_op(ITensorPack &tensors, const Window & const size_t ld_dst_row = ld_dst_col * (dst_shape[1] + dst_padding.top + dst_padding.bottom); const size_t ld_dst_batch = ld_dst_row * dst_shape[2]; - _kernel_asm->execute(in_ptr, ld_src_col, ld_src_row, ld_src_batch, - out_ptr, ld_dst_col, ld_dst_row, ld_dst_batch, + _kernel_asm->execute(in_ptr, ld_src_col, ld_src_row, ld_src_batch, out_ptr, ld_dst_col, ld_dst_row, ld_dst_batch, working_space, info.thread_id, info.num_threads); } @@ -186,9 +201,14 @@ bool CpuPool2dAssemblyWrapperKernel::is_configured() const } template <typename Typesrc, typename Typedst> -void CpuPool2dAssemblyWrapperKernel::create_arm_pooling(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info) +void CpuPool2dAssemblyWrapperKernel::create_arm_pooling(const ITensorInfo *src, + ITensorInfo *dst, + const PoolingLayerInfo &info, + const CPUInfo &cpu_info) { - const arm_conv::pooling::PoolingType pool_type = (info.pool_type == PoolingType::AVG) ? arm_conv::pooling::PoolingType::AVERAGE : arm_conv::pooling::PoolingType::MAX; + const arm_conv::pooling::PoolingType pool_type = (info.pool_type == PoolingType::AVG) + ? arm_conv::pooling::PoolingType::AVERAGE + : arm_conv::pooling::PoolingType::MAX; arm_conv::pooling::PoolingWindow window{}; window.cols = static_cast<unsigned int>(info.pool_size.x()); @@ -197,7 +217,8 @@ void CpuPool2dAssemblyWrapperKernel::create_arm_pooling(const ITensorInfo *src, arm_conv::pooling::PoolingStride stride{}; std::tie(stride.cols, stride.rows) = info.pad_stride_info.stride(); - const arm_conv::pooling::PaddingValues padding{ info.pad_stride_info.pad_left(), info.pad_stride_info.pad_top(), info.pad_stride_info.pad_right(), info.pad_stride_info.pad_bottom() }; + const arm_conv::pooling::PaddingValues padding{info.pad_stride_info.pad_left(), info.pad_stride_info.pad_top(), + info.pad_stride_info.pad_right(), info.pad_stride_info.pad_bottom()}; constexpr unsigned int idx_width = 1; constexpr unsigned int idx_height = 2; @@ -211,11 +232,12 @@ void CpuPool2dAssemblyWrapperKernel::create_arm_pooling(const ITensorInfo *src, const unsigned int dst_rows = dst->dimension(idx_height); const unsigned int dst_cols = dst->dimension(idx_width); - arm_conv::pooling::PoolingArgs args(&cpu_info, pool_type, window, stride, info.exclude_padding, n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, padding, nullptr); + arm_conv::pooling::PoolingArgs args(&cpu_info, pool_type, window, stride, info.exclude_padding, n_batches, src_rows, + src_cols, n_channels, dst_rows, dst_cols, padding, nullptr); // Configure assembly pooling kernel auto pooling_kernel_asm = arm_conv::pooling::pooling<Typesrc, Typedst>(args); - if(pooling_kernel_asm == nullptr) + if (pooling_kernel_asm == nullptr) { // Configuration not supported: Leave function unconfigured: return; @@ -225,9 +247,14 @@ void CpuPool2dAssemblyWrapperKernel::create_arm_pooling(const ITensorInfo *src, } template <typename Typesrc, typename Typedst> -void CpuPool2dAssemblyWrapperKernel::create_arm_pooling_requant(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info) +void CpuPool2dAssemblyWrapperKernel::create_arm_pooling_requant(const ITensorInfo *src, + ITensorInfo *dst, + const 
PoolingLayerInfo &info, + const CPUInfo &cpu_info) { - const arm_conv::pooling::PoolingType pool_type = (info.pool_type == PoolingType::AVG) ? arm_conv::pooling::PoolingType::AVERAGE : arm_conv::pooling::PoolingType::MAX; + const arm_conv::pooling::PoolingType pool_type = (info.pool_type == PoolingType::AVG) + ? arm_conv::pooling::PoolingType::AVERAGE + : arm_conv::pooling::PoolingType::MAX; arm_conv::pooling::PoolingWindow window{}; window.cols = static_cast<unsigned int>(info.pool_size.x()); @@ -236,7 +263,8 @@ void CpuPool2dAssemblyWrapperKernel::create_arm_pooling_requant(const ITensorInf arm_conv::pooling::PoolingStride stride{}; std::tie(stride.cols, stride.rows) = info.pad_stride_info.stride(); - const arm_conv::pooling::PaddingValues padding{ info.pad_stride_info.pad_left(), info.pad_stride_info.pad_top(), info.pad_stride_info.pad_right(), info.pad_stride_info.pad_bottom() }; + const arm_conv::pooling::PaddingValues padding{info.pad_stride_info.pad_left(), info.pad_stride_info.pad_top(), + info.pad_stride_info.pad_right(), info.pad_stride_info.pad_bottom()}; constexpr unsigned int idx_width = 1; constexpr unsigned int idx_height = 2; @@ -250,7 +278,8 @@ void CpuPool2dAssemblyWrapperKernel::create_arm_pooling_requant(const ITensorInf const unsigned int dst_rows = dst->dimension(idx_height); const unsigned int dst_cols = dst->dimension(idx_width); - arm_conv::pooling::PoolingArgs args(&cpu_info, pool_type, window, stride, info.exclude_padding, n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, padding, nullptr); + arm_conv::pooling::PoolingArgs args(&cpu_info, pool_type, window, stride, info.exclude_padding, n_batches, src_rows, + src_cols, n_channels, dst_rows, dst_cols, padding, nullptr); const auto src_qinfo = src->quantization_info().uniform(); const auto dst_qinfo = dst->quantization_info().uniform(); @@ -260,15 +289,15 @@ void CpuPool2dAssemblyWrapperKernel::create_arm_pooling_requant(const ITensorInf int32_t dst_shift{}; quantization::calculate_quantized_multiplier(multiplier, &dst_multiplier, &dst_shift); - const arm_conv::pooling::Requantize32 requant_args(src_qinfo.offset, - dst_qinfo.offset, + const arm_conv::pooling::Requantize32 requant_args(src_qinfo.offset, dst_qinfo.offset, dst_shift, // left shift 0, // right shift dst_multiplier); // Configure assembly pooling kernel with requantization - auto pooling_kernel_asm = arm_conv::pooling::pooling<Typesrc, Typedst, arm_conv::pooling::Requantize32>(args, requant_args); - if(pooling_kernel_asm == nullptr) + auto pooling_kernel_asm = + arm_conv::pooling::pooling<Typesrc, Typedst, arm_conv::pooling::Requantize32>(args, requant_args); + if (pooling_kernel_asm == nullptr) { // Configuration not supported: Leave function unconfigured: return; diff --git a/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h b/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h index 8713d5c54d..b4ff1e6f2d 100644 --- a/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h +++ b/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h @@ -25,8 +25,9 @@ #define ARM_COMPUTE_CPU_POOL2D_ASSEMBLY_WRAPPER_KERNEL_H #include "arm_compute/core/Types.h" -#include "src/core/NEON/kernels/assembly/pooling.hpp" + #include "src/core/common/Macros.h" +#include "src/core/NEON/kernels/assembly/pooling.hpp" #include "src/cpu/ICpuKernel.h" #include "src/cpu/kernels/CpuKernelSelectionTypes.h" @@ -101,7 +102,8 @@ private: * @param[in] info Pooling layer meta-data. 
*/ template <typename Typesrc, typename Typedst> - void create_arm_pooling(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info); + void + create_arm_pooling(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info); /** Helper function to create the assembly kernel with requantization support * @@ -110,9 +112,12 @@ private: * @param[in] info Pooling layer meta-data. */ template <typename Typesrc, typename Typedst> - void create_arm_pooling_requant(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info); + void create_arm_pooling_requant(const ITensorInfo *src, + ITensorInfo *dst, + const PoolingLayerInfo &info, + const CPUInfo &cpu_info); - std::unique_ptr<arm_conv::pooling::IPoolingCommon> _kernel_asm{ nullptr }; + std::unique_ptr<arm_conv::pooling::IPoolingCommon> _kernel_asm{nullptr}; /** Return minimum workload size of the relevant kernel * diff --git a/src/cpu/kernels/l2normlayer/generic/neon/fp16.cpp b/src/cpu/kernels/l2normlayer/generic/neon/fp16.cpp index 661c3d7f46..6c6527de06 100644 --- a/src/cpu/kernels/l2normlayer/generic/neon/fp16.cpp +++ b/src/cpu/kernels/l2normlayer/generic/neon/fp16.cpp @@ -32,13 +32,15 @@ namespace arm_compute { namespace cpu { -void neon_fp16_l2_normalize_x(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t unused_axis) +void neon_fp16_l2_normalize_x( + const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t unused_axis) { ARM_COMPUTE_UNUSED(unused_axis); return l2_normalize_x<float16_t, 8>(in, sum, out, epsilon, window); } -void neon_fp16_l2_normalize_yz(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis) +void neon_fp16_l2_normalize_yz( + const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis) { return l2_normalize_yz<float16_t, 8>(in, sum, out, epsilon, window, axis); } diff --git a/src/cpu/kernels/l2normlayer/generic/neon/fp32.cpp b/src/cpu/kernels/l2normlayer/generic/neon/fp32.cpp index be32bdc4fa..520877068c 100644 --- a/src/cpu/kernels/l2normlayer/generic/neon/fp32.cpp +++ b/src/cpu/kernels/l2normlayer/generic/neon/fp32.cpp @@ -22,21 +22,23 @@ * SOFTWARE. 
*/ -#include "src/cpu/kernels/l2normlayer/generic/neon/impl.h" - #include "arm_compute/core/Helpers.h" +#include "src/cpu/kernels/l2normlayer/generic/neon/impl.h" + namespace arm_compute { namespace cpu { -void neon_fp32_l2_normalize_x(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t unused_axis) +void neon_fp32_l2_normalize_x( + const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t unused_axis) { ARM_COMPUTE_UNUSED(unused_axis); return l2_normalize_x<float, 4>(in, sum, out, epsilon, window); } -void neon_fp32_l2_normalize_yz(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis) +void neon_fp32_l2_normalize_yz( + const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis) { return l2_normalize_yz<float, 4>(in, sum, out, epsilon, window, axis); } diff --git a/src/cpu/kernels/l2normlayer/generic/neon/impl.h b/src/cpu/kernels/l2normlayer/generic/neon/impl.h index a06cdd33d3..6bd19299b7 100644 --- a/src/cpu/kernels/l2normlayer/generic/neon/impl.h +++ b/src/cpu/kernels/l2normlayer/generic/neon/impl.h @@ -26,8 +26,9 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "src/core/NEON/wrapper/wrapper.h" + #include "src/core/common/Registrars.h" +#include "src/core/NEON/wrapper/wrapper.h" #include <cstddef> @@ -51,33 +52,36 @@ void l2_normalize_x(const ITensor *in, const ITensor *sum, ITensor *out, float e Iterator sum_it(sum, win_collapsed); Iterator output_it(out, win_collapsed); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast<const T *>(input_it.ptr()); - const auto out_ptr = reinterpret_cast<T *>(output_it.ptr()); - - const T sum_value = *reinterpret_cast<const T *>(sum_it.ptr()); - const T norm_value = static_cast<T>(1.f) / std::sqrt(std::max(sum_value, static_cast<T>(epsilon))); - const auto vec_norm_value = wrapper::vdup_n(norm_value, ExactTagType{}); - - // Compute elements over vector steps - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { - wrapper::vstore(out_ptr + x, wrapper::vmul(wrapper::vloadq(in_ptr + x), vec_norm_value)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - out_ptr[x] = in_ptr[x] * norm_value; - } - }, - input_it, sum_it, output_it); + const auto in_ptr = reinterpret_cast<const T *>(input_it.ptr()); + const auto out_ptr = reinterpret_cast<T *>(output_it.ptr()); + + const T sum_value = *reinterpret_cast<const T *>(sum_it.ptr()); + const T norm_value = static_cast<T>(1.f) / std::sqrt(std::max(sum_value, static_cast<T>(epsilon))); + const auto vec_norm_value = wrapper::vdup_n(norm_value, ExactTagType{}); + + // Compute elements over vector steps + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + wrapper::vstore(out_ptr + x, wrapper::vmul(wrapper::vloadq(in_ptr + x), vec_norm_value)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + out_ptr[x] = in_ptr[x] * norm_value; + } + }, + input_it, sum_it, output_it); } template <typename T, int S> -void l2_normalize_yz(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis) +void l2_normalize_yz( + const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis) { 
using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type; @@ -97,28 +101,30 @@ void l2_normalize_yz(const ITensor *in, const ITensor *sum, ITensor *out, float const auto vec_eps = wrapper::vdup_n(static_cast<T>(epsilon), ExactTagType{}); - execute_window_loop(win, [&](const Coordinates &) - { - const auto in_ptr = reinterpret_cast<const T *>(input_it.ptr()); - const auto sum_ptr = reinterpret_cast<const T *>(sum_it.ptr()); - const auto out_ptr = reinterpret_cast<T *>(output_it.ptr()); - - // Compute elements over vector steps - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto vec_norm_value = wrapper::vinvsqrt(wrapper::vmax(wrapper::vloadq(sum_ptr + x), vec_eps)); - wrapper::vstore(out_ptr + x, wrapper::vmul(wrapper::vloadq(in_ptr + x), vec_norm_value)); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) + execute_window_loop( + win, + [&](const Coordinates &) { - const T norm_value = static_cast<T>(1.f) / std::sqrt(std::max(sum_ptr[x], static_cast<T>(epsilon))); - out_ptr[x] = in_ptr[x] * norm_value; - } - }, - input_it, sum_it, output_it); + const auto in_ptr = reinterpret_cast<const T *>(input_it.ptr()); + const auto sum_ptr = reinterpret_cast<const T *>(sum_it.ptr()); + const auto out_ptr = reinterpret_cast<T *>(output_it.ptr()); + + // Compute elements over vector steps + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vec_norm_value = wrapper::vinvsqrt(wrapper::vmax(wrapper::vloadq(sum_ptr + x), vec_eps)); + wrapper::vstore(out_ptr + x, wrapper::vmul(wrapper::vloadq(in_ptr + x), vec_norm_value)); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const T norm_value = static_cast<T>(1.f) / std::sqrt(std::max(sum_ptr[x], static_cast<T>(epsilon))); + out_ptr[x] = in_ptr[x] * norm_value; + } + }, + input_it, sum_it, output_it); } } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/l2normlayer/list.h b/src/cpu/kernels/l2normlayer/list.h index 2bad7f54f5..e2a879d06e 100644 --- a/src/cpu/kernels/l2normlayer/list.h +++ b/src/cpu/kernels/l2normlayer/list.h @@ -27,8 +27,9 @@ namespace arm_compute { namespace cpu { -#define DECLARE_L2NORMLAYER_KERNEL(func_name) \ - void func_name(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis) +#define DECLARE_L2NORMLAYER_KERNEL(func_name) \ + void func_name(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, \ + size_t axis) DECLARE_L2NORMLAYER_KERNEL(neon_fp16_l2_normalize_x); DECLARE_L2NORMLAYER_KERNEL(neon_fp16_l2_normalize_yz); diff --git a/src/cpu/kernels/lut/generic/neon/u8.cpp b/src/cpu/kernels/lut/generic/neon/u8.cpp index 8ab647bfee..5516f5b33d 100644 --- a/src/cpu/kernels/lut/generic/neon/u8.cpp +++ b/src/cpu/kernels/lut/generic/neon/u8.cpp @@ -32,376 +32,374 @@ namespace cpu #ifdef __aarch64__ void lut_u8_neon( - const uint8_t *table, - size_t num_strings, - size_t string_length, - const uint8_t *const *input, - uint8_t *const *output) + const uint8_t *table, size_t num_strings, size_t string_length, const uint8_t *const *input, uint8_t *const *output) { - __asm__ __volatile__( - "ldr q16, [%x[table], #0x0]\n" - "ldr q17, [%x[table], #0x10]\n" - "mov x23, #0x0\n" - "ldr q18, [%x[table], #0x20]\n" - "ldr q19, [%x[table], #0x30]\n" - "ldr q20, [%x[table], #0x40]\n" - "ldr q21, [%x[table], #0x50]\n" - "ldr q22, [%x[table], #0x60]\n" - "ldr q23, [%x[table], 
#0x70]\n" - "ldr q24, [%x[table], #0x80]\n" - "ldr q25, [%x[table], #0x90]\n" - "ldr q26, [%x[table], #0xa0]\n" - "ldr q27, [%x[table], #0xb0]\n" - "ldr q28, [%x[table], #0xc0]\n" - "ldr q29, [%x[table], #0xd0]\n" - "ldr q30, [%x[table], #0xe0]\n" - "ldr q31, [%x[table], #0xf0]\n" - "1:" // string loop - "ldr x22, [%x[input], x23, LSL #0x3]\n" - "ldr x21, [%x[output], x23, LSL #0x3]\n" - "movi v11.16b, #0x40\n" - "movi v10.16b, #0x80\n" - "movi v9.16b, #0xc0\n" - "mov x20, %x[string_length]\n" - "2:" // 4 rounds: width loop - "cmp x20, #0x30\n" - "bge 27f\n" - "tbz x20, #5, 10f\n" - "ld1 { v8.16b }, [x22], #0x10\n" - "ld1 { v13.16b }, [x22], #0x10\n" - "tbz x20, #3, 6f\n" - "ldr d12, [x22], #0x8\n" - "tbz x20, #2, 4f\n" - "ld1 { v12.s }[2], [x22], #0x4\n" - "tbz x20, #1, 3f\n" - "ld1 { v12.h }[6], [x22], #0x2\n" - "tbz x20, #0, 26f\n" - "ld1 { v12.b }[14], [x22]\n" - "b 26f\n" - "3:" // 4 rounds: Partial load: partial_1_44 - "tbz x20, #0, 26f\n" - "ld1 { v12.b }[12], [x22]\n" - "b 26f\n" - "4:" // 4 rounds: Partial load: partial_2_40 - "tbz x20, #1, 5f\n" - "ld1 { v12.h }[4], [x22], #0x2\n" - "tbz x20, #0, 26f\n" - "ld1 { v12.b }[10], [x22]\n" - "b 26f\n" - "5:" // 4 rounds: Partial load: partial_1_40 - "tbz x20, #0, 26f\n" - "ld1 { v12.b }[8], [x22]\n" - "b 26f\n" - "6:" // 4 rounds: Partial load: partial_4_32 - "tbz x20, #2, 8f\n" - "ldr s12, [x22], #0x4\n" - "tbz x20, #1, 7f\n" - "ld1 { v12.h }[2], [x22], #0x2\n" - "tbz x20, #0, 26f\n" - "ld1 { v12.b }[6], [x22]\n" - "b 26f\n" - "7:" // 4 rounds: Partial load: partial_1_36 - "tbz x20, #0, 26f\n" - "ld1 { v12.b }[4], [x22]\n" - "b 26f\n" - "8:" // 4 rounds: Partial load: partial_2_32 - "tbz x20, #1, 9f\n" - "ldr h12, [x22], #0x2\n" - "tbz x20, #0, 26f\n" - "ld1 { v12.b }[2], [x22]\n" - "b 26f\n" - "9:" // 4 rounds: Partial load: partial_1_32 - "tbz x20, #0, 26f\n" - "ldr b12, [x22, #0x0]\n" - "b 26f\n" - "10:" // 4 rounds: Partial load: partial_16_0 - "tbz x20, #4, 18f\n" - "ld1 { v8.16b }, [x22], #0x10\n" - "tbz x20, #3, 14f\n" - "ldr d13, [x22], #0x8\n" - "tbz x20, #2, 12f\n" - "ld1 { v13.s }[2], [x22], #0x4\n" - "tbz x20, #1, 11f\n" - "ld1 { v13.h }[6], [x22], #0x2\n" - "tbz x20, #0, 26f\n" - "ld1 { v13.b }[14], [x22]\n" - "b 26f\n" - "11:" // 4 rounds: Partial load: partial_1_28 - "tbz x20, #0, 26f\n" - "ld1 { v13.b }[12], [x22]\n" - "b 26f\n" - "12:" // 4 rounds: Partial load: partial_2_24 - "tbz x20, #1, 13f\n" - "ld1 { v13.h }[4], [x22], #0x2\n" - "tbz x20, #0, 26f\n" - "ld1 { v13.b }[10], [x22]\n" - "b 26f\n" - "13:" // 4 rounds: Partial load: partial_1_24 - "tbz x20, #0, 26f\n" - "ld1 { v13.b }[8], [x22]\n" - "b 26f\n" - "14:" // 4 rounds: Partial load: partial_4_16 - "tbz x20, #2, 16f\n" - "ldr s13, [x22], #0x4\n" - "tbz x20, #1, 15f\n" - "ld1 { v13.h }[2], [x22], #0x2\n" - "tbz x20, #0, 26f\n" - "ld1 { v13.b }[6], [x22]\n" - "b 26f\n" - "15:" // 4 rounds: Partial load: partial_1_20 - "tbz x20, #0, 26f\n" - "ld1 { v13.b }[4], [x22]\n" - "b 26f\n" - "16:" // 4 rounds: Partial load: partial_2_16 - "tbz x20, #1, 17f\n" - "ldr h13, [x22], #0x2\n" - "tbz x20, #0, 26f\n" - "ld1 { v13.b }[2], [x22]\n" - "b 26f\n" - "17:" // 4 rounds: Partial load: partial_1_16 - "tbz x20, #0, 26f\n" - "ldr b13, [x22, #0x0]\n" - "b 26f\n" - "18:" // 4 rounds: Partial load: partial_8_0 - "tbz x20, #3, 22f\n" - "ldr d8, [x22], #0x8\n" - "tbz x20, #2, 20f\n" - "ld1 { v8.s }[2], [x22], #0x4\n" - "tbz x20, #1, 19f\n" - "ld1 { v8.h }[6], [x22], #0x2\n" - "tbz x20, #0, 26f\n" - "ld1 { v8.b }[14], [x22]\n" - "b 26f\n" - "19:" // 4 rounds: Partial load: 
partial_1_12 - "tbz x20, #0, 26f\n" - "ld1 { v8.b }[12], [x22]\n" - "b 26f\n" - "20:" // 4 rounds: Partial load: partial_2_8 - "tbz x20, #1, 21f\n" - "ld1 { v8.h }[4], [x22], #0x2\n" - "tbz x20, #0, 26f\n" - "ld1 { v8.b }[10], [x22]\n" - "b 26f\n" - "21:" // 4 rounds: Partial load: partial_1_8 - "tbz x20, #0, 26f\n" - "ld1 { v8.b }[8], [x22]\n" - "b 26f\n" - "22:" // 4 rounds: Partial load: partial_4_0 - "tbz x20, #2, 24f\n" - "ldr s8, [x22], #0x4\n" - "tbz x20, #1, 23f\n" - "ld1 { v8.h }[2], [x22], #0x2\n" - "tbz x20, #0, 26f\n" - "ld1 { v8.b }[6], [x22]\n" - "b 26f\n" - "23:" // 4 rounds: Partial load: partial_1_4 - "tbz x20, #0, 26f\n" - "ld1 { v8.b }[4], [x22]\n" - "b 26f\n" - "24:" // 4 rounds: Partial load: partial_2_0 - "tbz x20, #1, 25f\n" - "ldr h8, [x22], #0x2\n" - "tbz x20, #0, 26f\n" - "ld1 { v8.b }[2], [x22]\n" - "b 26f\n" - "25:" // 4 rounds: Partial load: partial_1_0 - "ldr b8, [x22, #0x0]\n" - "26:" // 4 rounds: Partial load: Done - "b 28f\n" - "27:" // 4 rounds: Full load - "ldr q8, [x22, #0x0]\n" - "ldr q13, [x22, #0x10]\n" - "ldr q12, [x22, #0x20]\n" - "add x22, x22, #0x30\n" - "28:" // 4 rounds: Load done - "sub v0.16b, v8.16b, v11.16b\n" - "sub v7.16b, v8.16b, v10.16b\n" - "tbl v0.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v0.16b\n" - "sub v6.16b, v8.16b, v9.16b\n" - "sub v5.16b, v13.16b, v11.16b\n" - "tbl v8.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v8.16b\n" - "sub v4.16b, v13.16b, v10.16b\n" - "sub v3.16b, v13.16b, v9.16b\n" - "tbl v7.16b, { v24.16b, v25.16b, v26.16b, v27.16b }, v7.16b\n" - "sub v2.16b, v12.16b, v11.16b\n" - "sub v1.16b, v12.16b, v10.16b\n" - "tbl v6.16b, { v28.16b, v29.16b, v30.16b, v31.16b }, v6.16b\n" - "tbl v13.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v13.16b\n" - "tbl v5.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v5.16b\n" - "orr v8.16b, v8.16b, v0.16b\n" - "sub v0.16b, v12.16b, v9.16b\n" - "tbl v4.16b, { v24.16b, v25.16b, v26.16b, v27.16b }, v4.16b\n" - "tbl v3.16b, { v28.16b, v29.16b, v30.16b, v31.16b }, v3.16b\n" - "tbl v12.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v12.16b\n" - "tbl v2.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v2.16b\n" - "orr v7.16b, v7.16b, v6.16b\n" - "tbl v1.16b, { v24.16b, v25.16b, v26.16b, v27.16b }, v1.16b\n" - "tbl v0.16b, { v28.16b, v29.16b, v30.16b, v31.16b }, v0.16b\n" - "orr v13.16b, v13.16b, v5.16b\n" - "orr v4.16b, v4.16b, v3.16b\n" - "orr v12.16b, v12.16b, v2.16b\n" - "cmp x20, #0x30\n" - "orr v1.16b, v1.16b, v0.16b\n" - "orr v8.16b, v8.16b, v7.16b\n" - "orr v13.16b, v13.16b, v4.16b\n" - "orr v12.16b, v12.16b, v1.16b\n" - "bge 53f\n" - "tbz x20, #5, 36f\n" - "st1 { v8.16b }, [x21], #0x10\n" - "st1 { v13.16b }, [x21], #0x10\n" - "tbz x20, #3, 32f\n" - "str d12, [x21], #0x8\n" - "tbz x20, #2, 30f\n" - "st1 { v12.s }[2], [x21], #0x4\n" - "tbz x20, #1, 29f\n" - "st1 { v12.h }[6], [x21], #0x2\n" - "tbz x20, #0, 52f\n" - "st1 { v12.b }[14], [x21]\n" - "b 52f\n" - "29:" // 4 rounds: Partial writeback: partial_1_44 - "tbz x20, #0, 52f\n" - "st1 { v12.b }[12], [x21]\n" - "b 52f\n" - "30:" // 4 rounds: Partial writeback: partial_2_40 - "tbz x20, #1, 31f\n" - "st1 { v12.h }[4], [x21], #0x2\n" - "tbz x20, #0, 52f\n" - "st1 { v12.b }[10], [x21]\n" - "b 52f\n" - "31:" // 4 rounds: Partial writeback: partial_1_40 - "tbz x20, #0, 52f\n" - "st1 { v12.b }[8], [x21]\n" - "b 52f\n" - "32:" // 4 rounds: Partial writeback: partial_4_32 - "tbz x20, #2, 34f\n" - "str s12, [x21], #0x4\n" - "tbz x20, #1, 33f\n" - "st1 { v12.h }[2], [x21], #0x2\n" - "tbz x20, #0, 52f\n" - "st1 { v12.b }[6], [x21]\n" - "b 52f\n" - "33:" 
// 4 rounds: Partial writeback: partial_1_36 - "tbz x20, #0, 52f\n" - "st1 { v12.b }[4], [x21]\n" - "b 52f\n" - "34:" // 4 rounds: Partial writeback: partial_2_32 - "tbz x20, #1, 35f\n" - "str h12, [x21], #0x2\n" - "tbz x20, #0, 52f\n" - "st1 { v12.b }[2], [x21]\n" - "b 52f\n" - "35:" // 4 rounds: Partial writeback: partial_1_32 - "tbz x20, #0, 52f\n" - "str b12, [x21, #0x0]\n" - "b 52f\n" - "36:" // 4 rounds: Partial writeback: partial_16_0 - "tbz x20, #4, 44f\n" - "st1 { v8.16b }, [x21], #0x10\n" - "tbz x20, #3, 40f\n" - "str d13, [x21], #0x8\n" - "tbz x20, #2, 38f\n" - "st1 { v13.s }[2], [x21], #0x4\n" - "tbz x20, #1, 37f\n" - "st1 { v13.h }[6], [x21], #0x2\n" - "tbz x20, #0, 52f\n" - "st1 { v13.b }[14], [x21]\n" - "b 52f\n" - "37:" // 4 rounds: Partial writeback: partial_1_28 - "tbz x20, #0, 52f\n" - "st1 { v13.b }[12], [x21]\n" - "b 52f\n" - "38:" // 4 rounds: Partial writeback: partial_2_24 - "tbz x20, #1, 39f\n" - "st1 { v13.h }[4], [x21], #0x2\n" - "tbz x20, #0, 52f\n" - "st1 { v13.b }[10], [x21]\n" - "b 52f\n" - "39:" // 4 rounds: Partial writeback: partial_1_24 - "tbz x20, #0, 52f\n" - "st1 { v13.b }[8], [x21]\n" - "b 52f\n" - "40:" // 4 rounds: Partial writeback: partial_4_16 - "tbz x20, #2, 42f\n" - "str s13, [x21], #0x4\n" - "tbz x20, #1, 41f\n" - "st1 { v13.h }[2], [x21], #0x2\n" - "tbz x20, #0, 52f\n" - "st1 { v13.b }[6], [x21]\n" - "b 52f\n" - "41:" // 4 rounds: Partial writeback: partial_1_20 - "tbz x20, #0, 52f\n" - "st1 { v13.b }[4], [x21]\n" - "b 52f\n" - "42:" // 4 rounds: Partial writeback: partial_2_16 - "tbz x20, #1, 43f\n" - "str h13, [x21], #0x2\n" - "tbz x20, #0, 52f\n" - "st1 { v13.b }[2], [x21]\n" - "b 52f\n" - "43:" // 4 rounds: Partial writeback: partial_1_16 - "tbz x20, #0, 52f\n" - "str b13, [x21, #0x0]\n" - "b 52f\n" - "44:" // 4 rounds: Partial writeback: partial_8_0 - "tbz x20, #3, 48f\n" - "str d8, [x21], #0x8\n" - "tbz x20, #2, 46f\n" - "st1 { v8.s }[2], [x21], #0x4\n" - "tbz x20, #1, 45f\n" - "st1 { v8.h }[6], [x21], #0x2\n" - "tbz x20, #0, 52f\n" - "st1 { v8.b }[14], [x21]\n" - "b 52f\n" - "45:" // 4 rounds: Partial writeback: partial_1_12 - "tbz x20, #0, 52f\n" - "st1 { v8.b }[12], [x21]\n" - "b 52f\n" - "46:" // 4 rounds: Partial writeback: partial_2_8 - "tbz x20, #1, 47f\n" - "st1 { v8.h }[4], [x21], #0x2\n" - "tbz x20, #0, 52f\n" - "st1 { v8.b }[10], [x21]\n" - "b 52f\n" - "47:" // 4 rounds: Partial writeback: partial_1_8 - "tbz x20, #0, 52f\n" - "st1 { v8.b }[8], [x21]\n" - "b 52f\n" - "48:" // 4 rounds: Partial writeback: partial_4_0 - "tbz x20, #2, 50f\n" - "str s8, [x21], #0x4\n" - "tbz x20, #1, 49f\n" - "st1 { v8.h }[2], [x21], #0x2\n" - "tbz x20, #0, 52f\n" - "st1 { v8.b }[6], [x21]\n" - "b 52f\n" - "49:" // 4 rounds: Partial writeback: partial_1_4 - "tbz x20, #0, 52f\n" - "st1 { v8.b }[4], [x21]\n" - "b 52f\n" - "50:" // 4 rounds: Partial writeback: partial_2_0 - "tbz x20, #1, 51f\n" - "str h8, [x21], #0x2\n" - "tbz x20, #0, 52f\n" - "st1 { v8.b }[2], [x21]\n" - "b 52f\n" - "51:" // 4 rounds: Partial writeback: partial_1_0 - "str b8, [x21, #0x0]\n" - "52:" // 4 rounds: Partial writeback: Done - "b 54f\n" - "53:" // 4 rounds: Full writeback - "str q8, [x21, #0x0]\n" - "str q13, [x21, #0x10]\n" - "str q12, [x21, #0x20]\n" - "add x21, x21, #0x30\n" - "54:" // 4 rounds: Writeback done - "subs x20, x20, #0x30\n" - "bgt 2b\n" - "add x23, x23, #0x1\n" - "cmp x23, %x[num_strings]\n" - "bne 1b\n" - : - : [input] "r"(input), [num_strings] "r"(num_strings), [output] "r"(output), [string_length] "r"(string_length), [table] "r"(table) - : "cc", 
"memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23"); + __asm__ __volatile__("ldr q16, [%x[table], #0x0]\n" + "ldr q17, [%x[table], #0x10]\n" + "mov x23, #0x0\n" + "ldr q18, [%x[table], #0x20]\n" + "ldr q19, [%x[table], #0x30]\n" + "ldr q20, [%x[table], #0x40]\n" + "ldr q21, [%x[table], #0x50]\n" + "ldr q22, [%x[table], #0x60]\n" + "ldr q23, [%x[table], #0x70]\n" + "ldr q24, [%x[table], #0x80]\n" + "ldr q25, [%x[table], #0x90]\n" + "ldr q26, [%x[table], #0xa0]\n" + "ldr q27, [%x[table], #0xb0]\n" + "ldr q28, [%x[table], #0xc0]\n" + "ldr q29, [%x[table], #0xd0]\n" + "ldr q30, [%x[table], #0xe0]\n" + "ldr q31, [%x[table], #0xf0]\n" + "1:" // string loop + "ldr x22, [%x[input], x23, LSL #0x3]\n" + "ldr x21, [%x[output], x23, LSL #0x3]\n" + "movi v11.16b, #0x40\n" + "movi v10.16b, #0x80\n" + "movi v9.16b, #0xc0\n" + "mov x20, %x[string_length]\n" + "2:" // 4 rounds: width loop + "cmp x20, #0x30\n" + "bge 27f\n" + "tbz x20, #5, 10f\n" + "ld1 { v8.16b }, [x22], #0x10\n" + "ld1 { v13.16b }, [x22], #0x10\n" + "tbz x20, #3, 6f\n" + "ldr d12, [x22], #0x8\n" + "tbz x20, #2, 4f\n" + "ld1 { v12.s }[2], [x22], #0x4\n" + "tbz x20, #1, 3f\n" + "ld1 { v12.h }[6], [x22], #0x2\n" + "tbz x20, #0, 26f\n" + "ld1 { v12.b }[14], [x22]\n" + "b 26f\n" + "3:" // 4 rounds: Partial load: partial_1_44 + "tbz x20, #0, 26f\n" + "ld1 { v12.b }[12], [x22]\n" + "b 26f\n" + "4:" // 4 rounds: Partial load: partial_2_40 + "tbz x20, #1, 5f\n" + "ld1 { v12.h }[4], [x22], #0x2\n" + "tbz x20, #0, 26f\n" + "ld1 { v12.b }[10], [x22]\n" + "b 26f\n" + "5:" // 4 rounds: Partial load: partial_1_40 + "tbz x20, #0, 26f\n" + "ld1 { v12.b }[8], [x22]\n" + "b 26f\n" + "6:" // 4 rounds: Partial load: partial_4_32 + "tbz x20, #2, 8f\n" + "ldr s12, [x22], #0x4\n" + "tbz x20, #1, 7f\n" + "ld1 { v12.h }[2], [x22], #0x2\n" + "tbz x20, #0, 26f\n" + "ld1 { v12.b }[6], [x22]\n" + "b 26f\n" + "7:" // 4 rounds: Partial load: partial_1_36 + "tbz x20, #0, 26f\n" + "ld1 { v12.b }[4], [x22]\n" + "b 26f\n" + "8:" // 4 rounds: Partial load: partial_2_32 + "tbz x20, #1, 9f\n" + "ldr h12, [x22], #0x2\n" + "tbz x20, #0, 26f\n" + "ld1 { v12.b }[2], [x22]\n" + "b 26f\n" + "9:" // 4 rounds: Partial load: partial_1_32 + "tbz x20, #0, 26f\n" + "ldr b12, [x22, #0x0]\n" + "b 26f\n" + "10:" // 4 rounds: Partial load: partial_16_0 + "tbz x20, #4, 18f\n" + "ld1 { v8.16b }, [x22], #0x10\n" + "tbz x20, #3, 14f\n" + "ldr d13, [x22], #0x8\n" + "tbz x20, #2, 12f\n" + "ld1 { v13.s }[2], [x22], #0x4\n" + "tbz x20, #1, 11f\n" + "ld1 { v13.h }[6], [x22], #0x2\n" + "tbz x20, #0, 26f\n" + "ld1 { v13.b }[14], [x22]\n" + "b 26f\n" + "11:" // 4 rounds: Partial load: partial_1_28 + "tbz x20, #0, 26f\n" + "ld1 { v13.b }[12], [x22]\n" + "b 26f\n" + "12:" // 4 rounds: Partial load: partial_2_24 + "tbz x20, #1, 13f\n" + "ld1 { v13.h }[4], [x22], #0x2\n" + "tbz x20, #0, 26f\n" + "ld1 { v13.b }[10], [x22]\n" + "b 26f\n" + "13:" // 4 rounds: Partial load: partial_1_24 + "tbz x20, #0, 26f\n" + "ld1 { v13.b }[8], [x22]\n" + "b 26f\n" + "14:" // 4 rounds: Partial load: partial_4_16 + "tbz x20, #2, 16f\n" + "ldr s13, [x22], #0x4\n" + "tbz x20, #1, 15f\n" + "ld1 { v13.h }[2], [x22], #0x2\n" + "tbz x20, #0, 26f\n" + "ld1 { v13.b }[6], [x22]\n" + "b 26f\n" + "15:" // 4 rounds: Partial load: partial_1_20 + "tbz x20, #0, 26f\n" + "ld1 { v13.b }[4], [x22]\n" + "b 26f\n" + "16:" // 4 rounds: Partial load: partial_2_16 + 
"tbz x20, #1, 17f\n" + "ldr h13, [x22], #0x2\n" + "tbz x20, #0, 26f\n" + "ld1 { v13.b }[2], [x22]\n" + "b 26f\n" + "17:" // 4 rounds: Partial load: partial_1_16 + "tbz x20, #0, 26f\n" + "ldr b13, [x22, #0x0]\n" + "b 26f\n" + "18:" // 4 rounds: Partial load: partial_8_0 + "tbz x20, #3, 22f\n" + "ldr d8, [x22], #0x8\n" + "tbz x20, #2, 20f\n" + "ld1 { v8.s }[2], [x22], #0x4\n" + "tbz x20, #1, 19f\n" + "ld1 { v8.h }[6], [x22], #0x2\n" + "tbz x20, #0, 26f\n" + "ld1 { v8.b }[14], [x22]\n" + "b 26f\n" + "19:" // 4 rounds: Partial load: partial_1_12 + "tbz x20, #0, 26f\n" + "ld1 { v8.b }[12], [x22]\n" + "b 26f\n" + "20:" // 4 rounds: Partial load: partial_2_8 + "tbz x20, #1, 21f\n" + "ld1 { v8.h }[4], [x22], #0x2\n" + "tbz x20, #0, 26f\n" + "ld1 { v8.b }[10], [x22]\n" + "b 26f\n" + "21:" // 4 rounds: Partial load: partial_1_8 + "tbz x20, #0, 26f\n" + "ld1 { v8.b }[8], [x22]\n" + "b 26f\n" + "22:" // 4 rounds: Partial load: partial_4_0 + "tbz x20, #2, 24f\n" + "ldr s8, [x22], #0x4\n" + "tbz x20, #1, 23f\n" + "ld1 { v8.h }[2], [x22], #0x2\n" + "tbz x20, #0, 26f\n" + "ld1 { v8.b }[6], [x22]\n" + "b 26f\n" + "23:" // 4 rounds: Partial load: partial_1_4 + "tbz x20, #0, 26f\n" + "ld1 { v8.b }[4], [x22]\n" + "b 26f\n" + "24:" // 4 rounds: Partial load: partial_2_0 + "tbz x20, #1, 25f\n" + "ldr h8, [x22], #0x2\n" + "tbz x20, #0, 26f\n" + "ld1 { v8.b }[2], [x22]\n" + "b 26f\n" + "25:" // 4 rounds: Partial load: partial_1_0 + "ldr b8, [x22, #0x0]\n" + "26:" // 4 rounds: Partial load: Done + "b 28f\n" + "27:" // 4 rounds: Full load + "ldr q8, [x22, #0x0]\n" + "ldr q13, [x22, #0x10]\n" + "ldr q12, [x22, #0x20]\n" + "add x22, x22, #0x30\n" + "28:" // 4 rounds: Load done + "sub v0.16b, v8.16b, v11.16b\n" + "sub v7.16b, v8.16b, v10.16b\n" + "tbl v0.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v0.16b\n" + "sub v6.16b, v8.16b, v9.16b\n" + "sub v5.16b, v13.16b, v11.16b\n" + "tbl v8.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v8.16b\n" + "sub v4.16b, v13.16b, v10.16b\n" + "sub v3.16b, v13.16b, v9.16b\n" + "tbl v7.16b, { v24.16b, v25.16b, v26.16b, v27.16b }, v7.16b\n" + "sub v2.16b, v12.16b, v11.16b\n" + "sub v1.16b, v12.16b, v10.16b\n" + "tbl v6.16b, { v28.16b, v29.16b, v30.16b, v31.16b }, v6.16b\n" + "tbl v13.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v13.16b\n" + "tbl v5.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v5.16b\n" + "orr v8.16b, v8.16b, v0.16b\n" + "sub v0.16b, v12.16b, v9.16b\n" + "tbl v4.16b, { v24.16b, v25.16b, v26.16b, v27.16b }, v4.16b\n" + "tbl v3.16b, { v28.16b, v29.16b, v30.16b, v31.16b }, v3.16b\n" + "tbl v12.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v12.16b\n" + "tbl v2.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v2.16b\n" + "orr v7.16b, v7.16b, v6.16b\n" + "tbl v1.16b, { v24.16b, v25.16b, v26.16b, v27.16b }, v1.16b\n" + "tbl v0.16b, { v28.16b, v29.16b, v30.16b, v31.16b }, v0.16b\n" + "orr v13.16b, v13.16b, v5.16b\n" + "orr v4.16b, v4.16b, v3.16b\n" + "orr v12.16b, v12.16b, v2.16b\n" + "cmp x20, #0x30\n" + "orr v1.16b, v1.16b, v0.16b\n" + "orr v8.16b, v8.16b, v7.16b\n" + "orr v13.16b, v13.16b, v4.16b\n" + "orr v12.16b, v12.16b, v1.16b\n" + "bge 53f\n" + "tbz x20, #5, 36f\n" + "st1 { v8.16b }, [x21], #0x10\n" + "st1 { v13.16b }, [x21], #0x10\n" + "tbz x20, #3, 32f\n" + "str d12, [x21], #0x8\n" + "tbz x20, #2, 30f\n" + "st1 { v12.s }[2], [x21], #0x4\n" + "tbz x20, #1, 29f\n" + "st1 { v12.h }[6], [x21], #0x2\n" + "tbz x20, #0, 52f\n" + "st1 { v12.b }[14], [x21]\n" + "b 52f\n" + "29:" // 4 rounds: Partial writeback: partial_1_44 + "tbz x20, #0, 52f\n" + "st1 { v12.b }[12], [x21]\n" + "b 
52f\n" + "30:" // 4 rounds: Partial writeback: partial_2_40 + "tbz x20, #1, 31f\n" + "st1 { v12.h }[4], [x21], #0x2\n" + "tbz x20, #0, 52f\n" + "st1 { v12.b }[10], [x21]\n" + "b 52f\n" + "31:" // 4 rounds: Partial writeback: partial_1_40 + "tbz x20, #0, 52f\n" + "st1 { v12.b }[8], [x21]\n" + "b 52f\n" + "32:" // 4 rounds: Partial writeback: partial_4_32 + "tbz x20, #2, 34f\n" + "str s12, [x21], #0x4\n" + "tbz x20, #1, 33f\n" + "st1 { v12.h }[2], [x21], #0x2\n" + "tbz x20, #0, 52f\n" + "st1 { v12.b }[6], [x21]\n" + "b 52f\n" + "33:" // 4 rounds: Partial writeback: partial_1_36 + "tbz x20, #0, 52f\n" + "st1 { v12.b }[4], [x21]\n" + "b 52f\n" + "34:" // 4 rounds: Partial writeback: partial_2_32 + "tbz x20, #1, 35f\n" + "str h12, [x21], #0x2\n" + "tbz x20, #0, 52f\n" + "st1 { v12.b }[2], [x21]\n" + "b 52f\n" + "35:" // 4 rounds: Partial writeback: partial_1_32 + "tbz x20, #0, 52f\n" + "str b12, [x21, #0x0]\n" + "b 52f\n" + "36:" // 4 rounds: Partial writeback: partial_16_0 + "tbz x20, #4, 44f\n" + "st1 { v8.16b }, [x21], #0x10\n" + "tbz x20, #3, 40f\n" + "str d13, [x21], #0x8\n" + "tbz x20, #2, 38f\n" + "st1 { v13.s }[2], [x21], #0x4\n" + "tbz x20, #1, 37f\n" + "st1 { v13.h }[6], [x21], #0x2\n" + "tbz x20, #0, 52f\n" + "st1 { v13.b }[14], [x21]\n" + "b 52f\n" + "37:" // 4 rounds: Partial writeback: partial_1_28 + "tbz x20, #0, 52f\n" + "st1 { v13.b }[12], [x21]\n" + "b 52f\n" + "38:" // 4 rounds: Partial writeback: partial_2_24 + "tbz x20, #1, 39f\n" + "st1 { v13.h }[4], [x21], #0x2\n" + "tbz x20, #0, 52f\n" + "st1 { v13.b }[10], [x21]\n" + "b 52f\n" + "39:" // 4 rounds: Partial writeback: partial_1_24 + "tbz x20, #0, 52f\n" + "st1 { v13.b }[8], [x21]\n" + "b 52f\n" + "40:" // 4 rounds: Partial writeback: partial_4_16 + "tbz x20, #2, 42f\n" + "str s13, [x21], #0x4\n" + "tbz x20, #1, 41f\n" + "st1 { v13.h }[2], [x21], #0x2\n" + "tbz x20, #0, 52f\n" + "st1 { v13.b }[6], [x21]\n" + "b 52f\n" + "41:" // 4 rounds: Partial writeback: partial_1_20 + "tbz x20, #0, 52f\n" + "st1 { v13.b }[4], [x21]\n" + "b 52f\n" + "42:" // 4 rounds: Partial writeback: partial_2_16 + "tbz x20, #1, 43f\n" + "str h13, [x21], #0x2\n" + "tbz x20, #0, 52f\n" + "st1 { v13.b }[2], [x21]\n" + "b 52f\n" + "43:" // 4 rounds: Partial writeback: partial_1_16 + "tbz x20, #0, 52f\n" + "str b13, [x21, #0x0]\n" + "b 52f\n" + "44:" // 4 rounds: Partial writeback: partial_8_0 + "tbz x20, #3, 48f\n" + "str d8, [x21], #0x8\n" + "tbz x20, #2, 46f\n" + "st1 { v8.s }[2], [x21], #0x4\n" + "tbz x20, #1, 45f\n" + "st1 { v8.h }[6], [x21], #0x2\n" + "tbz x20, #0, 52f\n" + "st1 { v8.b }[14], [x21]\n" + "b 52f\n" + "45:" // 4 rounds: Partial writeback: partial_1_12 + "tbz x20, #0, 52f\n" + "st1 { v8.b }[12], [x21]\n" + "b 52f\n" + "46:" // 4 rounds: Partial writeback: partial_2_8 + "tbz x20, #1, 47f\n" + "st1 { v8.h }[4], [x21], #0x2\n" + "tbz x20, #0, 52f\n" + "st1 { v8.b }[10], [x21]\n" + "b 52f\n" + "47:" // 4 rounds: Partial writeback: partial_1_8 + "tbz x20, #0, 52f\n" + "st1 { v8.b }[8], [x21]\n" + "b 52f\n" + "48:" // 4 rounds: Partial writeback: partial_4_0 + "tbz x20, #2, 50f\n" + "str s8, [x21], #0x4\n" + "tbz x20, #1, 49f\n" + "st1 { v8.h }[2], [x21], #0x2\n" + "tbz x20, #0, 52f\n" + "st1 { v8.b }[6], [x21]\n" + "b 52f\n" + "49:" // 4 rounds: Partial writeback: partial_1_4 + "tbz x20, #0, 52f\n" + "st1 { v8.b }[4], [x21]\n" + "b 52f\n" + "50:" // 4 rounds: Partial writeback: partial_2_0 + "tbz x20, #1, 51f\n" + "str h8, [x21], #0x2\n" + "tbz x20, #0, 52f\n" + "st1 { v8.b }[2], [x21]\n" + "b 52f\n" + "51:" // 4 rounds: Partial writeback: 
partial_1_0 + "str b8, [x21, #0x0]\n" + "52:" // 4 rounds: Partial writeback: Done + "b 54f\n" + "53:" // 4 rounds: Full writeback + "str q8, [x21, #0x0]\n" + "str q13, [x21, #0x10]\n" + "str q12, [x21, #0x20]\n" + "add x21, x21, #0x30\n" + "54:" // 4 rounds: Writeback done + "subs x20, x20, #0x30\n" + "bgt 2b\n" + "add x23, x23, #0x1\n" + "cmp x23, %x[num_strings]\n" + "bne 1b\n" + : + : [input] "r"(input), [num_strings] "r"(num_strings), [output] "r"(output), + [string_length] "r"(string_length), [table] "r"(table) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", + "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23"); } #endif // __aarch64__ diff --git a/src/cpu/kernels/lut/generic/sve2/u8.cpp b/src/cpu/kernels/lut/generic/sve2/u8.cpp index b80d75326e..ee8572703e 100644 --- a/src/cpu/kernels/lut/generic/sve2/u8.cpp +++ b/src/cpu/kernels/lut/generic/sve2/u8.cpp @@ -32,11 +32,7 @@ namespace arm_compute namespace cpu { void lut_u8_sve2( - const uint8_t *table, - size_t num_strings, - size_t string_length, - const uint8_t *const *input, - uint8_t *const *output) + const uint8_t *table, size_t num_strings, size_t string_length, const uint8_t *const *input, uint8_t *const *output) { __asm__ __volatile__( "ptrue p0.b\n" @@ -636,7 +632,9 @@ void lut_u8_sve2( "bne 2b\n" : [table] "+&r"(table) : [input] "r"(input), [num_strings] "r"(num_strings), [output] "r"(output), [string_length] "r"(string_length) - : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x20", "x21", "x22", "x23", "x24", "x25", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"); + : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x20", "x21", "x22", "x23", "x24", "x25", "z0", "z1", + "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z16", "z17", "z18", "z19", "z20", "z21", + "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"); } } // namespace cpu diff --git a/src/cpu/kernels/lut/list.h b/src/cpu/kernels/lut/list.h index 7a2afc6927..da90346267 100644 --- a/src/cpu/kernels/lut/list.h +++ b/src/cpu/kernels/lut/list.h @@ -34,13 +34,9 @@ namespace cpu { #ifdef __aarch64__ -#define DECLARE_LUT_KERNEL(func_name) \ - void func_name( \ - const uint8_t *table, \ - size_t num_strings, \ - size_t string_length, \ - const uint8_t *const *input, \ - uint8_t *const *output) +#define DECLARE_LUT_KERNEL(func_name) \ + void func_name(const uint8_t *table, size_t num_strings, size_t string_length, const uint8_t *const *input, \ + uint8_t *const *output) DECLARE_LUT_KERNEL(lut_u8_neon); DECLARE_LUT_KERNEL(lut_u8_sve2); diff --git a/src/cpu/kernels/maxunpool/generic/neon/impl.h b/src/cpu/kernels/maxunpool/generic/neon/impl.h index 5fe19c4707..73a5b86a2f 100644 --- a/src/cpu/kernels/maxunpool/generic/neon/impl.h +++ b/src/cpu/kernels/maxunpool/generic/neon/impl.h @@ -25,6 +25,7 @@ #define ACL_SRC_CPU_KERNELS_MAXUNPOOL_GENERIC_NEON_IMPL_H #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Window.h" + #include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute { @@ -37,13 +38,15 @@ void max_unpooling(const ITensor *input, const ITensor *indices, ITensor *output Iterator indices_itr(indices, window); auto out_ptr = reinterpret_cast<T *>(output->buffer()); const int out_stride_w = 
static_cast<int>(output->info()->strides_in_bytes()[3]); - execute_window_loop(window, [&](const Coordinates & id) - { - auto vindices = reinterpret_cast<uint32_t *>(indices_itr.ptr()); - auto vinput = reinterpret_cast<T *>(input_itr.ptr()); - out_ptr[id[3] * out_stride_w / sizeof(T) + *vindices] = *vinput; - }, - input_itr, indices_itr); + execute_window_loop( + window, + [&](const Coordinates &id) + { + auto vindices = reinterpret_cast<uint32_t *>(indices_itr.ptr()); + auto vinput = reinterpret_cast<T *>(input_itr.ptr()); + out_ptr[id[3] * out_stride_w / sizeof(T) + *vindices] = *vinput; + }, + input_itr, indices_itr); } } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/meanstddevnorm/generic/neon/fp16.cpp b/src/cpu/kernels/meanstddevnorm/generic/neon/fp16.cpp index 96e4030268..6470f391e2 100644 --- a/src/cpu/kernels/meanstddevnorm/generic/neon/fp16.cpp +++ b/src/cpu/kernels/meanstddevnorm/generic/neon/fp16.cpp @@ -23,9 +23,9 @@ */ #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) -#include "src/cpu/kernels/meanstddevnorm/generic/neon/impl.h" #include "src/core/NEON/wrapper/wrapper.h" #include "src/cpu/CpuTypes.h" +#include "src/cpu/kernels/meanstddevnorm/generic/neon/impl.h" namespace arm_compute { @@ -45,64 +45,66 @@ void mean_stddev_normalization<float16_t, 8>(ITensor *input, ITensor *output, fl Iterator input_itr(input, win); Iterator output_itr(output, win); - execute_window_loop(win, [&](const Coordinates &) - { - int x = window_start_x; - auto in_ptr = reinterpret_cast<const float16_t *>(input_itr.ptr()); - auto out_ptr = reinterpret_cast<float16_t *>(output_itr.ptr()); + execute_window_loop( + win, + [&](const Coordinates &) + { + int x = window_start_x; + auto in_ptr = reinterpret_cast<const float16_t *>(input_itr.ptr()); + auto out_ptr = reinterpret_cast<float16_t *>(output_itr.ptr()); - float16x8_t sum_vec = vdupq_n_f16(static_cast<float16_t>(0.0f)); - float32x4_t sum_sq_vec = vdupq_n_f32(0.0f); + float16x8_t sum_vec = vdupq_n_f16(static_cast<float16_t>(0.0f)); + float32x4_t sum_sq_vec = vdupq_n_f32(0.0f); - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - float16x8_t data = vld1q_f16(in_ptr + x); - sum_vec = vaddq_f16(sum_vec, data); - float32x4_t dl = vcvt_f32_f16(vget_low_f16(data)); - float32x4_t dh = vcvt_f32_f16(vget_high_f16(data)); - sum_sq_vec = vaddq_f32(sum_sq_vec, vmulq_f32(dl, dl)); - sum_sq_vec = vaddq_f32(sum_sq_vec, vmulq_f32(dh, dh)); - } + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + float16x8_t data = vld1q_f16(in_ptr + x); + sum_vec = vaddq_f16(sum_vec, data); + float32x4_t dl = vcvt_f32_f16(vget_low_f16(data)); + float32x4_t dh = vcvt_f32_f16(vget_high_f16(data)); + sum_sq_vec = vaddq_f32(sum_sq_vec, vmulq_f32(dl, dl)); + sum_sq_vec = vaddq_f32(sum_sq_vec, vmulq_f32(dh, dh)); + } - float16x4_t sum_carry_res = vpadd_f16(vget_high_f16(sum_vec), vget_low_f16(sum_vec)); - sum_carry_res = vpadd_f16(sum_carry_res, sum_carry_res); - sum_carry_res = vpadd_f16(sum_carry_res, sum_carry_res); + float16x4_t sum_carry_res = vpadd_f16(vget_high_f16(sum_vec), vget_low_f16(sum_vec)); + sum_carry_res = vpadd_f16(sum_carry_res, sum_carry_res); + sum_carry_res = vpadd_f16(sum_carry_res, sum_carry_res); - float32x4_t sum_sq_carry_res = vpaddq_f32(sum_sq_vec, sum_sq_vec); - sum_sq_carry_res = vpaddq_f32(sum_sq_carry_res, sum_sq_carry_res); + float32x4_t sum_sq_carry_res = vpaddq_f32(sum_sq_vec, sum_sq_vec); + sum_sq_carry_res = vpaddq_f32(sum_sq_carry_res, sum_sq_carry_res); - 
float16_t sum = vget_lane_f16(sum_carry_res, 0); - float sum_sq = vgetq_lane_f32(sum_sq_carry_res, 0); + float16_t sum = vget_lane_f16(sum_carry_res, 0); + float sum_sq = vgetq_lane_f32(sum_sq_carry_res, 0); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - float16_t data = *(in_ptr + x); - sum += data; - float fdata = static_cast<float>(data); - sum_sq += fdata * fdata; - } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + float16_t data = *(in_ptr + x); + sum += data; + float fdata = static_cast<float>(data); + sum_sq += fdata * fdata; + } - float16_t mean = sum / input->info()->dimension(0); - float var = (sum_sq / input->info()->dimension(0)) - (mean * mean); - float16_t stddev_inv = static_cast<float16_t>(1.f / sqrt(var + epsilon)); + float16_t mean = sum / input->info()->dimension(0); + float var = (sum_sq / input->info()->dimension(0)) - (mean * mean); + float16_t stddev_inv = static_cast<float16_t>(1.f / sqrt(var + epsilon)); - float16x8_t mean_vec = vdupq_n_f16(mean); - float16x8_t stddev_inv_vec = vdupq_n_f16(stddev_inv); + float16x8_t mean_vec = vdupq_n_f16(mean); + float16x8_t stddev_inv_vec = vdupq_n_f16(stddev_inv); - for(x = window_start_x; x <= (window_end_x - window_step_x); x += window_step_x) - { - float16x8_t data = vld1q_f16(in_ptr + x); - float16x8_t res = vmulq_f16(vsubq_f16(data, mean_vec), stddev_inv_vec); - // Store results - vst1q_f16(out_ptr + x, res); - } - for(; x < window_end_x; ++x) - { - *(out_ptr + x) = (*(in_ptr + x) - mean) * stddev_inv; - } - }, - input_itr, output_itr); + for (x = window_start_x; x <= (window_end_x - window_step_x); x += window_step_x) + { + float16x8_t data = vld1q_f16(in_ptr + x); + float16x8_t res = vmulq_f16(vsubq_f16(data, mean_vec), stddev_inv_vec); + // Store results + vst1q_f16(out_ptr + x, res); + } + for (; x < window_end_x; ++x) + { + *(out_ptr + x) = (*(in_ptr + x) - mean) * stddev_inv; + } + }, + input_itr, output_itr); } void neon_fp16_meanstddevnorm(ITensor *input, ITensor *output, float epsilon, const Window &window) diff --git a/src/cpu/kernels/meanstddevnorm/generic/neon/impl.cpp b/src/cpu/kernels/meanstddevnorm/generic/neon/impl.cpp index 0522d6e277..11f6294a35 100644 --- a/src/cpu/kernels/meanstddevnorm/generic/neon/impl.cpp +++ b/src/cpu/kernels/meanstddevnorm/generic/neon/impl.cpp @@ -23,6 +23,7 @@ */ #include "src/cpu/kernels/meanstddevnorm/generic/neon/impl.h" + #include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute @@ -45,60 +46,62 @@ void mean_stddev_normalization(ITensor *input, ITensor *output, float epsilon, c Iterator input_itr(input, win); Iterator output_itr(output, win); - execute_window_loop(win, [&](const Coordinates &) - { - int x = window_start_x; - auto in_ptr = reinterpret_cast<const ScalarType *>(input_itr.ptr()); - auto out_ptr = reinterpret_cast<ScalarType *>(output_itr.ptr()); + execute_window_loop( + win, + [&](const Coordinates &) + { + int x = window_start_x; + auto in_ptr = reinterpret_cast<const ScalarType *>(input_itr.ptr()); + auto out_ptr = reinterpret_cast<ScalarType *>(output_itr.ptr()); - auto sum_vec = wrapper::vdup_n(static_cast<ScalarType>(0.f), ExactTagType{}); - auto sum_sq_vec = wrapper::vdup_n(static_cast<ScalarType>(0.f), ExactTagType{}); + auto sum_vec = wrapper::vdup_n(static_cast<ScalarType>(0.f), ExactTagType{}); + auto sum_sq_vec = wrapper::vdup_n(static_cast<ScalarType>(0.f), ExactTagType{}); - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - auto data = wrapper::vloadq(in_ptr + x); - sum_vec = 
wrapper::vadd(sum_vec, data); - sum_sq_vec = wrapper::vadd(sum_sq_vec, wrapper::vmul(data, data)); - } + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + auto data = wrapper::vloadq(in_ptr + x); + sum_vec = wrapper::vadd(sum_vec, data); + sum_sq_vec = wrapper::vadd(sum_sq_vec, wrapper::vmul(data, data)); + } - auto sum_carry_res = wrapper::vpadd(wrapper::vgethigh(sum_vec), wrapper::vgetlow(sum_vec)); - auto sum_sq_carry_res = wrapper::vpadd(wrapper::vgethigh(sum_sq_vec), wrapper::vgetlow(sum_sq_vec)); - for(int i = 0; i < size / 4; ++i) - { - sum_carry_res = wrapper::vpadd(sum_carry_res, sum_carry_res); - sum_sq_carry_res = wrapper::vpadd(sum_sq_carry_res, sum_sq_carry_res); - } + auto sum_carry_res = wrapper::vpadd(wrapper::vgethigh(sum_vec), wrapper::vgetlow(sum_vec)); + auto sum_sq_carry_res = wrapper::vpadd(wrapper::vgethigh(sum_sq_vec), wrapper::vgetlow(sum_sq_vec)); + for (int i = 0; i < size / 4; ++i) + { + sum_carry_res = wrapper::vpadd(sum_carry_res, sum_carry_res); + sum_sq_carry_res = wrapper::vpadd(sum_sq_carry_res, sum_sq_carry_res); + } - auto sum = wrapper::vgetlane(sum_carry_res, 0); - auto sum_sq = wrapper::vgetlane(sum_sq_carry_res, 0); + auto sum = wrapper::vgetlane(sum_carry_res, 0); + auto sum_sq = wrapper::vgetlane(sum_sq_carry_res, 0); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - ScalarType data = *(in_ptr + x); - sum += data; - sum_sq += data * data; - } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + ScalarType data = *(in_ptr + x); + sum += data; + sum_sq += data * data; + } - ScalarType mean = sum / input->info()->dimension(0); - ScalarType var = (sum_sq / input->info()->dimension(0)) - (mean * mean); - ScalarType stddev_inv = 1.f / sqrt(var + epsilon); + ScalarType mean = sum / input->info()->dimension(0); + ScalarType var = (sum_sq / input->info()->dimension(0)) - (mean * mean); + ScalarType stddev_inv = 1.f / sqrt(var + epsilon); - auto mean_vec = wrapper::vdup_n(mean, ExactTagType{}); - auto stddev_inv_vec = wrapper::vdup_n(stddev_inv, ExactTagType{}); - for(x = window_start_x; x <= (window_end_x - window_step_x); x += window_step_x) - { - auto data = wrapper::vloadq(in_ptr + x); - auto res = wrapper::vmul(wrapper::vsub(data, mean_vec), stddev_inv_vec); - // Store results - wrapper::vstore(out_ptr + x, res); - } - for(; x < window_end_x; ++x) - { - *(out_ptr + x) = (*(in_ptr + x) - mean) * stddev_inv; - } - }, - input_itr, output_itr); + auto mean_vec = wrapper::vdup_n(mean, ExactTagType{}); + auto stddev_inv_vec = wrapper::vdup_n(stddev_inv, ExactTagType{}); + for (x = window_start_x; x <= (window_end_x - window_step_x); x += window_step_x) + { + auto data = wrapper::vloadq(in_ptr + x); + auto res = wrapper::vmul(wrapper::vsub(data, mean_vec), stddev_inv_vec); + // Store results + wrapper::vstore(out_ptr + x, res); + } + for (; x < window_end_x; ++x) + { + *(out_ptr + x) = (*(in_ptr + x) - mean) * stddev_inv; + } + }, + input_itr, output_itr); } template void mean_stddev_normalization<float, 4>(ITensor *input, ITensor *output, float epsilon, const Window &window); } // namespace cpu diff --git a/src/cpu/kernels/meanstddevnorm/generic/neon/qasymm8.cpp b/src/cpu/kernels/meanstddevnorm/generic/neon/qasymm8.cpp index 53af1e4b16..32654df5dc 100644 --- a/src/cpu/kernels/meanstddevnorm/generic/neon/qasymm8.cpp +++ b/src/cpu/kernels/meanstddevnorm/generic/neon/qasymm8.cpp @@ -23,6 +23,7 @@ */ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Window.h" + #include 
"src/core/NEON/NEAsymm.h" #include "src/core/NEON/NEMath.h" #include "src/core/NEON/wrapper/wrapper.h" @@ -69,77 +70,76 @@ void neon_qasymm8_meanstddevnorm(ITensor *input, ITensor *output, float epsilon, const float32x4_t quant_min_vec = vdupq_n_f32(0.0f); execute_window_loop( - win, [&](const Coordinates &) - { - int x = window_start_x; - auto in_ptr = reinterpret_cast<const uint8_t *>(input_itr.ptr()); - auto out_ptr = reinterpret_cast<uint8_t *>(output_itr.ptr()); + win, + [&](const Coordinates &) + { + int x = window_start_x; + auto in_ptr = reinterpret_cast<const uint8_t *>(input_itr.ptr()); + auto out_ptr = reinterpret_cast<uint8_t *>(output_itr.ptr()); - uint32x4_t sum_vec = vdupq_n_u32(0); - uint32x4_t sum_sq_vec = vdupq_n_u32(0); + uint32x4_t sum_vec = vdupq_n_u32(0); + uint32x4_t sum_sq_vec = vdupq_n_u32(0); - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint8x16_t data = vld1q_u8(in_ptr + x); - sum_vec = vaddq_u32(sum_vec, vpaddlq_u16(vpaddlq_u8(data))); - const uint16x8_t squares_low = vmull_u8(vget_low_u8(data), vget_low_u8(data)); - const uint16x8_t squares_high = vmull_u8(vget_high_u8(data), vget_high_u8(data)); - sum_sq_vec = vaddq_u32(sum_sq_vec, vaddq_u32(vpaddlq_u16(squares_low), vpaddlq_u16(squares_high))); - } + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t data = vld1q_u8(in_ptr + x); + sum_vec = vaddq_u32(sum_vec, vpaddlq_u16(vpaddlq_u8(data))); + const uint16x8_t squares_low = vmull_u8(vget_low_u8(data), vget_low_u8(data)); + const uint16x8_t squares_high = vmull_u8(vget_high_u8(data), vget_high_u8(data)); + sum_sq_vec = vaddq_u32(sum_sq_vec, vaddq_u32(vpaddlq_u16(squares_low), vpaddlq_u16(squares_high))); + } #ifdef __aarch64__ - sum_vec = vpaddq_u32(sum_vec, sum_vec); - sum_vec = vpaddq_u32(sum_vec, sum_vec); - uint32_t sum = vgetq_lane_u32(sum_vec, 0); - sum_sq_vec = vpaddq_u32(sum_sq_vec, sum_sq_vec); - sum_sq_vec = vpaddq_u32(sum_sq_vec, sum_sq_vec); - uint32_t sum_sq = vgetq_lane_u32(sum_sq_vec, 0); + sum_vec = vpaddq_u32(sum_vec, sum_vec); + sum_vec = vpaddq_u32(sum_vec, sum_vec); + uint32_t sum = vgetq_lane_u32(sum_vec, 0); + sum_sq_vec = vpaddq_u32(sum_sq_vec, sum_sq_vec); + sum_sq_vec = vpaddq_u32(sum_sq_vec, sum_sq_vec); + uint32_t sum_sq = vgetq_lane_u32(sum_sq_vec, 0); #elif __arm__ // #ifdef __aarch64__ - uint32_t sum = vgetq_lane_u32(sum_vec, 0) + - vgetq_lane_u32(sum_vec, 1) + - vgetq_lane_u32(sum_vec, 2) + - vgetq_lane_u32(sum_vec, 3); + uint32_t sum = vgetq_lane_u32(sum_vec, 0) + vgetq_lane_u32(sum_vec, 1) + vgetq_lane_u32(sum_vec, 2) + + vgetq_lane_u32(sum_vec, 3); - uint32_t sum_sq = vgetq_lane_u32(sum_sq_vec, 0) + - vgetq_lane_u32(sum_sq_vec, 1) + - vgetq_lane_u32(sum_sq_vec, 2) + - vgetq_lane_u32(sum_sq_vec, 3); + uint32_t sum_sq = vgetq_lane_u32(sum_sq_vec, 0) + vgetq_lane_u32(sum_sq_vec, 1) + + vgetq_lane_u32(sum_sq_vec, 2) + vgetq_lane_u32(sum_sq_vec, 3); #endif // #ifdef __aarch64__ - for(; x < window_end_x; ++x) - { - auto data = static_cast<uint32_t>(*(in_ptr + x)); - sum += data; - sum_sq += (data * data); - } + for (; x < window_end_x; ++x) + { + auto data = static_cast<uint32_t>(*(in_ptr + x)); + sum += data; + sum_sq += (data * data); + } - const float mean = (static_cast<float>(sum) / static_cast<float>(input->info()->dimension(0))); - const float var = (static_cast<float>(sum_sq) / static_cast<float>(input->info()->dimension(0))) - (mean * mean); - const float stdev_inv = 1.0f / sqrtf(var + epsilon); - const float32x4_t v_scale = vdupq_n_f32(stdev_inv * 
output_inv_scale); - const float32x4_t v_offset = vdupq_n_f32(-mean * stdev_inv * output_inv_scale + output_offset); - for(x = window_start_x; x <= (window_end_x - window_step_x); x += window_step_x) - { - const uint8x16_t data = vld1q_u8(in_ptr + x); - float32x4_t db1 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(data))))); - float32x4_t db2 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(data))))); - float32x4_t db3 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(data))))); - float32x4_t db4 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(data))))); - db1 = clamp_v4f32(vaddq_f32(vmulq_f32(db1, v_scale), v_offset), quant_min_vec, quant_max_vec); - db2 = clamp_v4f32(vaddq_f32(vmulq_f32(db2, v_scale), v_offset), quant_min_vec, quant_max_vec); - db3 = clamp_v4f32(vaddq_f32(vmulq_f32(db3, v_scale), v_offset), quant_min_vec, quant_max_vec); - db4 = clamp_v4f32(vaddq_f32(vmulq_f32(db4, v_scale), v_offset), quant_min_vec, quant_max_vec); - const uint8x16_t out = fuse_shorts_u16(fuse_words_f32(db1, db2), fuse_words_f32(db3, db4)); - vst1q_u8(out_ptr + x, out); - } + const float mean = (static_cast<float>(sum) / static_cast<float>(input->info()->dimension(0))); + const float var = + (static_cast<float>(sum_sq) / static_cast<float>(input->info()->dimension(0))) - (mean * mean); + const float stdev_inv = 1.0f / sqrtf(var + epsilon); + const float32x4_t v_scale = vdupq_n_f32(stdev_inv * output_inv_scale); + const float32x4_t v_offset = vdupq_n_f32(-mean * stdev_inv * output_inv_scale + output_offset); + for (x = window_start_x; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t data = vld1q_u8(in_ptr + x); + float32x4_t db1 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(data))))); + float32x4_t db2 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(data))))); + float32x4_t db3 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(data))))); + float32x4_t db4 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(data))))); + db1 = clamp_v4f32(vaddq_f32(vmulq_f32(db1, v_scale), v_offset), quant_min_vec, quant_max_vec); + db2 = clamp_v4f32(vaddq_f32(vmulq_f32(db2, v_scale), v_offset), quant_min_vec, quant_max_vec); + db3 = clamp_v4f32(vaddq_f32(vmulq_f32(db3, v_scale), v_offset), quant_min_vec, quant_max_vec); + db4 = clamp_v4f32(vaddq_f32(vmulq_f32(db4, v_scale), v_offset), quant_min_vec, quant_max_vec); + const uint8x16_t out = fuse_shorts_u16(fuse_words_f32(db1, db2), fuse_words_f32(db3, db4)); + vst1q_u8(out_ptr + x, out); + } - for(; x < window_end_x; ++x) - { - auto data = static_cast<float32_t>(*(in_ptr + x)); - const uint8_t res = data * (stdev_inv * output_inv_scale) + (-mean * stdev_inv * output_inv_scale + output_offset); - *(out_ptr + x) = res; - } - }, - input_itr, output_itr); + for (; x < window_end_x; ++x) + { + auto data = static_cast<float32_t>(*(in_ptr + x)); + const uint8_t res = + data * (stdev_inv * output_inv_scale) + (-mean * stdev_inv * output_inv_scale + output_offset); + *(out_ptr + x) = res; + } + }, + input_itr, output_itr); } } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/pool2d/neon/fp16.cpp b/src/cpu/kernels/pool2d/neon/fp16.cpp index 4e15d3ad3f..4af59c2ad4 100644 --- a/src/cpu/kernels/pool2d/neon/fp16.cpp +++ b/src/cpu/kernels/pool2d/neon/fp16.cpp @@ -25,8 +25,9 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/Traits.h" -#include 
"src/core/NEON/wrapper/intrinsics/intrinsics.h" + #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" #include "src/cpu/kernels/pool2d/neon/list.h" #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) @@ -37,7 +38,12 @@ namespace cpu { namespace { -void pooling2_f16_maxpool_indices(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) +void pooling2_f16_maxpool_indices(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) { const int window_start_x = window.x().start(); const int window_end_x = window.x().end(); @@ -53,8 +59,8 @@ void pooling2_f16_maxpool_indices(const ITensor *src, ITensor *dst0, ITensor *ds const int pool_pad_top = pool_info.pad_stride_info.pad_top(); const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - int pool_stride_x = 0; - int pool_stride_y = 0; + int pool_stride_x = 0; + int pool_stride_y = 0; std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); const int pad_right = src->info()->padding().right; @@ -63,97 +69,114 @@ void pooling2_f16_maxpool_indices(const ITensor *src, ITensor *dst0, ITensor *ds const int in_stride_y = static_cast<int>(src->info()->strides_in_bytes().y()); const int in_stride_z = static_cast<int>(src->info()->strides_in_bytes().z()); - execute_window_loop(window_out, [&](const Coordinates & id) - { - const int idx_width = id.y() * pool_stride_x; - const int idx_height = id.z() * pool_stride_y; - const int pool_limit_y = pool_pad_top - idx_height; - const int pool_limit_x = pool_pad_left - idx_width; - - const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y); - const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x); - const int in_x0_offset = (pool_start_x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z()); - const int in_x1_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int> - (src->info()->strides_in_bytes().z()); - const int in_x2_offset = (pool_start_x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int> - (src->info()->strides_in_bytes().z()); - const int in_x3_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int> - (src->info()->strides_in_bytes().z()); - - int x_off = window_start_x; - for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) - { - const auto in_x0_ptr = reinterpret_cast<const float16_t *>(in.ptr() + in_x0_offset) + x_off; - const auto in_x1_ptr = reinterpret_cast<const float16_t *>(in.ptr() + in_x1_offset) + x_off; - const auto in_x2_ptr = reinterpret_cast<const float16_t *>(in.ptr() + in_x2_offset) + x_off; - const auto in_x3_ptr = reinterpret_cast<const float16_t *>(in.ptr() + in_x3_offset) + x_off; - const auto v_x0 = vld1q_f16(in_x0_ptr); - const auto v_x1 = vld1q_f16(in_x1_ptr); - const auto v_x2 = vld1q_f16(in_x2_ptr); - const auto v_x3 = vld1q_f16(in_x3_ptr); - float16x8_t vres = vmaxq_f16(vmaxq_f16(v_x2, v_x3), vmaxq_f16(v_x0, v_x1)); - // Store result - vst1q_f16(reinterpret_cast<float16_t *>(out.ptr()) + x_off, 
vres); - - const uint32_t offset_base = offset_no_padding<float16_t>(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NHWC); - const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float16_t) + x_off; - const uint32_t offset_x1 = (uint32_t)offset_x0 + in_stride_y / sizeof(float16_t) - pad_horizontal; - const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float16_t) - pad_horizontal * src->info()->tensor_shape()[1]; - const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float16_t) - pad_horizontal; - const uint32x4_t voffset_x0_0 = { offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3 }; - const uint32x4_t voffset_x0_1 = { offset_x0 + 4, offset_x0 + 5, offset_x0 + 6, offset_x0 + 7 }; - const uint16x8_t voffset_x0 = vcombine_u16(vmovn_u32(voffset_x0_0), vmovn_u32(voffset_x0_1)); - const uint32x4_t voffset_x1_0 = { offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3 }; - const uint32x4_t voffset_x1_1 = { offset_x1 + 4, offset_x1 + 5, offset_x1 + 6, offset_x1 + 7 }; - const uint16x8_t voffset_x1 = vcombine_u16(vmovn_u32(voffset_x1_0), vmovn_u32(voffset_x1_1)); - const uint32x4_t voffset_x2_0 = { offset_x2, offset_x2 + 1, offset_x2 + 2, offset_x2 + 3 }; - const uint32x4_t voffset_x2_1 = { offset_x2 + 4, offset_x2 + 5, offset_x2 + 6, offset_x2 + 7 }; - const uint16x8_t voffset_x2 = vcombine_u16(vmovn_u32(voffset_x2_0), vmovn_u32(voffset_x2_1)); - const uint32x4_t voffset_x3_0 = { offset_x3, offset_x3 + 1, offset_x3 + 2, offset_x3 + 3 }; - const uint32x4_t voffset_x3_1 = { offset_x3 + 4, offset_x3 + 5, offset_x3 + 6, offset_x3 + 7 }; - const uint16x8_t voffset_x3 = vcombine_u16(vmovn_u32(voffset_x3_0), vmovn_u32(voffset_x3_1)); - const uint16x8_t tmp_indices0 = vbslq_u16(vcgeq_f16(v_x0, v_x1), voffset_x0, voffset_x1); - const uint16x8_t tmp_indices1 = vbslq_u16(vcgeq_f16(v_x2, v_x3), voffset_x2, voffset_x3); - const uint16x8_t tmp_indices2 = vbslq_u16(vcgeq_f16(vmaxq_f16(v_x0, v_x1), vmaxq_f16(v_x2, v_x3)), tmp_indices0, tmp_indices1); - const uint32x4_t tmp_indeces3_0 = vmovl_u16(vget_low_u16(tmp_indices2)); - const uint32x4_t tmp_indeces3_1 = vmovl_u16(vget_high_u16(tmp_indices2)); - // Store indicies - vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off, tmp_indeces3_0); - vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr() + 16) + x_off, tmp_indeces3_1); - } - - // Left-overs loop - for(; x_off < window_end_x; ++x_off) + execute_window_loop( + window_out, + [&](const Coordinates &id) { - const auto x0 = *(reinterpret_cast<const float16_t *>(in.ptr() + in_x0_offset) + x_off); - const auto x1 = *(reinterpret_cast<const float16_t *>(in.ptr() + in_x1_offset) + x_off); - const auto x2 = *(reinterpret_cast<const float16_t *>(in.ptr() + in_x2_offset) + x_off); - const auto x3 = *(reinterpret_cast<const float16_t *>(in.ptr() + in_x3_offset) + x_off); - float16_t res = std::max(std::max(x2, x3), std::max(x0, x1)); - - // Store result - *(reinterpret_cast<float16_t *>(out.ptr()) + x_off) = res; - - const uint32_t offset_base = offset_no_padding<float16_t>(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NHWC); - const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float16_t) + x_off; - const uint32_t offset_x1 = (uint32_t)offset_x0 + in_stride_y / sizeof(float16_t) - pad_horizontal; - const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float16_t) - pad_horizontal * src->info()->tensor_shape()[1]; - const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float16_t) 
- pad_horizontal; - const uint32_t tmp_idx0 = (x0 >= x1) ? offset_x0 : offset_x1; - const uint32_t tmp_idx1 = (x2 >= x3) ? offset_x2 : offset_x3; - const uint32_t tmp_idx2 = (std::max(x0, x1) >= std::max(x2, x3)) ? tmp_idx0 : tmp_idx1; - - // Store indices - *(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off) = tmp_idx2; - } - }, - in, out, indices); -} -} + const int idx_width = id.y() * pool_stride_x; + const int idx_height = id.z() * pool_stride_y; + const int pool_limit_y = pool_pad_top - idx_height; + const int pool_limit_x = pool_pad_left - idx_width; + + const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y); + const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x); + const int in_x0_offset = + (pool_start_x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + + (pool_start_y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z()); + const int in_x1_offset = + (pool_start_x + 1 - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + + (pool_start_y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z()); + const int in_x2_offset = + (pool_start_x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + + (pool_start_y + 1 - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z()); + const int in_x3_offset = + (pool_start_x + 1 - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + + (pool_start_y + 1 - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z()); + + int x_off = window_start_x; + for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) + { + const auto in_x0_ptr = reinterpret_cast<const float16_t *>(in.ptr() + in_x0_offset) + x_off; + const auto in_x1_ptr = reinterpret_cast<const float16_t *>(in.ptr() + in_x1_offset) + x_off; + const auto in_x2_ptr = reinterpret_cast<const float16_t *>(in.ptr() + in_x2_offset) + x_off; + const auto in_x3_ptr = reinterpret_cast<const float16_t *>(in.ptr() + in_x3_offset) + x_off; + const auto v_x0 = vld1q_f16(in_x0_ptr); + const auto v_x1 = vld1q_f16(in_x1_ptr); + const auto v_x2 = vld1q_f16(in_x2_ptr); + const auto v_x3 = vld1q_f16(in_x3_ptr); + float16x8_t vres = vmaxq_f16(vmaxq_f16(v_x2, v_x3), vmaxq_f16(v_x0, v_x1)); + // Store result + vst1q_f16(reinterpret_cast<float16_t *>(out.ptr()) + x_off, vres); + + const uint32_t offset_base = offset_no_padding<float16_t>(in.offset(), id, *src->info(), pool_stride_x, + pool_stride_y, DataLayout::NHWC); + const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float16_t) + x_off; + const uint32_t offset_x1 = (uint32_t)offset_x0 + in_stride_y / sizeof(float16_t) - pad_horizontal; + const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float16_t) - + pad_horizontal * src->info()->tensor_shape()[1]; + const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float16_t) - pad_horizontal; + const uint32x4_t voffset_x0_0 = {offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3}; + const uint32x4_t voffset_x0_1 = {offset_x0 + 4, offset_x0 + 5, offset_x0 + 6, offset_x0 + 7}; + const uint16x8_t voffset_x0 = vcombine_u16(vmovn_u32(voffset_x0_0), vmovn_u32(voffset_x0_1)); + const uint32x4_t voffset_x1_0 = {offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3}; + const uint32x4_t voffset_x1_1 = {offset_x1 + 4, offset_x1 + 5, offset_x1 + 6, offset_x1 + 7}; + const uint16x8_t voffset_x1 = vcombine_u16(vmovn_u32(voffset_x1_0), vmovn_u32(voffset_x1_1)); + const uint32x4_t 
voffset_x2_0 = {offset_x2, offset_x2 + 1, offset_x2 + 2, offset_x2 + 3}; + const uint32x4_t voffset_x2_1 = {offset_x2 + 4, offset_x2 + 5, offset_x2 + 6, offset_x2 + 7}; + const uint16x8_t voffset_x2 = vcombine_u16(vmovn_u32(voffset_x2_0), vmovn_u32(voffset_x2_1)); + const uint32x4_t voffset_x3_0 = {offset_x3, offset_x3 + 1, offset_x3 + 2, offset_x3 + 3}; + const uint32x4_t voffset_x3_1 = {offset_x3 + 4, offset_x3 + 5, offset_x3 + 6, offset_x3 + 7}; + const uint16x8_t voffset_x3 = vcombine_u16(vmovn_u32(voffset_x3_0), vmovn_u32(voffset_x3_1)); + const uint16x8_t tmp_indices0 = vbslq_u16(vcgeq_f16(v_x0, v_x1), voffset_x0, voffset_x1); + const uint16x8_t tmp_indices1 = vbslq_u16(vcgeq_f16(v_x2, v_x3), voffset_x2, voffset_x3); + const uint16x8_t tmp_indices2 = + vbslq_u16(vcgeq_f16(vmaxq_f16(v_x0, v_x1), vmaxq_f16(v_x2, v_x3)), tmp_indices0, tmp_indices1); + const uint32x4_t tmp_indeces3_0 = vmovl_u16(vget_low_u16(tmp_indices2)); + const uint32x4_t tmp_indeces3_1 = vmovl_u16(vget_high_u16(tmp_indices2)); + // Store indicies + vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off, tmp_indeces3_0); + vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr() + 16) + x_off, tmp_indeces3_1); + } -void poolingMxN_fp16_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) + // Left-overs loop + for (; x_off < window_end_x; ++x_off) + { + const auto x0 = *(reinterpret_cast<const float16_t *>(in.ptr() + in_x0_offset) + x_off); + const auto x1 = *(reinterpret_cast<const float16_t *>(in.ptr() + in_x1_offset) + x_off); + const auto x2 = *(reinterpret_cast<const float16_t *>(in.ptr() + in_x2_offset) + x_off); + const auto x3 = *(reinterpret_cast<const float16_t *>(in.ptr() + in_x3_offset) + x_off); + float16_t res = std::max(std::max(x2, x3), std::max(x0, x1)); + + // Store result + *(reinterpret_cast<float16_t *>(out.ptr()) + x_off) = res; + + const uint32_t offset_base = offset_no_padding<float16_t>(in.offset(), id, *src->info(), pool_stride_x, + pool_stride_y, DataLayout::NHWC); + const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float16_t) + x_off; + const uint32_t offset_x1 = (uint32_t)offset_x0 + in_stride_y / sizeof(float16_t) - pad_horizontal; + const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float16_t) - + pad_horizontal * src->info()->tensor_shape()[1]; + const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float16_t) - pad_horizontal; + const uint32_t tmp_idx0 = (x0 >= x1) ? offset_x0 : offset_x1; + const uint32_t tmp_idx1 = (x2 >= x3) ? offset_x2 : offset_x3; + const uint32_t tmp_idx2 = (std::max(x0, x1) >= std::max(x2, x3)) ? tmp_idx0 : tmp_idx1; + + // Store indices + *(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off) = tmp_idx2; + } + }, + in, out, indices); +} +} // namespace + +void poolingMxN_fp16_neon_nhwc(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) { - if(pool_info.pool_size == Size2D(2, 2) && pool_info.pool_type == PoolingType::MAX && dst1) + if (pool_info.pool_size == Size2D(2, 2) && pool_info.pool_type == PoolingType::MAX && dst1) { pooling2_f16_maxpool_indices(src, dst0, dst1, pool_info, window_src, window); } @@ -167,151 +190,172 @@ void poolingMxN_fp16_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, Iterator in(src, window_src); Iterator out(dst0, window_out); - const int pool_size_x = pool_info.is_global_pooling ? 
src->info()->tensor_shape().y() : pool_info.pool_size.width; - const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height; - const int pool_pad_right = pool_info.pad_stride_info.pad_right(); - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); - int pool_stride_x = 0; - int pool_stride_y = 0; + const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width; + const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height; + const int pool_pad_right = pool_info.pad_stride_info.pad_right(); + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); + int pool_stride_x = 0; + int pool_stride_y = 0; std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right); const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 0 : pool_pad_bottom); const float16_t min_value = get_initial_min<half_float::half>(pool_info.use_inf_as_limit); float16x8_t vres; - execute_window_loop(window_out, [&](const Coordinates & id) - { - const int idx_width = id.y() * pool_stride_x; - const int idx_height = id.z() * pool_stride_y; - const int pool_limit_y = pool_pad_top - idx_height; - const int pool_limit_x = pool_pad_left - idx_width; - - const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y); - const int pool_end_y = std::min(pool_size_y, window_src.z().end() + pool_limit_y); - const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x); - const int pool_end_x = std::min(pool_size_x, window_src.y().end() + pool_limit_x); - - int x_off = window_start_x; - for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) + execute_window_loop( + window_out, + [&](const Coordinates &id) { - if(pool_info.pool_type != PoolingType::MAX) + const int idx_width = id.y() * pool_stride_x; + const int idx_height = id.z() * pool_stride_y; + const int pool_limit_y = pool_pad_top - idx_height; + const int pool_limit_x = pool_pad_left - idx_width; + + const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y); + const int pool_end_y = std::min(pool_size_y, window_src.z().end() + pool_limit_y); + const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x); + const int pool_end_x = std::min(pool_size_x, window_src.y().end() + pool_limit_x); + + int x_off = window_start_x; + for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) { - // Calculate scale - const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); - const float16x8_t scale_v = vdupq_n_f16(scale); - - // Perform pooling - vres = vdupq_n_f16(0.0f); - for(int y = pool_start_y; y < pool_end_y; ++y) + if (pool_info.pool_type != PoolingType::MAX) { - for(int x = pool_start_x; x < pool_end_x; ++x) + // Calculate scale + const float scale = calculate_avg_scale_pool2d( + pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, 
upper_bound_w, + upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + const float16x8_t scale_v = vdupq_n_f16(scale); + + // Perform pooling + vres = vdupq_n_f16(0.0f); + for (int y = pool_start_y; y < pool_end_y; ++y) { - const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int> - (src->info()->strides_in_bytes().z())) + x_off); - - // Get power of 2 in case of l2 pooling and accumulate - if(pool_info.pool_type == PoolingType::L2) + for (int x = pool_start_x; x < pool_end_x; ++x) { - vres = vaddq_f16(vres, vmulq_f16(data, data)); + const float16x8_t data = vld1q_f16( + reinterpret_cast<const float16_t *>( + in.ptr() + + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) + + x_off); + + // Get power of 2 in case of l2 pooling and accumulate + if (pool_info.pool_type == PoolingType::L2) + { + vres = vaddq_f16(vres, vmulq_f16(data, data)); + } + else + { + vres = vaddq_f16(vres, data); + } } - else + } + // Divide by scale + vres = vmulq_f16(vres, scale_v); + } + else + { + vres = vdupq_n_f16(min_value); + + for (int y = pool_start_y; y < pool_end_y; ++y) + { + for (int x = pool_start_x; x < pool_end_x; ++x) { - vres = vaddq_f16(vres, data); + const float16x8_t data = vld1q_f16( + reinterpret_cast<const float16_t *>( + in.ptr() + + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) + + x_off); + vres = vmaxq_f16(vres, data); } } } - // Divide by scale - vres = vmulq_f16(vres, scale_v); - } - else - { - vres = vdupq_n_f16(min_value); - for(int y = pool_start_y; y < pool_end_y; ++y) + // Calculate square-root in case of l2 pooling + if (pool_info.pool_type == PoolingType::L2) { - for(int x = pool_start_x; x < pool_end_x; ++x) - { - const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int> - (src->info()->strides_in_bytes().z())) + x_off); - vres = vmaxq_f16(vres, data); - } + float16x8_t sqrt_reciprocal = vrsqrteq_f16(vres); + vres = vmulq_f16(vres, vmulq_f16(vrsqrtsq_f16(vmulq_f16(vres, sqrt_reciprocal), sqrt_reciprocal), + sqrt_reciprocal)); } - } - // Calculate square-root in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - float16x8_t sqrt_reciprocal = vrsqrteq_f16(vres); - vres = vmulq_f16(vres, vmulq_f16(vrsqrtsq_f16(vmulq_f16(vres, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal)); + // Store result + vst1q_f16(reinterpret_cast<float16_t *>(out.ptr()) + x_off, vres); } - // Store result - vst1q_f16(reinterpret_cast<float16_t *>(out.ptr()) + x_off, vres); - } - - // Left-overs loop - for(; x_off < window_end_x; ++x_off) - { - float16_t res = 0.0f; - - if(pool_info.pool_type != PoolingType::MAX) + // Left-overs loop + for (; x_off < window_end_x; ++x_off) { - // Calculate scale - const float16_t scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); + float16_t res = 0.0f; - for(int y = pool_start_y; y < pool_end_y; ++y) + if (pool_info.pool_type != PoolingType::MAX) { - for(int x = pool_start_x; x < pool_end_x; 
++x) - { - const float data = *(reinterpret_cast<const float16_t *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int> - (src->info()->strides_in_bytes().z())) + x_off); + // Calculate scale + const float16_t scale = calculate_avg_scale_pool2d( + pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, + upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); - // Get power of 2 in case of l2 pooling and accumulate - if(pool_info.pool_type == PoolingType::L2) + for (int y = pool_start_y; y < pool_end_y; ++y) + { + for (int x = pool_start_x; x < pool_end_x; ++x) { - res += data * data; + const float data = + *(reinterpret_cast<const float16_t *>( + in.ptr() + + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) + + x_off); + + // Get power of 2 in case of l2 pooling and accumulate + if (pool_info.pool_type == PoolingType::L2) + { + res += data * data; + } + else + { + res += data; + } } - else + } + + // Divide by scale + res *= scale; + } + else + { + res = min_value; + for (int y = pool_start_y; y < pool_end_y; ++y) + { + for (int x = pool_start_x; x < pool_end_x; ++x) { - res += data; + const float16_t data = + *(reinterpret_cast<const float16_t *>( + in.ptr() + + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) + + x_off); + res = std::max(res, data); } } } - // Divide by scale - res *= scale; - } - else - { - res = min_value; - for(int y = pool_start_y; y < pool_end_y; ++y) + // Calculate square-root in case of l2 pooling + if (pool_info.pool_type == PoolingType::L2) { - for(int x = pool_start_x; x < pool_end_x; ++x) - { - const float16_t data = *(reinterpret_cast<const float16_t *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int> - (src->info()->strides_in_bytes().z())) + x_off); - res = std::max(res, data); - } + res = std::sqrt(res); } - } - // Calculate square-root in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - res = std::sqrt(res); + // Store result + *(reinterpret_cast<float16_t *>(out.ptr()) + x_off) = res; } - - // Store result - *(reinterpret_cast<float16_t *>(out.ptr()) + x_off) = res; - } - }, - in, out); + }, + in, out); } } // namespace cpu } // namespace arm_compute -#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
\ No newline at end of file +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/pool2d/neon/fp32.cpp b/src/cpu/kernels/pool2d/neon/fp32.cpp index a400f3a95d..aaa37863cb 100644 --- a/src/cpu/kernels/pool2d/neon/fp32.cpp +++ b/src/cpu/kernels/pool2d/neon/fp32.cpp @@ -24,8 +24,9 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" + #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" #include "src/cpu/kernels/pool2d/neon/list.h" namespace arm_compute @@ -34,7 +35,12 @@ namespace cpu { namespace { -void pooling2_f32_maxpool_indices(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) +void pooling2_f32_maxpool_indices(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) { const int window_start_x = window.x().start(); const int window_end_x = window.x().end(); @@ -50,8 +56,8 @@ void pooling2_f32_maxpool_indices(const ITensor *src, ITensor *dst0, ITensor *ds const int pool_pad_top = pool_info.pad_stride_info.pad_top(); const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - int pool_stride_x = 0; - int pool_stride_y = 0; + int pool_stride_x = 0; + int pool_stride_y = 0; std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); float32x4_t vres; @@ -63,89 +69,102 @@ void pooling2_f32_maxpool_indices(const ITensor *src, ITensor *dst0, ITensor *ds const int in_stride_y = static_cast<int>(src->info()->strides_in_bytes().y()); const int in_stride_z = static_cast<int>(src->info()->strides_in_bytes().z()); - execute_window_loop(window_out, [&](const Coordinates & id) - { - const int idx_width = id.y() * pool_stride_x; - const int idx_height = id.z() * pool_stride_y; - const int pool_limit_y = pool_pad_top - idx_height; - const int pool_limit_x = pool_pad_left - idx_width; - - const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y); - const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x); - - const int in_x0_offset = (pool_start_x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z()); - const int in_x1_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int> - (src->info()->strides_in_bytes().z()); - const int in_x2_offset = (pool_start_x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int> - (src->info()->strides_in_bytes().z()); - const int in_x3_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int> - (src->info()->strides_in_bytes().z()); - - int x_off = window_start_x; - for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) - { - const auto in_x0_ptr = reinterpret_cast<const float *>(in.ptr() + in_x0_offset); - const auto in_x1_ptr = reinterpret_cast<const float *>(in.ptr() + in_x1_offset); - const auto in_x2_ptr = reinterpret_cast<const float *>(in.ptr() + in_x2_offset); - const auto in_x3_ptr = reinterpret_cast<const float *>(in.ptr() + in_x3_offset); 
- const auto v_x0 = vld1q_f32(in_x0_ptr + x_off); - const auto v_x1 = vld1q_f32(in_x1_ptr + x_off); - const auto v_x2 = vld1q_f32(in_x2_ptr + x_off); - const auto v_x3 = vld1q_f32(in_x3_ptr + x_off); - vres = vmaxq_f32(vmaxq_f32(v_x2, v_x3), vmaxq_f32(v_x0, v_x1)); - // Store result - vst1q_f32(reinterpret_cast<float *>(out.ptr()) + x_off, vres); - - const uint32_t offset_base = offset_no_padding<float>(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NHWC); - const uint32_t offset_x0 = offset_base / sizeof(float) + x_off; - const uint32_t offset_x1 = offset_x0 + in_stride_y / sizeof(float) - pad_horizontal; - const uint32_t offset_x2 = offset_x0 + in_stride_z / sizeof(float) - pad_horizontal * src->info()->tensor_shape()[1]; - const uint32_t offset_x3 = offset_x2 + in_stride_y / sizeof(float) - pad_horizontal; - const uint32x4_t voffset_x0 = { offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3 }; - const uint32x4_t voffset_x1 = { offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3 }; - const uint32x4_t voffset_x2 = { offset_x2, offset_x2 + 1, offset_x2 + 2, offset_x2 + 3 }; - const uint32x4_t voffset_x3 = { offset_x3, offset_x3 + 1, offset_x3 + 2, offset_x3 + 3 }; - const uint32x4_t tmp_indices0 = vbslq_u32(vcgeq_f32(v_x0, v_x1), voffset_x0, voffset_x1); - const uint32x4_t tmp_indices1 = vbslq_u32(vcgeq_f32(v_x2, v_x3), voffset_x2, voffset_x3); - const uint32x4_t tmp_indices2 = vbslq_u32(vcgeq_f32(vmaxq_f32(v_x0, v_x1), vmaxq_f32(v_x2, v_x3)), tmp_indices0, tmp_indices1); - - // Store indices - vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off, tmp_indices2); - } - - // Left-overs loop - for(; x_off < window_end_x; ++x_off) + execute_window_loop( + window_out, + [&](const Coordinates &id) { - const auto x0 = *(reinterpret_cast<const float *>(in.ptr() + in_x0_offset) + x_off); - const auto x1 = *(reinterpret_cast<const float *>(in.ptr() + in_x1_offset) + x_off); - const auto x2 = *(reinterpret_cast<const float *>(in.ptr() + in_x2_offset) + x_off); - const auto x3 = *(reinterpret_cast<const float *>(in.ptr() + in_x3_offset) + x_off); - res = std::max(std::max(x2, x3), std::max(x0, x1)); - - // Store result - *(reinterpret_cast<float *>(out.ptr()) + x_off) = res; - - const uint32_t offset_base = offset_no_padding<float>(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NHWC); - const uint32_t offset_x0 = offset_base / sizeof(float) + x_off; - const uint32_t offset_x1 = offset_x0 + in_stride_y / sizeof(float) - pad_horizontal; - const uint32_t offset_x2 = offset_x0 + in_stride_z / sizeof(float) - pad_horizontal * src->info()->tensor_shape()[1]; - const uint32_t offset_x3 = offset_x2 + in_stride_y / sizeof(float) - pad_horizontal; - const uint32_t tmp_idx0 = (x0 >= x1) ? offset_x0 : offset_x1; - const uint32_t tmp_idx1 = (x2 >= x3) ? offset_x2 : offset_x3; - const uint32_t tmp_idx2 = (std::max(x0, x1) >= std::max(x2, x3)) ? 
tmp_idx0 : tmp_idx1; - - // Store indices - *(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off) = tmp_idx2; - } - }, - in, out, indices); + const int idx_width = id.y() * pool_stride_x; + const int idx_height = id.z() * pool_stride_y; + const int pool_limit_y = pool_pad_top - idx_height; + const int pool_limit_x = pool_pad_left - idx_width; + + const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y); + const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x); + + const int in_x0_offset = + (pool_start_x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + + (pool_start_y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z()); + const int in_x1_offset = + (pool_start_x + 1 - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + + (pool_start_y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z()); + const int in_x2_offset = + (pool_start_x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + + (pool_start_y + 1 - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z()); + const int in_x3_offset = + (pool_start_x + 1 - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + + (pool_start_y + 1 - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z()); + + int x_off = window_start_x; + for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) + { + const auto in_x0_ptr = reinterpret_cast<const float *>(in.ptr() + in_x0_offset); + const auto in_x1_ptr = reinterpret_cast<const float *>(in.ptr() + in_x1_offset); + const auto in_x2_ptr = reinterpret_cast<const float *>(in.ptr() + in_x2_offset); + const auto in_x3_ptr = reinterpret_cast<const float *>(in.ptr() + in_x3_offset); + const auto v_x0 = vld1q_f32(in_x0_ptr + x_off); + const auto v_x1 = vld1q_f32(in_x1_ptr + x_off); + const auto v_x2 = vld1q_f32(in_x2_ptr + x_off); + const auto v_x3 = vld1q_f32(in_x3_ptr + x_off); + vres = vmaxq_f32(vmaxq_f32(v_x2, v_x3), vmaxq_f32(v_x0, v_x1)); + // Store result + vst1q_f32(reinterpret_cast<float *>(out.ptr()) + x_off, vres); + + const uint32_t offset_base = offset_no_padding<float>(in.offset(), id, *src->info(), pool_stride_x, + pool_stride_y, DataLayout::NHWC); + const uint32_t offset_x0 = offset_base / sizeof(float) + x_off; + const uint32_t offset_x1 = offset_x0 + in_stride_y / sizeof(float) - pad_horizontal; + const uint32_t offset_x2 = + offset_x0 + in_stride_z / sizeof(float) - pad_horizontal * src->info()->tensor_shape()[1]; + const uint32_t offset_x3 = offset_x2 + in_stride_y / sizeof(float) - pad_horizontal; + const uint32x4_t voffset_x0 = {offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3}; + const uint32x4_t voffset_x1 = {offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3}; + const uint32x4_t voffset_x2 = {offset_x2, offset_x2 + 1, offset_x2 + 2, offset_x2 + 3}; + const uint32x4_t voffset_x3 = {offset_x3, offset_x3 + 1, offset_x3 + 2, offset_x3 + 3}; + const uint32x4_t tmp_indices0 = vbslq_u32(vcgeq_f32(v_x0, v_x1), voffset_x0, voffset_x1); + const uint32x4_t tmp_indices1 = vbslq_u32(vcgeq_f32(v_x2, v_x3), voffset_x2, voffset_x3); + const uint32x4_t tmp_indices2 = + vbslq_u32(vcgeq_f32(vmaxq_f32(v_x0, v_x1), vmaxq_f32(v_x2, v_x3)), tmp_indices0, tmp_indices1); + + // Store indices + vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off, tmp_indices2); + } + + // Left-overs loop + for (; x_off < window_end_x; ++x_off) + { + const auto x0 = *(reinterpret_cast<const float *>(in.ptr() 
+ in_x0_offset) + x_off); + const auto x1 = *(reinterpret_cast<const float *>(in.ptr() + in_x1_offset) + x_off); + const auto x2 = *(reinterpret_cast<const float *>(in.ptr() + in_x2_offset) + x_off); + const auto x3 = *(reinterpret_cast<const float *>(in.ptr() + in_x3_offset) + x_off); + res = std::max(std::max(x2, x3), std::max(x0, x1)); + + // Store result + *(reinterpret_cast<float *>(out.ptr()) + x_off) = res; + + const uint32_t offset_base = offset_no_padding<float>(in.offset(), id, *src->info(), pool_stride_x, + pool_stride_y, DataLayout::NHWC); + const uint32_t offset_x0 = offset_base / sizeof(float) + x_off; + const uint32_t offset_x1 = offset_x0 + in_stride_y / sizeof(float) - pad_horizontal; + const uint32_t offset_x2 = + offset_x0 + in_stride_z / sizeof(float) - pad_horizontal * src->info()->tensor_shape()[1]; + const uint32_t offset_x3 = offset_x2 + in_stride_y / sizeof(float) - pad_horizontal; + const uint32_t tmp_idx0 = (x0 >= x1) ? offset_x0 : offset_x1; + const uint32_t tmp_idx1 = (x2 >= x3) ? offset_x2 : offset_x3; + const uint32_t tmp_idx2 = (std::max(x0, x1) >= std::max(x2, x3)) ? tmp_idx0 : tmp_idx1; + + // Store indices + *(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off) = tmp_idx2; + } + }, + in, out, indices); } } // namespace -void poolingMxN_fp32_neon_nhwc_kernel_indices(const ITensor *src, ITensor *dst0, ITensor *dst1, const PoolingLayerInfo &pool_info, const Window &window) +void poolingMxN_fp32_neon_nhwc_kernel_indices( + const ITensor *src, ITensor *dst0, ITensor *dst1, const PoolingLayerInfo &pool_info, const Window &window) { - const int window_start_x = window.x().start(); - const int window_end_x = window.x().end(); + const int window_start_x = window.x().start(); + const int window_end_x = window.x().end(); constexpr int window_step_x = 4; Window window_out = window; @@ -160,8 +179,8 @@ void poolingMxN_fp32_neon_nhwc_kernel_indices(const ITensor *src, ITensor *dst0, const int pool_pad_top = pool_info.pad_stride_info.pad_top(); const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - int pool_stride_x = 0; - int pool_stride_y = 0; + int pool_stride_x = 0; + int pool_stride_y = 0; std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); const float min_value = get_initial_min<float>(pool_info.use_inf_as_limit); @@ -169,9 +188,9 @@ void poolingMxN_fp32_neon_nhwc_kernel_indices(const ITensor *src, ITensor *dst0, float32x4_t vres; uint32x4_t vidx; - constexpr int idx_width = 1; - constexpr int idx_height = 2; - constexpr int idx_batch = 3; + constexpr int idx_width = 1; + constexpr int idx_height = 2; + constexpr int idx_batch = 3; const int y_stride = static_cast<int>(src->info()->strides_in_bytes().y()); const int z_stride = static_cast<int>(src->info()->strides_in_bytes().z()); @@ -182,89 +201,97 @@ void poolingMxN_fp32_neon_nhwc_kernel_indices(const ITensor *src, ITensor *dst0, const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); - execute_window_loop(window_out, [&](const Coordinates & id) - { - const int idx_width = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left; - const int idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top; + execute_window_loop( + window_out, + [&](const Coordinates &id) + { + const int idx_width = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left; + const int idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top; - const int pool_start_x = std::max(0, -idx_width); - const int pool_start_y = std::max(0, 
-idx_height); + const int pool_start_x = std::max(0, -idx_width); + const int pool_start_y = std::max(0, -idx_height); - const int pool_end_x = std::min(pool_size_x, input_dim_w - idx_width); - const int pool_end_y = std::min(pool_size_y, input_dim_h - idx_height); + const int pool_end_x = std::min(pool_size_x, input_dim_w - idx_width); + const int pool_end_y = std::min(pool_size_y, input_dim_h - idx_height); - const uint8_t *in_ptr_n = in_ptr_start + id[idx_batch] * n_stride; + const uint8_t *in_ptr_n = in_ptr_start + id[idx_batch] * n_stride; - const int in_ptr_y_offset = (z_stride * idx_height) + (pool_start_y * z_stride); - const int in_ptr_x_offset = (y_stride * idx_width) + (pool_start_x * y_stride); + const int in_ptr_y_offset = (z_stride * idx_height) + (pool_start_y * z_stride); + const int in_ptr_x_offset = (y_stride * idx_width) + (pool_start_x * y_stride); - int x_off = window_start_x; + int x_off = window_start_x; - for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) - { - vres = vdupq_n_f32(min_value); - vidx = vdupq_n_u32(0U); - const uint8_t *in_ptr_y = in_ptr_n + in_ptr_y_offset + in_ptr_x_offset; - uint32_t curr_kernel_index = pool_size_x * pool_start_y; - for(int y = pool_start_y; y < pool_end_y; ++y) + for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) { - const uint8_t *in_ptr_x = in_ptr_y + (x_off * sizeof(float)); - curr_kernel_index += pool_start_x; - for(int x = pool_start_x; x < pool_end_x; ++x) + vres = vdupq_n_f32(min_value); + vidx = vdupq_n_u32(0U); + const uint8_t *in_ptr_y = in_ptr_n + in_ptr_y_offset + in_ptr_x_offset; + uint32_t curr_kernel_index = pool_size_x * pool_start_y; + for (int y = pool_start_y; y < pool_end_y; ++y) { - const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(in_ptr_x)); - const uint32x4_t vidx_curr = vdupq_n_u32(curr_kernel_index); - const uint32x4_t idxMask = vcgtq_f32(data, vres); - vidx = vbslq_u32(idxMask, vidx_curr, vidx); - vres = vmaxq_f32(vres, data); - in_ptr_x += y_stride; - curr_kernel_index++; + const uint8_t *in_ptr_x = in_ptr_y + (x_off * sizeof(float)); + curr_kernel_index += pool_start_x; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(in_ptr_x)); + const uint32x4_t vidx_curr = vdupq_n_u32(curr_kernel_index); + const uint32x4_t idxMask = vcgtq_f32(data, vres); + vidx = vbslq_u32(idxMask, vidx_curr, vidx); + vres = vmaxq_f32(vres, data); + in_ptr_x += y_stride; + curr_kernel_index++; + } + curr_kernel_index += (pool_size_x - pool_end_x); + in_ptr_y += z_stride; } - curr_kernel_index += (pool_size_x - pool_end_x); - in_ptr_y += z_stride; + // Store result + vst1q_f32(reinterpret_cast<float *>(out.ptr()) + x_off, vres); + vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off, vidx); } - // Store result - vst1q_f32(reinterpret_cast<float *>(out.ptr()) + x_off, vres); - vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off, vidx); - } - // Left-overs loop - for(; x_off < window_end_x; ++x_off) - { - float res = min_value; - uint32_t idx = 0U; - const uint8_t *in_ptr_y = in_ptr_n + in_ptr_y_offset + in_ptr_x_offset; - for(int y = pool_start_y; y < pool_end_y; ++y) + // Left-overs loop + for (; x_off < window_end_x; ++x_off) { - const uint8_t *in_ptr_x = in_ptr_y + (x_off * sizeof(float)); - for(int x = pool_start_x; x < pool_end_x; ++x) + float res = min_value; + uint32_t idx = 0U; + const uint8_t *in_ptr_y = in_ptr_n + in_ptr_y_offset + in_ptr_x_offset; + for (int y = 
pool_start_y; y < pool_end_y; ++y) { - const float data = *(reinterpret_cast<const float *>(in_ptr_x)); - if(data > res) + const uint8_t *in_ptr_x = in_ptr_y + (x_off * sizeof(float)); + for (int x = pool_start_x; x < pool_end_x; ++x) { - idx = pool_size_x * y + x; - res = data; + const float data = *(reinterpret_cast<const float *>(in_ptr_x)); + if (data > res) + { + idx = pool_size_x * y + x; + res = data; + } + in_ptr_x += y_stride; } - in_ptr_x += y_stride; + in_ptr_y += z_stride; } - in_ptr_y += z_stride; - } - // Store result - *(reinterpret_cast<float *>(out.ptr()) + x_off) = res; - *(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off) = idx; - } - }, - out, indices); + // Store result + *(reinterpret_cast<float *>(out.ptr()) + x_off) = res; + *(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off) = idx; + } + }, + out, indices); } -void poolingMxN_fp32_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) +void poolingMxN_fp32_neon_nhwc(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) { - if((pool_info.pool_type == PoolingType::MAX) && pool_info.use_kernel_indices && (dst1 != nullptr)) + if ((pool_info.pool_type == PoolingType::MAX) && pool_info.use_kernel_indices && (dst1 != nullptr)) { poolingMxN_fp32_neon_nhwc_kernel_indices(src, dst0, dst1, pool_info, window); } - else if(pool_info.pool_size == Size2D(2, 2) && pool_info.pool_type == PoolingType::MAX && !pool_info.pad_stride_info.has_padding() && (dst1 != nullptr)) + else if (pool_info.pool_size == Size2D(2, 2) && pool_info.pool_type == PoolingType::MAX && + !pool_info.pad_stride_info.has_padding() && (dst1 != nullptr)) { pooling2_f32_maxpool_indices(src, dst0, dst1, pool_info, window_src, window); } @@ -280,153 +307,174 @@ void poolingMxN_fp32_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, Iterator in(src, window_src); Iterator out(dst0, window_out); - const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width; - const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height; - const int pool_pad_right = pool_info.pad_stride_info.pad_right(); - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); - int pool_stride_x = 0; - int pool_stride_y = 0; + const int pool_size_x = + pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width; + const int pool_size_y = + pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height; + const int pool_pad_right = pool_info.pad_stride_info.pad_right(); + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); + int pool_stride_x = 0; + int pool_stride_y = 0; std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right); const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 
0 : pool_pad_bottom); const float min_value = get_initial_min<float>(pool_info.use_inf_as_limit); float32x4_t vres; - execute_window_loop(window_out, [&](const Coordinates & id) - { - const int idx_width = id.y() * pool_stride_x; - const int idx_height = id.z() * pool_stride_y; - const int pool_limit_y = pool_pad_top - idx_height; - const int pool_limit_x = pool_pad_left - idx_width; - - const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y); - const int pool_end_y = std::min(pool_size_y, window_src.z().end() + pool_limit_y); - const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x); - const int pool_end_x = std::min(pool_size_x, window_src.y().end() + pool_limit_x); - - int x_off = window_start_x; - for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) + execute_window_loop( + window_out, + [&](const Coordinates &id) { - if(pool_info.pool_type != PoolingType::MAX) + const int idx_width = id.y() * pool_stride_x; + const int idx_height = id.z() * pool_stride_y; + const int pool_limit_y = pool_pad_top - idx_height; + const int pool_limit_x = pool_pad_left - idx_width; + + const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y); + const int pool_end_y = std::min(pool_size_y, window_src.z().end() + pool_limit_y); + const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x); + const int pool_end_x = std::min(pool_size_x, window_src.y().end() + pool_limit_x); + + int x_off = window_start_x; + for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) { - // Calculate scale - const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); - const float32x4_t scale_v = vdupq_n_f32(scale); + if (pool_info.pool_type != PoolingType::MAX) + { + // Calculate scale + const float scale = calculate_avg_scale_pool2d( + pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, + upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + const float32x4_t scale_v = vdupq_n_f32(scale); - // Perform pooling - vres = vdupq_n_f32(0.0f); + // Perform pooling + vres = vdupq_n_f32(0.0f); - for(int y = pool_start_y; y < pool_end_y; ++y) - { - for(int x = pool_start_x; x < pool_end_x; ++x) + for (int y = pool_start_y; y < pool_end_y; ++y) { - const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int> - (src->info()->strides_in_bytes().z())) + x_off); - - // Get power of 2 in case of l2 pooling and accumulate - if(pool_info.pool_type == PoolingType::L2) - { - vres = vmlaq_f32(vres, data, data); - } - else + for (int x = pool_start_x; x < pool_end_x; ++x) { - vres = vaddq_f32(vres, data); + const float32x4_t data = vld1q_f32( + reinterpret_cast<const float *>( + in.ptr() + + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) + + x_off); + + // Get power of 2 in case of l2 pooling and accumulate + if (pool_info.pool_type == PoolingType::L2) + { + vres = vmlaq_f32(vres, data, data); + } + else + { + vres = vaddq_f32(vres, data); + } } } + // Divide by scale + vres = vmulq_f32(vres, scale_v); } - // Divide by scale - vres = vmulq_f32(vres, scale_v); - } - else - { - vres = 
vdupq_n_f32(min_value); - for(int y = pool_start_y; y < pool_end_y; ++y) + else { - for(int x = pool_start_x; x < pool_end_x; ++x) + vres = vdupq_n_f32(min_value); + for (int y = pool_start_y; y < pool_end_y; ++y) { - const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int> - (src->info()->strides_in_bytes().z())) + x_off); - vres = vmaxq_f32(vres, data); + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const float32x4_t data = vld1q_f32( + reinterpret_cast<const float *>( + in.ptr() + + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) + + x_off); + vres = vmaxq_f32(vres, data); + } } } - } - // Calculate square-root in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - float32x4_t l2_res = { static_cast<float>(sqrt(vgetq_lane_f32(vres, 0))), - static_cast<float>(sqrt(vgetq_lane_f32(vres, 1))), - static_cast<float>(sqrt(vgetq_lane_f32(vres, 2))), - static_cast<float>(sqrt(vgetq_lane_f32(vres, 3))) - }; - vres = l2_res; - } - - // Store result - vst1q_f32(reinterpret_cast<float *>(out.ptr()) + x_off, vres); - } + // Calculate square-root in case of l2 pooling + if (pool_info.pool_type == PoolingType::L2) + { + float32x4_t l2_res = {static_cast<float>(sqrt(vgetq_lane_f32(vres, 0))), + static_cast<float>(sqrt(vgetq_lane_f32(vres, 1))), + static_cast<float>(sqrt(vgetq_lane_f32(vres, 2))), + static_cast<float>(sqrt(vgetq_lane_f32(vres, 3)))}; + vres = l2_res; + } - // Left-overs loop - for(; x_off < window_end_x; ++x_off) - { - float res = 0.0f; + // Store result + vst1q_f32(reinterpret_cast<float *>(out.ptr()) + x_off, vres); + } - if(pool_info.pool_type != PoolingType::MAX) + // Left-overs loop + for (; x_off < window_end_x; ++x_off) { - // Calculate scale - const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); + float res = 0.0f; - for(int y = pool_start_y; y < pool_end_y; ++y) + if (pool_info.pool_type != PoolingType::MAX) { - for(int x = pool_start_x; x < pool_end_x; ++x) - { - const float data = *(reinterpret_cast<const float *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int> - (src->info()->strides_in_bytes().z())) + x_off); + // Calculate scale + const float scale = calculate_avg_scale_pool2d( + pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, + upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); - // Get power of 2 in case of l2 pooling and accumulate - if(pool_info.pool_type == PoolingType::L2) + for (int y = pool_start_y; y < pool_end_y; ++y) + { + for (int x = pool_start_x; x < pool_end_x; ++x) { - res += data * data; + const float data = + *(reinterpret_cast<const float *>( + in.ptr() + + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) + + x_off); + + // Get power of 2 in case of l2 pooling and accumulate + if (pool_info.pool_type == PoolingType::L2) + { + res += data * data; + } + else + { + res += data; + } } - else + } + + // Divide by scale + res *= scale; + } + else + { + res = min_value; + for (int y = 
pool_start_y; y < pool_end_y; ++y) + { + for (int x = pool_start_x; x < pool_end_x; ++x) { - res += data; + const float data = + *(reinterpret_cast<const float *>( + in.ptr() + + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) + + x_off); + res = std::max(res, data); } } } - // Divide by scale - res *= scale; - } - else - { - res = min_value; - for(int y = pool_start_y; y < pool_end_y; ++y) + // Calculate square-root in case of l2 pooling + if (pool_info.pool_type == PoolingType::L2) { - for(int x = pool_start_x; x < pool_end_x; ++x) - { - const float data = *(reinterpret_cast<const float *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int> - (src->info()->strides_in_bytes().z())) + x_off); - res = std::max(res, data); - } + res = std::sqrt(res); } - } - // Calculate square-root in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - res = std::sqrt(res); + // Store result + *(reinterpret_cast<float *>(out.ptr()) + x_off) = res; } - - // Store result - *(reinterpret_cast<float *>(out.ptr()) + x_off) = res; - } - }, - in, out); + }, + in, out); } } } // namespace cpu diff --git a/src/cpu/kernels/pool2d/neon/list.h b/src/cpu/kernels/pool2d/neon/list.h index eb141d6fcd..f8f458a63e 100644 --- a/src/cpu/kernels/pool2d/neon/list.h +++ b/src/cpu/kernels/pool2d/neon/list.h @@ -26,16 +26,19 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/Traits.h" + #include "src/core/NEON/wrapper/wrapper.h" #include "src/cpu/kernels/pool2d/neon/quantized.h" + #include <arm_neon.h> namespace arm_compute { namespace cpu { -#define DECLARE_POOLING_KERNEL(func_name) \ - void func_name(const ITensor *src0, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &, const Window &window_src, const Window &window) +#define DECLARE_POOLING_KERNEL(func_name) \ + void func_name(const ITensor *src0, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &, const Window &window_src, \ + const Window &window) DECLARE_POOLING_KERNEL(poolingMxN_qasymm8_neon_nhwc); DECLARE_POOLING_KERNEL(poolingMxN_qasymm8_signed_neon_nhwc); @@ -65,7 +68,12 @@ T get_initial_min(bool use_inf_as_limit) } template <typename T> -inline uint32_t offset_no_padding(uint32_t padded_offset, const Coordinates &id, const ITensorInfo &info, int pool_stride_x, int pool_stride_y, DataLayout data_layout) +inline uint32_t offset_no_padding(uint32_t padded_offset, + const Coordinates &id, + const ITensorInfo &info, + int pool_stride_x, + int pool_stride_y, + DataLayout data_layout) { const int pad_left = info.padding().left; const int pad_right = info.padding().right; @@ -76,22 +84,24 @@ inline uint32_t offset_no_padding(uint32_t padded_offset, const Coordinates &id, const int pad_horiz = pad_left + pad_right; const int pad_vert = pad_top + pad_bottom; - if(data_layout == DataLayout::NCHW) + if (data_layout == DataLayout::NCHW) { - const uint32_t offset_base = padded_offset - - sizeof(T) * pad_horiz * id.y() * pool_stride_y /* subtract padding elems per row */ - - pad_top * sizeof(T) /* top padding */ - - sizeof(T) * pad_horiz * info.tensor_shape()[1] * id.z() - pad_vert * in_stride_y * id.z() /* for each Z plane there are height*pad_right padding elems */ - - in_stride_w * id[3]; + const uint32_t offset_base = + padded_offset - sizeof(T) * pad_horiz * id.y() * pool_stride_y /* subtract padding elems per row */ + - pad_top * sizeof(T) /* top padding */ + 
- sizeof(T) * pad_horiz * info.tensor_shape()[1] * id.z() - + pad_vert * in_stride_y * id.z() /* for each Z plane there are height*pad_right padding elems */ + - in_stride_w * id[3]; return offset_base; } else { - const uint32_t offset_base = padded_offset - - sizeof(T) * pad_horiz * id.y() * pool_stride_x // subtract padding elems per row - - pad_top * sizeof(T) // top padding - - sizeof(T) * pad_horiz * info.tensor_shape()[1] * id.z() * pool_stride_y // for each Z plane there are width*pad_right padding elems + const uint32_t offset_base = padded_offset - + sizeof(T) * pad_horiz * id.y() * pool_stride_x // subtract padding elems per row + - pad_top * sizeof(T) // top padding + - sizeof(T) * pad_horiz * info.tensor_shape()[1] * id.z() * + pool_stride_y // for each Z plane there are width*pad_right padding elems - in_stride_w * id[3]; return offset_base; @@ -100,4 +110,4 @@ inline uint32_t offset_no_padding(uint32_t padded_offset, const Coordinates &id, } // namespace cpu } // namespace arm_compute -#endif // SRC_CORE_NEON_KERNELS_POOLING_LIST_H
\ No newline at end of file +#endif // SRC_CORE_NEON_KERNELS_POOLING_LIST_H diff --git a/src/cpu/kernels/pool2d/neon/nchw/all.cpp b/src/cpu/kernels/pool2d/neon/nchw/all.cpp index c342b96426..ee4a67b0fb 100644 --- a/src/cpu/kernels/pool2d/neon/nchw/all.cpp +++ b/src/cpu/kernels/pool2d/neon/nchw/all.cpp @@ -25,9 +25,11 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" + #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" #include "src/cpu/kernels/pool2d/neon/list.h" + #include <limits> #ifdef ENABLE_NCHW_KERNELS @@ -38,15 +40,19 @@ namespace cpu #define READ_2_RIGHT_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval) \ (x == width + pad_left - 1) ? vset_lane_f32(*(ptr), vdup_n_f32(fval), 0) : vld1_f32(ptr) #define READ_2_LEFT_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval) \ - (x == pad_left - 1) ? vset_lane_f32(*(1 + ptr), vdup_n_f32(fval), 1) : READ_2_RIGHT_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval) -#define READ_2_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval) \ - ((y < pad_top) || (x < pad_left - 1) || (y >= height + pad_top) || (x > width + pad_left - 1)) ? vdup_n_f32(fval) : READ_2_LEFT_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval) + (x == pad_left - 1) ? vset_lane_f32(*(1 + ptr), vdup_n_f32(fval), 1) \ + : READ_2_RIGHT_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval) +#define READ_2_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval) \ + ((y < pad_top) || (x < pad_left - 1) || (y >= height + pad_top) || (x > width + pad_left - 1)) \ + ? vdup_n_f32(fval) \ + : READ_2_LEFT_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval) #define READ_4_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval) \ vcombine_f32(READ_2_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval), \ READ_2_BOUNDARY_AWARE(height, width, pad_left, pad_top, (x + 2), y, (ptr + 2), fval)) -float32x4x2_t read_8_boundary_aware(int height, int width, int pad_left, int pad_top, int x, int y, const float *ptr, float fval) +float32x4x2_t +read_8_boundary_aware(int height, int width, int pad_left, int pad_top, int x, int y, const float *ptr, float fval) { float32x4x2_t vec; vec.val[0] = READ_4_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval); @@ -56,13 +62,14 @@ float32x4x2_t read_8_boundary_aware(int height, int width, int pad_left, int pad #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -float16x4_t read_4_boundary_aware_fp16(int srcw, int srch, int pad_l, int pad_t, int x, int y, const float16_t *ptr, float16_t fval) +float16x4_t +read_4_boundary_aware_fp16(int srcw, int srch, int pad_l, int pad_t, int x, int y, const float16_t *ptr, float16_t fval) { float16_t vec[4]; const bool row_in_bounds((y >= pad_t) && (y < (srch + pad_t))); - for(int i = 0; i < 4; i++) + for (int i = 0; i < 4; i++) { - if(row_in_bounds && (x + i >= pad_l) && (x + i < (srcw + pad_l))) + if (row_in_bounds && (x + i >= pad_l) && (x + i < (srcw + pad_l))) { vec[i] = *(ptr + i); } @@ -74,94 +81,106 @@ float16x4_t read_4_boundary_aware_fp16(int srcw, int srch, int pad_l, int pad_t, return wrapper::vload(vec); } -void pooling3_fp16_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) +void pooling3_fp16_neon_nchw(const 
ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) { ARM_COMPUTE_UNUSED(dst1); Iterator in(src, window_src); Iterator out(dst0, window); - constexpr const int pool_size = 3; - const int pool_pad_right = pool_info.pad_stride_info.pad_right(); - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); - int pool_stride_x = 0; - int pool_stride_y = 0; - std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); - const int src_w = src->info()->dimension(0); - const int src_h = src->info()->dimension(1); - const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right); - const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom); - const float16_t fp16_min = get_initial_min<half_float::half>(pool_info.use_inf_as_limit); - const float16_t fill_value = (pool_info.pool_type == PoolingType::MAX) ? fp16_min : 0.f; - const unsigned char *const src_top_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))); - const unsigned char *const src_middle_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)); - const unsigned char *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2)); - - execute_window_loop(window, [&](const Coordinates & id) - { - const auto x_val = id.x() * pool_stride_x; - const auto y_val_0 = id.y() * pool_stride_y; - const auto y_val_1 = (id.y() * pool_stride_y) + 1; - const auto y_val_2 = (id.y() * pool_stride_y) + 2; - float16x4_t top_data = read_4_boundary_aware_fp16(src_w, src_h, pool_pad_left, pool_pad_top, - x_val, y_val_0, reinterpret_cast<const float16_t *>(src_top_ptr + in.offset()), fill_value); - float16x4_t middle_data = read_4_boundary_aware_fp16(src_w, src_h, pool_pad_left, pool_pad_top, - x_val, y_val_1, reinterpret_cast<const float16_t *>(src_middle_ptr + in.offset()), fill_value); - float16x4_t bottom_data = read_4_boundary_aware_fp16(src_w, src_h, pool_pad_left, pool_pad_top, - x_val, y_val_2, reinterpret_cast<const float16_t *>(src_bottom_ptr + in.offset()), fill_value); - float16x4_t res = {}; - - // Get power of 2 in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) + constexpr const int pool_size = 3; + const int pool_pad_right = pool_info.pad_stride_info.pad_right(); + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); + int pool_stride_x = 0; + int pool_stride_y = 0; + std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); + const int src_w = src->info()->dimension(0); + const int src_h = src->info()->dimension(1); + const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right); + const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom); + const float16_t fp16_min = get_initial_min<half_float::half>(pool_info.use_inf_as_limit); + const float16_t fill_value = (pool_info.pool_type == PoolingType::MAX) ? 
fp16_min : 0.f; + const unsigned char *const src_top_ptr = + src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))); + const unsigned char *const src_middle_ptr = + src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)); + const unsigned char *const src_bottom_ptr = + src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2)); + + execute_window_loop( + window, + [&](const Coordinates &id) { - top_data = vmul_f16(top_data, top_data); - middle_data = vmul_f16(middle_data, middle_data); - bottom_data = vmul_f16(bottom_data, bottom_data); - } + const auto x_val = id.x() * pool_stride_x; + const auto y_val_0 = id.y() * pool_stride_y; + const auto y_val_1 = (id.y() * pool_stride_y) + 1; + const auto y_val_2 = (id.y() * pool_stride_y) + 2; + float16x4_t top_data = + read_4_boundary_aware_fp16(src_w, src_h, pool_pad_left, pool_pad_top, x_val, y_val_0, + reinterpret_cast<const float16_t *>(src_top_ptr + in.offset()), fill_value); + float16x4_t middle_data = read_4_boundary_aware_fp16( + src_w, src_h, pool_pad_left, pool_pad_top, x_val, y_val_1, + reinterpret_cast<const float16_t *>(src_middle_ptr + in.offset()), fill_value); + float16x4_t bottom_data = read_4_boundary_aware_fp16( + src_w, src_h, pool_pad_left, pool_pad_top, x_val, y_val_2, + reinterpret_cast<const float16_t *>(src_bottom_ptr + in.offset()), fill_value); + float16x4_t res = {}; - if(pool_info.pool_type != PoolingType::MAX) - { - // Calculate scale - const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); - const float16x4_t scale_v = vdup_n_f16(scale); - // Perform pooling - const float16x4_t sum_data = vadd_f16(vadd_f16(top_data, bottom_data), middle_data); - res = vpadd_f16(vset_lane_f16(0.f, sum_data, 3), sum_data); - res = vmul_f16(vpadd_f16(res, res), scale_v); - } - else - { - const float16x4_t max_data = vmax_f16(vmax_f16(top_data, bottom_data), middle_data); - res = vpmax_f16(vset_lane_f16(fp16_min, max_data, 3), max_data); - res = vpmax_f16(res, res); - } + // Get power of 2 in case of l2 pooling + if (pool_info.pool_type == PoolingType::L2) + { + top_data = vmul_f16(top_data, top_data); + middle_data = vmul_f16(middle_data, middle_data); + bottom_data = vmul_f16(bottom_data, bottom_data); + } - // Calculate square-root in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - res = vsqrt_f16(res); - } + if (pool_info.pool_type != PoolingType::MAX) + { + // Calculate scale + const float scale = calculate_avg_scale_pool2d( + pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, + pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + const float16x4_t scale_v = vdup_n_f16(scale); + // Perform pooling + const float16x4_t sum_data = vadd_f16(vadd_f16(top_data, bottom_data), middle_data); + res = vpadd_f16(vset_lane_f16(0.f, sum_data, 3), sum_data); + res = vmul_f16(vpadd_f16(res, res), scale_v); + } + else + { + const float16x4_t max_data = vmax_f16(vmax_f16(top_data, bottom_data), middle_data); + res = vpmax_f16(vset_lane_f16(fp16_min, max_data, 3), max_data); + res = vpmax_f16(res, res); + } + + // Calculate square-root in case of l2 pooling + if (pool_info.pool_type == PoolingType::L2) + { + res = vsqrt_f16(res); + } - *(reinterpret_cast<float16_t *>(out.ptr())) = 
vget_lane_f16(res, 0); - }, - in, out); + *(reinterpret_cast<float16_t *>(out.ptr())) = vget_lane_f16(res, 0); + }, + in, out); } template <typename T> -inline typename std::enable_if<std::is_same<T, float16_t>::value, float32x2_t>::type -f16_to_f32(float16x4_t in) +inline typename std::enable_if<std::is_same<T, float16_t>::value, float32x2_t>::type f16_to_f32(float16x4_t in) { - float32x2_t out = { static_cast<float>(vget_lane_f16(in, 0)), static_cast<float>(vget_lane_f16(in, 1)) }; + float32x2_t out = {static_cast<float>(vget_lane_f16(in, 0)), static_cast<float>(vget_lane_f16(in, 1))}; return out; } #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ template <typename T> -inline typename std::enable_if<std::is_same<T, float>::value, float32x2_t>::type -f16_to_f32(float32x2_t in) +inline typename std::enable_if<std::is_same<T, float>::value, float32x2_t>::type f16_to_f32(float32x2_t in) { return in; } @@ -171,9 +190,9 @@ auto read_2_boundary_aware(int srcw, int srch, int pad_l, int pad_t, int x, int { T vec[2]; const bool row_in_bounds((y >= pad_t) && (y < (srch + pad_t))); - for(int i = 0; i < 2; i++) + for (int i = 0; i < 2; i++) { - if(row_in_bounds && (x + i >= pad_l) && (x + i < (srcw + pad_l))) + if (row_in_bounds && (x + i >= pad_l) && (x + i < (srcw + pad_l))) { vec[i] = *(ptr + i); } @@ -186,61 +205,80 @@ auto read_2_boundary_aware(int srcw, int srch, int pad_l, int pad_t, int x, int } template <typename T> -void pooling2_nchw_maxpool_indices(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) +void pooling2_nchw_maxpool_indices(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) { Iterator in(src, window_src); Iterator out(dst0, window); Iterator indices(dst1, window); - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - int pool_stride_x = 0; - int pool_stride_y = 0; + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + int pool_stride_x = 0; + int pool_stride_y = 0; std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); - const int src_w = src->info()->dimension(0); - const int src_h = src->info()->dimension(1); - const uint8_t *const src_top_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))); - const uint8_t *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)); - const int pad_left = src->info()->padding().left; - const int pad_right = src->info()->padding().right; - const int in_stride_y = static_cast<int>(src->info()->strides_in_bytes().y()); - const T float_min = get_initial_min<T>(pool_info.use_inf_as_limit); - const T fill_value = (pool_info.pool_type == PoolingType::MAX) ? 
float_min : 0.f; - - execute_window_loop(window, [&](const Coordinates & id) - { - const auto x_val = id.x() * pool_stride_x; - const auto y_val_0 = id.y() * pool_stride_y; - const auto y_val_1 = (id.y() * pool_stride_y) + 1; - auto top_data = read_2_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_top, - x_val, y_val_0, reinterpret_cast<const T *>(src_top_ptr + in.offset()), fill_value); - auto bottom_data = read_2_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_top, - x_val, y_val_1, reinterpret_cast<const T *>(src_bottom_ptr + in.offset()), fill_value); - float32x2_t top_data_f32 = f16_to_f32<T>(top_data); - float32x2_t bottom_data_f32 = f16_to_f32<T>(bottom_data); - - // Calculate max data, compare top first, then bottom, to make sue the first max is recorded. - const float32x2_t max_data_top = vpmax_f32(top_data_f32, top_data_f32); - const float32x2_t max_data_bottom = vpmax_f32(bottom_data_f32, bottom_data_f32); - const float32x2_t max_data = vmax_f32(max_data_top, max_data_bottom); - *(reinterpret_cast<T *>(out.ptr())) = static_cast<T>(vget_lane_f32(max_data, 0)); - - // Calculate max data indice, which will be used in max unpool. - const uint32_t offset_base = offset_no_padding<T>(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NCHW); - const uint32_t offset_top = (uint32_t)(offset_base / sizeof(T)); - const uint32_t offset_bottom = offset_top + in_stride_y / sizeof(T) - pad_right - pad_left; - const uint32x2_t voffset_top = { offset_top, offset_top + 1u }; - const uint32x2_t voffset_bottom = { offset_bottom, offset_bottom + 1u }; - const uint32x2_t tmp_indices_top = vbsl_u32(vcge_f32(top_data_f32, vrev64_f32(top_data_f32)), voffset_top, vrev64_u32(voffset_top)); - const uint32x2_t tmp_indices_bottom = vbsl_u32(vcge_f32(bottom_data_f32, vrev64_f32(bottom_data_f32)), voffset_bottom, vrev64_u32(voffset_bottom)); - *(reinterpret_cast<int *>(indices.ptr())) = vget_lane_u32(vbsl_u32(vcge_f32(max_data_top, max_data_bottom), tmp_indices_top, tmp_indices_bottom), 0); - }, - in, out, indices); + const int src_w = src->info()->dimension(0); + const int src_h = src->info()->dimension(1); + const uint8_t *const src_top_ptr = + src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))); + const uint8_t *const src_bottom_ptr = + src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)); + const int pad_left = src->info()->padding().left; + const int pad_right = src->info()->padding().right; + const int in_stride_y = static_cast<int>(src->info()->strides_in_bytes().y()); + const T float_min = get_initial_min<T>(pool_info.use_inf_as_limit); + const T fill_value = (pool_info.pool_type == PoolingType::MAX) ? 
float_min : 0.f; + + execute_window_loop( + window, + [&](const Coordinates &id) + { + const auto x_val = id.x() * pool_stride_x; + const auto y_val_0 = id.y() * pool_stride_y; + const auto y_val_1 = (id.y() * pool_stride_y) + 1; + auto top_data = read_2_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_top, x_val, y_val_0, + reinterpret_cast<const T *>(src_top_ptr + in.offset()), fill_value); + auto bottom_data = + read_2_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_top, x_val, y_val_1, + reinterpret_cast<const T *>(src_bottom_ptr + in.offset()), fill_value); + float32x2_t top_data_f32 = f16_to_f32<T>(top_data); + float32x2_t bottom_data_f32 = f16_to_f32<T>(bottom_data); + + // Calculate max data, compare top first, then bottom, to make sue the first max is recorded. + const float32x2_t max_data_top = vpmax_f32(top_data_f32, top_data_f32); + const float32x2_t max_data_bottom = vpmax_f32(bottom_data_f32, bottom_data_f32); + const float32x2_t max_data = vmax_f32(max_data_top, max_data_bottom); + *(reinterpret_cast<T *>(out.ptr())) = static_cast<T>(vget_lane_f32(max_data, 0)); + + // Calculate max data indice, which will be used in max unpool. + const uint32_t offset_base = + offset_no_padding<T>(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NCHW); + const uint32_t offset_top = (uint32_t)(offset_base / sizeof(T)); + const uint32_t offset_bottom = offset_top + in_stride_y / sizeof(T) - pad_right - pad_left; + const uint32x2_t voffset_top = {offset_top, offset_top + 1u}; + const uint32x2_t voffset_bottom = {offset_bottom, offset_bottom + 1u}; + const uint32x2_t tmp_indices_top = + vbsl_u32(vcge_f32(top_data_f32, vrev64_f32(top_data_f32)), voffset_top, vrev64_u32(voffset_top)); + const uint32x2_t tmp_indices_bottom = vbsl_u32(vcge_f32(bottom_data_f32, vrev64_f32(bottom_data_f32)), + voffset_bottom, vrev64_u32(voffset_bottom)); + *(reinterpret_cast<int *>(indices.ptr())) = vget_lane_u32( + vbsl_u32(vcge_f32(max_data_top, max_data_bottom), tmp_indices_top, tmp_indices_bottom), 0); + }, + in, out, indices); } #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -void pooling2_fp16_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) +void pooling2_fp16_neon_nchw(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) { - if(pool_info.pool_type == PoolingType::MAX && dst1) + if (pool_info.pool_type == PoolingType::MAX && dst1) { pooling2_nchw_maxpool_indices<float16_t>(src, dst0, dst1, pool_info, window_src, window); } @@ -254,244 +292,274 @@ void pooling2_fp16_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, P const int pool_pad_left = pool_info.pad_stride_info.pad_left(); const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); int pool_stride_x, pool_stride_y = 0; - std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); - const int src_w = src->info()->dimension(0); - const int src_h = src->info()->dimension(1); - const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right); - const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom); - const float16_t fp16_min = get_initial_min<half_float::half>(pool_info.use_inf_as_limit); - const float16_t fill_value = (pool_info.pool_type == PoolingType::MAX) ? 
fp16_min : 0.0f; - - const unsigned char *const src_top_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))); - const unsigned char *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)); - - execute_window_loop(window, [&](const Coordinates & id) - { - const auto in_top_ptr = reinterpret_cast<const float16_t *>(src_top_ptr + in.offset()); - const auto in_bottom_ptr = reinterpret_cast<const float16_t *>(src_bottom_ptr + in.offset()); - - const auto x_val = id.x() * pool_stride_x; - const auto y_val_0 = id.y() * pool_stride_y; - const auto y_val_1 = (id.y() * pool_stride_y) + 1; - float16x4_t top_data = read_4_boundary_aware_fp16(src_w, src_h, pool_pad_left, pool_pad_top, - x_val, y_val_0, in_top_ptr, fill_value); - float16x4_t bottom_data = read_4_boundary_aware_fp16(src_w, src_h, pool_pad_left, pool_pad_top, - x_val, y_val_1, in_bottom_ptr, fill_value); - float16x4_t res = {}; - - // Get power of 2 in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) + std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); + const int src_w = src->info()->dimension(0); + const int src_h = src->info()->dimension(1); + const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right); + const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom); + const float16_t fp16_min = get_initial_min<half_float::half>(pool_info.use_inf_as_limit); + const float16_t fill_value = (pool_info.pool_type == PoolingType::MAX) ? fp16_min : 0.0f; + + const unsigned char *const src_top_ptr = + src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))); + const unsigned char *const src_bottom_ptr = + src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)); + + execute_window_loop( + window, + [&](const Coordinates &id) { - top_data = vmul_f16(top_data, top_data); - bottom_data = vmul_f16(bottom_data, bottom_data); - } + const auto in_top_ptr = reinterpret_cast<const float16_t *>(src_top_ptr + in.offset()); + const auto in_bottom_ptr = reinterpret_cast<const float16_t *>(src_bottom_ptr + in.offset()); + + const auto x_val = id.x() * pool_stride_x; + const auto y_val_0 = id.y() * pool_stride_y; + const auto y_val_1 = (id.y() * pool_stride_y) + 1; + float16x4_t top_data = read_4_boundary_aware_fp16(src_w, src_h, pool_pad_left, pool_pad_top, x_val, + y_val_0, in_top_ptr, fill_value); + float16x4_t bottom_data = read_4_boundary_aware_fp16(src_w, src_h, pool_pad_left, pool_pad_top, x_val, + y_val_1, in_bottom_ptr, fill_value); + float16x4_t res = {}; - if(pool_info.pool_type != PoolingType::MAX) - { - const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); - const float16x4_t scale_v = vdup_n_f16(scale); + // Get power of 2 in case of l2 pooling + if (pool_info.pool_type == PoolingType::L2) + { + top_data = vmul_f16(top_data, top_data); + bottom_data = vmul_f16(bottom_data, bottom_data); + } - const float16x4_t sum_data = vadd_f16(top_data, bottom_data); - res = vmul_f16(vpadd_f16(sum_data, sum_data), scale_v); - } - else - { - const float16x4_t max_data = vmax_f16(top_data, bottom_data); - res = vpmax_f16(max_data, max_data); - } + if (pool_info.pool_type != PoolingType::MAX) + { + const float 
scale = calculate_avg_scale_pool2d( + pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, + upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + const float16x4_t scale_v = vdup_n_f16(scale); - // Calculate square-root in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - res = vsqrt_f16(res); - } + const float16x4_t sum_data = vadd_f16(top_data, bottom_data); + res = vmul_f16(vpadd_f16(sum_data, sum_data), scale_v); + } + else + { + const float16x4_t max_data = vmax_f16(top_data, bottom_data); + res = vpmax_f16(max_data, max_data); + } - // Store result - *(reinterpret_cast<float16_t *>(out.ptr())) = vget_lane_f16(res, 0); - }, - in, out); + // Calculate square-root in case of l2 pooling + if (pool_info.pool_type == PoolingType::L2) + { + res = vsqrt_f16(res); + } + + // Store result + *(reinterpret_cast<float16_t *>(out.ptr())) = vget_lane_f16(res, 0); + }, + in, out); } } -void poolingMxN_fp16_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) +void poolingMxN_fp16_neon_nchw(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) { ARM_COMPUTE_UNUSED(dst1); Iterator in(src, window_src); Iterator out(dst0, window); - const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width; - const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.height; - const int pool_pad_right = pool_info.pad_stride_info.pad_right(); - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); - int pool_stride_x = 0; - int pool_stride_y = 0; + const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width; + const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.height; + const int pool_pad_right = pool_info.pad_stride_info.pad_right(); + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); + int pool_stride_x = 0; + int pool_stride_y = 0; std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); - const int src_w = src->info()->dimension(0); - const int src_h = src->info()->dimension(1); - const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right); - const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom); - const float16_t fp16_min = get_initial_min<half_float::half>(pool_info.use_inf_as_limit); - const float16_t fill_value = (pool_info.pool_type == PoolingType::MAX) ? fp16_min : 0.0f; - - execute_window_loop(window, [&](const Coordinates & id) - { - float16_t res = 0.0f; - - if(pool_info.pool_type != PoolingType::MAX) + const int src_w = src->info()->dimension(0); + const int src_h = src->info()->dimension(1); + const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right); + const int upper_bound_h = src_h + (pool_info.exclude_padding ? 
0 : pool_pad_bottom); + const float16_t fp16_min = get_initial_min<half_float::half>(pool_info.use_inf_as_limit); + const float16_t fill_value = (pool_info.pool_type == PoolingType::MAX) ? fp16_min : 0.0f; + + execute_window_loop( + window, + [&](const Coordinates &id) { - // Calculate scale - const float16_t scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); + float16_t res = 0.0f; - // Perform pooling - for(int y = 0; y < pool_size_y; ++y) + if (pool_info.pool_type != PoolingType::MAX) { - for(int x = 0; x < pool_size_x; ++x) + // Calculate scale + const float16_t scale = calculate_avg_scale_pool2d( + pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, + upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + + // Perform pooling + for (int y = 0; y < pool_size_y; ++y) { - const auto ptr = reinterpret_cast<const float16_t *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x()) - + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y())); + for (int x = 0; x < pool_size_x; ++x) + { + const auto ptr = reinterpret_cast<const float16_t *>( + in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x()) + + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y())); - const int idx = x + id.x() * pool_stride_x - pool_pad_left; - const int idy = y + id.y() * pool_stride_y - pool_pad_top; - float16_t data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *ptr; + const int idx = x + id.x() * pool_stride_x - pool_pad_left; + const int idy = y + id.y() * pool_stride_y - pool_pad_top; + float16_t data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *ptr; - if(pool_info.pool_type == PoolingType::L2) - { - data *= data; - } + if (pool_info.pool_type == PoolingType::L2) + { + data *= data; + } - res += data; + res += data; + } } - } - // Divide by scale - res *= scale; - } - else // if max pooling - { - res = fp16_min; - - for(int y = 0; y < pool_size_y; ++y) + // Divide by scale + res *= scale; + } + else // if max pooling { - for(int x = 0; x < pool_size_x; ++x) - { - const auto ptr = reinterpret_cast<const float16_t *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x()) - + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y())); + res = fp16_min; - const int idx = x + id.x() * pool_stride_x - pool_pad_left; - const int idy = y + id.y() * pool_stride_y - pool_pad_top; - float16_t data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *ptr; - res = std::max(res, data); + for (int y = 0; y < pool_size_y; ++y) + { + for (int x = 0; x < pool_size_x; ++x) + { + const auto ptr = reinterpret_cast<const float16_t *>( + in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x()) + + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y())); + + const int idx = x + id.x() * pool_stride_x - pool_pad_left; + const int idy = y + id.y() * pool_stride_y - pool_pad_top; + float16_t data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? 
fill_value : *ptr; + res = std::max(res, data); + } } } - } - // Calculate square-root in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - res = std::sqrt(res); - } + // Calculate square-root in case of l2 pooling + if (pool_info.pool_type == PoolingType::L2) + { + res = std::sqrt(res); + } - // Store result - *(reinterpret_cast<float16_t *>(out.ptr())) = res; - }, - in, out); + // Store result + *(reinterpret_cast<float16_t *>(out.ptr())) = res; + }, + in, out); } #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ -void poolingMxN_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) +void poolingMxN_fp32_neon_nchw(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) { ARM_COMPUTE_UNUSED(dst1); Iterator in(src, window_src); Iterator out(dst0, window); - const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width; - const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.height; - const int pool_pad_right = pool_info.pad_stride_info.pad_right(); - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); - int pool_stride_x = 0; - int pool_stride_y = 0; + const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width; + const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.height; + const int pool_pad_right = pool_info.pad_stride_info.pad_right(); + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); + int pool_stride_x = 0; + int pool_stride_y = 0; std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); - const int src_w = src->info()->dimension(0); - const int src_h = src->info()->dimension(1); - const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right); - const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom); - const float min_value = get_initial_min<float>(pool_info.use_inf_as_limit); - const float fill_value = (pool_info.pool_type == PoolingType::MAX) ? min_value : 0.0f; - - execute_window_loop(window, [&](const Coordinates & id) - { - float res = 0.0f; - - if(pool_info.pool_type != PoolingType::MAX) + const int src_w = src->info()->dimension(0); + const int src_h = src->info()->dimension(1); + const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right); + const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom); + const float min_value = get_initial_min<float>(pool_info.use_inf_as_limit); + const float fill_value = (pool_info.pool_type == PoolingType::MAX) ? 
min_value : 0.0f; + + execute_window_loop( + window, + [&](const Coordinates &id) { - // Calculate scale - const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, - pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + float res = 0.0f; - // Perform pooling - for(int y = 0; y < pool_size_y; ++y) + if (pool_info.pool_type != PoolingType::MAX) { - for(int x = 0; x < pool_size_x; ++x) + // Calculate scale + const float scale = calculate_avg_scale_pool2d( + pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, + upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + + // Perform pooling + for (int y = 0; y < pool_size_y; ++y) { - const auto ptr = reinterpret_cast<const float *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x()) - + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y())); + for (int x = 0; x < pool_size_x; ++x) + { + const auto ptr = reinterpret_cast<const float *>( + in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x()) + + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y())); - const int idx = x + id.x() * pool_stride_x - pool_pad_left; - const int idy = y + id.y() * pool_stride_y - pool_pad_top; - float data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *ptr; + const int idx = x + id.x() * pool_stride_x - pool_pad_left; + const int idy = y + id.y() * pool_stride_y - pool_pad_top; + float data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *ptr; - if(pool_info.pool_type == PoolingType::L2) - { - data *= data; - } + if (pool_info.pool_type == PoolingType::L2) + { + data *= data; + } - res += data; + res += data; + } } - } - // Divide by scale - res *= scale; - } - else // if max pooling - { - res = min_value; - - for(int y = 0; y < pool_size_y; ++y) + // Divide by scale + res *= scale; + } + else // if max pooling { - for(int x = 0; x < pool_size_x; ++x) - { - const auto ptr = reinterpret_cast<const float *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x()) - + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y())); + res = min_value; - const int idx = x + id.x() * pool_stride_x - pool_pad_left; - const int idy = y + id.y() * pool_stride_y - pool_pad_top; - float data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *ptr; - res = std::max(res, data); + for (int y = 0; y < pool_size_y; ++y) + { + for (int x = 0; x < pool_size_x; ++x) + { + const auto ptr = reinterpret_cast<const float *>( + in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x()) + + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y())); + + const int idx = x + id.x() * pool_stride_x - pool_pad_left; + const int idy = y + id.y() * pool_stride_y - pool_pad_top; + float data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? 
fill_value : *ptr; + res = std::max(res, data); + } } } - } - // Calculate square-root in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - res = std::sqrt(res); - } + // Calculate square-root in case of l2 pooling + if (pool_info.pool_type == PoolingType::L2) + { + res = std::sqrt(res); + } - // Store result - *(reinterpret_cast<float *>(out.ptr())) = res; - }, - in, out); + // Store result + *(reinterpret_cast<float *>(out.ptr())) = res; + }, + in, out); } -void pooling2_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) +void pooling2_fp32_neon_nchw(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) { - if(pool_info.pool_type == PoolingType::MAX && dst1) + if (pool_info.pool_type == PoolingType::MAX && dst1) { pooling2_nchw_maxpool_indices<float>(src, dst0, dst1, pool_info, window_src, window); } @@ -499,64 +567,168 @@ void pooling2_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, P { Iterator in(src, window_src); Iterator out(dst0, window); - constexpr int pool_size = 2; - const int pool_pad_right = pool_info.pad_stride_info.pad_right(); - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); - int pool_stride_x = 0; - int pool_stride_y = 0; + constexpr int pool_size = 2; + const int pool_pad_right = pool_info.pad_stride_info.pad_right(); + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); + int pool_stride_x = 0; + int pool_stride_y = 0; std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); - const int src_w = src->info()->dimension(0); - const int src_h = src->info()->dimension(1); - const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right); - const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom); - const float min_value = get_initial_min<float>(pool_info.use_inf_as_limit); - const float fill_value = (pool_info.pool_type == PoolingType::MAX) ? min_value : 0.0f; + const int src_w = src->info()->dimension(0); + const int src_h = src->info()->dimension(1); + const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right); + const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom); + const float min_value = get_initial_min<float>(pool_info.use_inf_as_limit); + const float fill_value = (pool_info.pool_type == PoolingType::MAX) ? 
min_value : 0.0f; + + const uint8_t *const src_top_ptr = + src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))); + const uint8_t *const src_bottom_ptr = + src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)); + + execute_window_loop( + window, + [&](const Coordinates &id) + { + const auto in_top_ptr = reinterpret_cast<const float *>(src_top_ptr + in.offset()); + const auto in_bottom_ptr = reinterpret_cast<const float *>(src_bottom_ptr + in.offset()); + + const auto x_val = id.x() * pool_stride_x; + const auto y_val_0 = id.y() * pool_stride_y; + const auto y_val_1 = (id.y() * pool_stride_y) + 1; + auto top_data = READ_2_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_0, + in_top_ptr, fill_value); + auto bottom_data = READ_2_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_1, + in_bottom_ptr, fill_value); + float32x2_t res = {}; + float final_res = 0; - const uint8_t *const src_top_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))); - const uint8_t *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)); + // Get power of 2 in case of l2 pooling + if (pool_info.pool_type == PoolingType::L2) + { + top_data = vmul_f32(top_data, top_data); + bottom_data = vmul_f32(bottom_data, bottom_data); + } + + if (pool_info.pool_type != PoolingType::MAX) + { + // Calculate scale + float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, + pool_size, upper_bound_w, upper_bound_h, pool_pad_left, + pool_pad_top, pool_stride_x, pool_stride_y); + const float32x2_t scale_v = vdup_n_f32(scale); + + // Perform pooling + const float32x2_t sum_data = vadd_f32(top_data, bottom_data); + res = vmul_f32(vpadd_f32(sum_data, sum_data), scale_v); + } + else + { + const float32x2_t max_data = vmax_f32(top_data, bottom_data); + res = vpmax_f32(max_data, max_data); + } + final_res = vget_lane_f32(res, 0); + + // Calculate square-root in case of l2 pooling + if (pool_info.pool_type == PoolingType::L2) + { + final_res = sqrt(final_res); + } - execute_window_loop(window, [&](const Coordinates & id) + // Store result + *(reinterpret_cast<float *>(out.ptr())) = final_res; + }, + in, out); + } +} + +void pooling3_fp32_neon_nchw(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) +{ + ARM_COMPUTE_UNUSED(dst1); + Iterator in(src, window_src); + Iterator out(dst0, window); + + constexpr const int pool_size = 3; + const int pool_pad_right = pool_info.pad_stride_info.pad_right(); + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); + int pool_stride_x = 0; + int pool_stride_y = 0; + std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); + const int src_w = src->info()->dimension(0); + const int src_h = src->info()->dimension(1); + const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right); + const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom); + const float min_value = get_initial_min<float>(pool_info.use_inf_as_limit); + const float fill_value = (pool_info.pool_type == PoolingType::MAX) ? 
min_value : 0.0f; + + const uint8_t *const src_top_ptr = + src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))); + const uint8_t *const src_middle_ptr = + src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)); + const uint8_t *const src_bottom_ptr = + src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2)); + + execute_window_loop( + window, + [&](const Coordinates &id) { const auto in_top_ptr = reinterpret_cast<const float *>(src_top_ptr + in.offset()); + const auto in_middle_ptr = reinterpret_cast<const float *>(src_middle_ptr + in.offset()); const auto in_bottom_ptr = reinterpret_cast<const float *>(src_bottom_ptr + in.offset()); - const auto x_val = id.x() * pool_stride_x; - const auto y_val_0 = id.y() * pool_stride_y; - const auto y_val_1 = (id.y() * pool_stride_y) + 1; - auto top_data = READ_2_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_0, in_top_ptr, fill_value); - auto bottom_data = READ_2_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_1, in_bottom_ptr, fill_value); - float32x2_t res = {}; - float final_res = 0; + const auto x_val = id.x() * pool_stride_x; + const auto y_val_0 = id.y() * pool_stride_y; + const auto y_val_1 = (id.y() * pool_stride_y) + 1; + const auto y_val_2 = (id.y() * pool_stride_y) + 2; + auto top_data = READ_4_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_0, in_top_ptr, + fill_value); + auto middle_data = READ_4_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_1, + in_middle_ptr, fill_value); + auto bottom_data = READ_4_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_2, + in_bottom_ptr, fill_value); + + float32x2_t res = {}; + float final_res = 0; // Get power of 2 in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) + if (pool_info.pool_type == PoolingType::L2) { - top_data = vmul_f32(top_data, top_data); - bottom_data = vmul_f32(bottom_data, bottom_data); + top_data = vmulq_f32(top_data, top_data); + middle_data = vmulq_f32(middle_data, middle_data); + bottom_data = vmulq_f32(bottom_data, bottom_data); } - if(pool_info.pool_type != PoolingType::MAX) + if (pool_info.pool_type != PoolingType::MAX) { // Calculate scale - float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); + float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, + pool_size, upper_bound_w, upper_bound_h, pool_pad_left, + pool_pad_top, pool_stride_x, pool_stride_y); const float32x2_t scale_v = vdup_n_f32(scale); // Perform pooling - const float32x2_t sum_data = vadd_f32(top_data, bottom_data); - res = vmul_f32(vpadd_f32(sum_data, sum_data), scale_v); + const float32x4_t sum_data = vaddq_f32(vaddq_f32(top_data, bottom_data), middle_data); + res = vpadd_f32(vget_high_f32(vsetq_lane_f32(0.f, sum_data, 3)), vget_low_f32(sum_data)); + res = vmul_f32(vpadd_f32(res, res), scale_v); } else { - const float32x2_t max_data = vmax_f32(top_data, bottom_data); - res = vpmax_f32(max_data, max_data); + const float32x4_t max_data = vmaxq_f32(vmaxq_f32(top_data, bottom_data), middle_data); + res = vpmax_f32(vget_high_f32(vsetq_lane_f32(min_value, max_data, 3)), vget_low_f32(max_data)); + res = vpmax_f32(res, res); } final_res = vget_lane_f32(res, 
0); // Calculate square-root in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) + if (pool_info.pool_type == PoolingType::L2) { final_res = sqrt(final_res); } @@ -565,191 +737,120 @@ void pooling2_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, P *(reinterpret_cast<float *>(out.ptr())) = final_res; }, in, out); - } -} - -void pooling3_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) -{ - ARM_COMPUTE_UNUSED(dst1); - Iterator in(src, window_src); - Iterator out(dst0, window); - - constexpr const int pool_size = 3; - const int pool_pad_right = pool_info.pad_stride_info.pad_right(); - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); - int pool_stride_x = 0; - int pool_stride_y = 0; - std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); - const int src_w = src->info()->dimension(0); - const int src_h = src->info()->dimension(1); - const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right); - const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom); - const float min_value = get_initial_min<float>(pool_info.use_inf_as_limit); - const float fill_value = (pool_info.pool_type == PoolingType::MAX) ? min_value : 0.0f; - - const uint8_t *const src_top_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))); - const uint8_t *const src_middle_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)); - const uint8_t *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2)); - - execute_window_loop(window, [&](const Coordinates & id) - { - const auto in_top_ptr = reinterpret_cast<const float *>(src_top_ptr + in.offset()); - const auto in_middle_ptr = reinterpret_cast<const float *>(src_middle_ptr + in.offset()); - const auto in_bottom_ptr = reinterpret_cast<const float *>(src_bottom_ptr + in.offset()); - - const auto x_val = id.x() * pool_stride_x; - const auto y_val_0 = id.y() * pool_stride_y; - const auto y_val_1 = (id.y() * pool_stride_y) + 1; - const auto y_val_2 = (id.y() * pool_stride_y) + 2; - auto top_data = READ_4_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_0, in_top_ptr, fill_value); - auto middle_data = READ_4_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_1, in_middle_ptr, fill_value); - auto bottom_data = READ_4_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_2, in_bottom_ptr, fill_value); - - float32x2_t res = {}; - float final_res = 0; - - // Get power of 2 in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - top_data = vmulq_f32(top_data, top_data); - middle_data = vmulq_f32(middle_data, middle_data); - bottom_data = vmulq_f32(bottom_data, bottom_data); - } - - if(pool_info.pool_type != PoolingType::MAX) - { - // Calculate scale - float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); - const float32x2_t scale_v = vdup_n_f32(scale); - - // Perform pooling - const float32x4_t sum_data = vaddq_f32(vaddq_f32(top_data, bottom_data), 
middle_data); - res = vpadd_f32(vget_high_f32(vsetq_lane_f32(0.f, sum_data, 3)), vget_low_f32(sum_data)); - res = vmul_f32(vpadd_f32(res, res), scale_v); - } - else - { - const float32x4_t max_data = vmaxq_f32(vmaxq_f32(top_data, bottom_data), middle_data); - res = vpmax_f32(vget_high_f32(vsetq_lane_f32(min_value, max_data, 3)), vget_low_f32(max_data)); - res = vpmax_f32(res, res); - } - final_res = vget_lane_f32(res, 0); - - // Calculate square-root in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - final_res = sqrt(final_res); - } - - // Store result - *(reinterpret_cast<float *>(out.ptr())) = final_res; - }, - in, out); } -void pooling7_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) +void pooling7_fp32_neon_nchw(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) { ARM_COMPUTE_UNUSED(dst1); Iterator in(src, window_src); Iterator out(dst0, window); - constexpr const int pool_size = 7; - const int pool_pad_right = pool_info.pad_stride_info.pad_right(); - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); - int pool_stride_x = 0; - int pool_stride_y = 0; + constexpr const int pool_size = 7; + const int pool_pad_right = pool_info.pad_stride_info.pad_right(); + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); + int pool_stride_x = 0; + int pool_stride_y = 0; std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); - const int src_w = src->info()->dimension(0); - const int src_h = src->info()->dimension(1); - const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right); - const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom); - const float min_value = get_initial_min<float>(pool_info.use_inf_as_limit); - const float fill_value = (pool_info.pool_type == PoolingType::MAX) ? min_value : 0.0f; - - std::array<const uint8_t *, pool_size> src_ptrs{ {} }; - for(int i = 0; i < pool_size; ++i) + const int src_w = src->info()->dimension(0); + const int src_h = src->info()->dimension(1); + const int upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right); + const int upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom); + const float min_value = get_initial_min<float>(pool_info.use_inf_as_limit); + const float fill_value = (pool_info.pool_type == PoolingType::MAX) ? 
min_value : 0.0f; + + std::array<const uint8_t *, pool_size> src_ptrs{{}}; + for (int i = 0; i < pool_size; ++i) { - src_ptrs[i] = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + i)); + src_ptrs[i] = + src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + i)); } - execute_window_loop(window, [&](const Coordinates & id) - { - auto in_ptr = reinterpret_cast<const float *>(src_ptrs[0] + in.offset()); - - auto x_val = id.x() * pool_stride_x; - auto y_val = id.y() * pool_stride_y; - float32x4x2_t data = read_8_boundary_aware(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val, in_ptr, fill_value); + execute_window_loop( + window, + [&](const Coordinates &id) + { + auto in_ptr = reinterpret_cast<const float *>(src_ptrs[0] + in.offset()); - float32x2_t res = {}; - float final_res = 0.f; + auto x_val = id.x() * pool_stride_x; + auto y_val = id.y() * pool_stride_y; + float32x4x2_t data = + read_8_boundary_aware(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val, in_ptr, fill_value); - if(pool_info.pool_type != PoolingType::MAX) - { - // Calculate scale - float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); - const float32x2_t scale_v = vdup_n_f32(scale); + float32x2_t res = {}; + float final_res = 0.f; - // Get power of 2 in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) + if (pool_info.pool_type != PoolingType::MAX) { - data.val[0] = vmulq_f32(data.val[0], data.val[0]); - data.val[1] = vmulq_f32(data.val[1], data.val[1]); - } - float32x4_t sum_data = vaddq_f32(data.val[0], vsetq_lane_f32(0.f, data.val[1], 3)); - for(int i = 1; i < pool_size; ++i) - { - in_ptr = reinterpret_cast<const float *>(src_ptrs[i] + in.offset()); + // Calculate scale + float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, + pool_size, upper_bound_w, upper_bound_h, pool_pad_left, + pool_pad_top, pool_stride_x, pool_stride_y); + const float32x2_t scale_v = vdup_n_f32(scale); - x_val = id.x() * pool_stride_x; - y_val = (id.y() * pool_stride_y) + i; - data = read_8_boundary_aware(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val, in_ptr, fill_value); // Get power of 2 in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) + if (pool_info.pool_type == PoolingType::L2) { data.val[0] = vmulq_f32(data.val[0], data.val[0]); data.val[1] = vmulq_f32(data.val[1], data.val[1]); } - sum_data = vaddq_f32(sum_data, data.val[0]); - sum_data = vaddq_f32(sum_data, vsetq_lane_f32(0.f, data.val[1], 3)); + float32x4_t sum_data = vaddq_f32(data.val[0], vsetq_lane_f32(0.f, data.val[1], 3)); + for (int i = 1; i < pool_size; ++i) + { + in_ptr = reinterpret_cast<const float *>(src_ptrs[i] + in.offset()); + + x_val = id.x() * pool_stride_x; + y_val = (id.y() * pool_stride_y) + i; + data = read_8_boundary_aware(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val, in_ptr, + fill_value); + // Get power of 2 in case of l2 pooling + if (pool_info.pool_type == PoolingType::L2) + { + data.val[0] = vmulq_f32(data.val[0], data.val[0]); + data.val[1] = vmulq_f32(data.val[1], data.val[1]); + } + sum_data = vaddq_f32(sum_data, data.val[0]); + sum_data = vaddq_f32(sum_data, vsetq_lane_f32(0.f, data.val[1], 3)); + } + res = vpadd_f32(vget_high_f32(sum_data), vget_low_f32(sum_data)); + res = vmul_f32(vpadd_f32(res, res), 
scale_v); } - res = vpadd_f32(vget_high_f32(sum_data), vget_low_f32(sum_data)); - res = vmul_f32(vpadd_f32(res, res), scale_v); - } - else - { - for(int i = 1; i < pool_size; ++i) + else { - in_ptr = reinterpret_cast<const float *>(src_ptrs[i] + in.offset()); + for (int i = 1; i < pool_size; ++i) + { + in_ptr = reinterpret_cast<const float *>(src_ptrs[i] + in.offset()); - x_val = id.x() * pool_stride_x; - y_val = (id.y() * pool_stride_y) + i; - float32x4x2_t temp = read_8_boundary_aware(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val, in_ptr, fill_value); - data = vmax2q_f32(data, temp); + x_val = id.x() * pool_stride_x; + y_val = (id.y() * pool_stride_y) + i; + float32x4x2_t temp = read_8_boundary_aware(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val, + in_ptr, fill_value); + data = vmax2q_f32(data, temp); + } + res = vpmax_f32(vget_high_f32(vsetq_lane_f32(min_value, data.val[1], 3)), vget_low_f32(data.val[1])); + res = vpmax_f32(res, vpmax_f32(vget_high_f32(data.val[0]), vget_low_f32(data.val[0]))); + res = vpmax_f32(res, res); } - res = vpmax_f32(vget_high_f32(vsetq_lane_f32(min_value, data.val[1], 3)), vget_low_f32(data.val[1])); - res = vpmax_f32(res, vpmax_f32(vget_high_f32(data.val[0]), vget_low_f32(data.val[0]))); - res = vpmax_f32(res, res); - } - final_res = vget_lane_f32(res, 0); + final_res = vget_lane_f32(res, 0); - // Calculate square-root in case of l2 pooling - if(pool_info.pool_type == PoolingType::L2) - { - final_res = sqrt(final_res); - } + // Calculate square-root in case of l2 pooling + if (pool_info.pool_type == PoolingType::L2) + { + final_res = sqrt(final_res); + } - // Store result - *(reinterpret_cast<float *>(out.ptr())) = final_res; - }, - in, out); + // Store result + *(reinterpret_cast<float *>(out.ptr())) = final_res; + }, + in, out); } } // namespace cpu } // namespace arm_compute -#endif // ENABLE_NCHW_KERNELS
\ No newline at end of file +#endif // ENABLE_NCHW_KERNELS diff --git a/src/cpu/kernels/pool2d/neon/qasymm8.cpp b/src/cpu/kernels/pool2d/neon/qasymm8.cpp index 7f8841edd8..44675b5394 100644 --- a/src/cpu/kernels/pool2d/neon/qasymm8.cpp +++ b/src/cpu/kernels/pool2d/neon/qasymm8.cpp @@ -25,17 +25,23 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" + #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" #include "src/cpu/kernels/pool2d/neon/list.h" namespace arm_compute { namespace cpu { -void poolingMxN_qasymm8_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) +void poolingMxN_qasymm8_neon_nhwc(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) { poolingMxN_q8_neon_nhwc<uint8_t>(src, dst0, dst1, pool_info, window_src, window); } } // namespace cpu -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/kernels/pool2d/neon/qasymm8_signed.cpp b/src/cpu/kernels/pool2d/neon/qasymm8_signed.cpp index 8643651f27..d434323e89 100644 --- a/src/cpu/kernels/pool2d/neon/qasymm8_signed.cpp +++ b/src/cpu/kernels/pool2d/neon/qasymm8_signed.cpp @@ -25,17 +25,23 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" + #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" #include "src/cpu/kernels/pool2d/neon/list.h" namespace arm_compute { namespace cpu { -void poolingMxN_qasymm8_signed_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) +void poolingMxN_qasymm8_signed_neon_nhwc(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) { poolingMxN_q8_neon_nhwc<int8_t>(src, dst0, dst1, pool_info, window_src, window); } } // namespace cpu -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/kernels/pool2d/neon/quantized.h b/src/cpu/kernels/pool2d/neon/quantized.h index a2cd3991be..38f1b2f1f9 100644 --- a/src/cpu/kernels/pool2d/neon/quantized.h +++ b/src/cpu/kernels/pool2d/neon/quantized.h @@ -26,11 +26,13 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/Traits.h" + +#include "src/core/helpers/PoolingHelpers.h" #include "src/core/NEON/NEAsymm.h" #include "src/core/NEON/NEFixedPoint.h" #include "src/core/NEON/NEMath.h" #include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/PoolingHelpers.h" + #include <arm_neon.h> namespace arm_compute @@ -38,7 +40,12 @@ namespace arm_compute namespace cpu { template <typename T> -void poolingMxN_q8_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) +void poolingMxN_q8_neon_nhwc(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) { ARM_COMPUTE_UNUSED(dst1); @@ -60,15 +67,15 @@ void poolingMxN_q8_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, P using q32_t = typename wrapper::traits::promote_t<q16_t>; using q32x4_t = typename wrapper::traits::neon_vector<q32_t, 4>::type; - const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width; - const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height; + const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width; + const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height; const int pool_pad_right = pool_info.pad_stride_info.pad_right(); const int pool_pad_top = pool_info.pad_stride_info.pad_top(); const int pool_pad_left = pool_info.pad_stride_info.pad_left(); const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); - int pool_stride_x = 0; - int pool_stride_y = 0; + int pool_stride_x = 0; + int pool_stride_y = 0; std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right); const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 
0 : pool_pad_bottom); @@ -80,233 +87,267 @@ void poolingMxN_q8_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, P const float quant_rescale = dst_qinfo.scale / src_qinfo.scale; // "new_offset" doesn't have to consider the "half_scale_v" in its computation // With a requantization performed in a single step there won't be uncertainties introduced - const int32_t new_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / quant_rescale); + const int32_t new_offset = + dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / quant_rescale); - const float requant_scale = dst_qinfo.scale / src_qinfo.scale; - const int32_t requant_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale); - const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset); + const float requant_scale = dst_qinfo.scale / src_qinfo.scale; + const int32_t requant_offset = + dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale); + const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset); - execute_window_loop(window_out, [&](const Coordinates & id) - { - const int idx_width = id.y() * pool_stride_x; - const int idx_height = id.z() * pool_stride_y; - const int pool_limit_y = pool_pad_top - idx_height; - const int pool_limit_x = pool_pad_left - idx_width; - - const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y); - const int pool_end_y = std::min(pool_size_y, window_src.z().end() + pool_limit_y); - const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x); - const int pool_end_x = std::min(pool_size_x, window_src.y().end() + pool_limit_x); - - int x_off = window_start_x; - for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) + execute_window_loop( + window_out, + [&](const Coordinates &id) { - if(pool_info.pool_type != PoolingType::MAX) + const int idx_width = id.y() * pool_stride_x; + const int idx_height = id.z() * pool_stride_y; + const int pool_limit_y = pool_pad_top - idx_height; + const int pool_limit_x = pool_pad_left - idx_width; + + const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y); + const int pool_end_y = std::min(pool_size_y, window_src.z().end() + pool_limit_y); + const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x); + const int pool_end_x = std::min(pool_size_x, window_src.y().end() + pool_limit_x); + + int x_off = window_start_x; + for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) { - q32x4_t vres1 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{}); - q32x4_t vres2 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{}); - q32x4_t vres3 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{}); - q32x4_t vres4 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{}); - - // Calculate scale - const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); - - // Perform pooling - for(int y = pool_start_y; y < pool_end_y; ++y) + if (pool_info.pool_type != PoolingType::MAX) { - for(int x = pool_start_x; x < pool_end_x; ++x) + q32x4_t vres1 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{}); + 
q32x4_t vres2 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{}); + q32x4_t vres3 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{}); + q32x4_t vres4 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{}); + + // Calculate scale + const float scale = calculate_avg_scale_pool2d( + pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, + upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + + // Perform pooling + for (int y = pool_start_y; y < pool_end_y; ++y) { - const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int> - (src->info()->strides_in_bytes().z())) + x_off); - - const q16x8_t data_q16 = wrapper::vmovl(wrapper::vgetlow(data)); - const q16x8_t data2_q16 = wrapper::vmovl(wrapper::vgethigh(data)); - vres1 = wrapper::vadd(vres1, wrapper::vmovl(wrapper::vgetlow(data_q16))); - vres2 = wrapper::vadd(vres2, wrapper::vmovl(wrapper::vgethigh(data_q16))); - vres3 = wrapper::vadd(vres3, wrapper::vmovl(wrapper::vgetlow(data2_q16))); - vres4 = wrapper::vadd(vres4, wrapper::vmovl(wrapper::vgethigh(data2_q16))); + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const q8x16_t data = wrapper::vloadq( + reinterpret_cast<const T *>( + in.ptr() + + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) + + x_off); + + const q16x8_t data_q16 = wrapper::vmovl(wrapper::vgetlow(data)); + const q16x8_t data2_q16 = wrapper::vmovl(wrapper::vgethigh(data)); + vres1 = wrapper::vadd(vres1, wrapper::vmovl(wrapper::vgetlow(data_q16))); + vres2 = wrapper::vadd(vres2, wrapper::vmovl(wrapper::vgethigh(data_q16))); + vres3 = wrapper::vadd(vres3, wrapper::vmovl(wrapper::vgetlow(data2_q16))); + vres4 = wrapper::vadd(vres4, wrapper::vmovl(wrapper::vgethigh(data2_q16))); + } } - } - if(src_qinfo != dst_qinfo) - { - const float32x4x4_t vres = + if (src_qinfo != dst_qinfo) { - { + const float32x4x4_t vres = {{ vcvtq_f32_q32(vres1), vcvtq_f32_q32(vres2), vcvtq_f32_q32(vres3), vcvtq_f32_q32(vres4), - } - }; - const auto requantized_dst = vrequantize_pooling_with_scale<q8x16_t>(vres, quant_rescale, scale, new_offset); - // Store result - wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, wrapper::vgetlow(requantized_dst)); - wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, wrapper::vgethigh(requantized_dst)); + }}; + const auto requantized_dst = + vrequantize_pooling_with_scale<q8x16_t>(vres, quant_rescale, scale, new_offset); + // Store result + wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, wrapper::vgetlow(requantized_dst)); + wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, + wrapper::vgethigh(requantized_dst)); + } + else + { + const float32x4_t scale_v = vdupq_n_f32(scale); + // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero + vres1 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v)); + vres2 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v)); + vres3 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v)); + vres4 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v)); + + const q8x8_t res1 = + 
wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2))); + const q8x8_t res2 = + wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4))); + // Store result + wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, res1); + wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, res2); + } } else { - const float32x4_t scale_v = vdupq_n_f32(scale); - // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero - vres1 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v)); - vres2 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v)); - vres3 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v)); - vres4 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v)); - - const q8x8_t res1 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2))); - const q8x8_t res2 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4))); - // Store result - wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, res1); - wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, res2); - } - } - else - { - q8x16_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_128_tag{}); + q8x16_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_128_tag{}); - for(int y = pool_start_y; y < pool_end_y; ++y) - { - for(int x = pool_start_x; x < pool_end_x; ++x) + for (int y = pool_start_y; y < pool_end_y; ++y) { - const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int> - (src->info()->strides_in_bytes().z())) + x_off); - vres = wrapper::vmax(vres, data); + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const q8x16_t data = wrapper::vloadq( + reinterpret_cast<const T *>( + in.ptr() + + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) + + x_off); + vres = wrapper::vmax(vres, data); + } } - } - // Store result - wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, (src_qinfo != dst_qinfo) ? vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(vres), wrapper::vgethigh(vres), - requant_qinfo) : - vres); + // Store result + wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, + (src_qinfo != dst_qinfo) + ? 
vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(vres), + wrapper::vgethigh(vres), requant_qinfo) + : vres); + } } - } - if(pool_info.pool_type == PoolingType::MAX) - { - for(; x_off <= (window_end_x - window_half_step_x); x_off += window_half_step_x) + if (pool_info.pool_type == PoolingType::MAX) { - q8x8_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_64_tag{}); - for(int y = pool_start_y; y < pool_end_y; ++y) + for (; x_off <= (window_end_x - window_half_step_x); x_off += window_half_step_x) { - for(int x = pool_start_x; x < pool_end_x; ++x) + q8x8_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_64_tag{}); + for (int y = pool_start_y; y < pool_end_y; ++y) { - const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int> - (src->info()->strides_in_bytes().z())) + x_off); - vres = wrapper::vmax(vres, data); + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const q8x8_t data = wrapper::vload( + reinterpret_cast<const T *>( + in.ptr() + + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) + + x_off); + vres = wrapper::vmax(vres, data); + } } - } - // Store result - wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, - (src_qinfo != dst_qinfo) ? vrequantize_pooling<q8x8_t>(vres, requant_qinfo) : vres); + // Store result + wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, + (src_qinfo != dst_qinfo) ? vrequantize_pooling<q8x8_t>(vres, requant_qinfo) : vres); + } } - } - // Left-overs loop - for(; x_off < window_end_x; ++x_off) - { - if(pool_info.pool_type != PoolingType::MAX) + // Left-overs loop + for (; x_off < window_end_x; ++x_off) { - q32_t res = static_cast<q32_t>(0.f); + if (pool_info.pool_type != PoolingType::MAX) + { + q32_t res = static_cast<q32_t>(0.f); - // Calculate scale - const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); + // Calculate scale + const float scale = calculate_avg_scale_pool2d( + pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, + upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); - // Perform pooling - for(int y = pool_start_y; y < pool_end_y; ++y) - { - for(int x = pool_start_x; x < pool_end_x; ++x) + // Perform pooling + for (int y = pool_start_y; y < pool_end_y; ++y) { - const T data = *(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int> - (src->info()->strides_in_bytes().z())) + x_off); - res += data; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const T data = + *(reinterpret_cast<const T *>( + in.ptr() + + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) + + x_off); + res += data; + } } - } - if(src_qinfo != dst_qinfo) - { - const float res_f = static_cast<float>(res); - const float new_scale = quant_rescale / scale; - const auto requantized_dst = quantize<T>(res_f, UniformQuantizationInfo(new_scale, new_offset)); + if (src_qinfo != dst_qinfo) + { + const float res_f = static_cast<float>(res); + const 
float new_scale = quant_rescale / scale; + const auto requantized_dst = quantize<T>(res_f, UniformQuantizationInfo(new_scale, new_offset)); - // Store result - *(reinterpret_cast<T *>(out.ptr()) + x_off) = requantized_dst; + // Store result + *(reinterpret_cast<T *>(out.ptr()) + x_off) = requantized_dst; + } + else + { + // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero + res = static_cast<T>(0.5f + static_cast<float>(res) * scale); + + // Store result + *(reinterpret_cast<T *>(out.ptr()) + x_off) = res; + } } else { - // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero - res = static_cast<T>(0.5f + static_cast<float>(res) * scale); - - // Store result - *(reinterpret_cast<T *>(out.ptr()) + x_off) = res; - } - } - else - { - T res = std::numeric_limits<T>::min(); + T res = std::numeric_limits<T>::min(); - for(int y = pool_start_y; y < pool_end_y; ++y) - { - for(int x = pool_start_x; x < pool_end_x; ++x) + for (int y = pool_start_y; y < pool_end_y; ++y) { - const T data = *(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int> - (src->info()->strides_in_bytes().z())) + x_off); - res = std::max(res, data); + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const T data = + *(reinterpret_cast<const T *>( + in.ptr() + + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) + + x_off); + res = std::max(res, data); + } } - } - // Store result - if(src_qinfo != dst_qinfo) - { - const float res_f = static_cast<float>(res); - *(reinterpret_cast<T *>(out.ptr()) + x_off) = quantize<T>(res_f, requant_qinfo); - } - else - { - *(reinterpret_cast<T *>(out.ptr()) + x_off) = res; + // Store result + if (src_qinfo != dst_qinfo) + { + const float res_f = static_cast<float>(res); + *(reinterpret_cast<T *>(out.ptr()) + x_off) = quantize<T>(res_f, requant_qinfo); + } + else + { + *(reinterpret_cast<T *>(out.ptr()) + x_off) = res; + } } } - } - - }, - in, out); + }, + in, out); } #if defined(ENABLE_NCHW_KERNELS) template <typename T, typename TVec> -inline void scale_vector_q16x8(bool exclude_padding, TVec &v, const Coordinates &id, int id_offset, int step, - const int pool_size, const int upper_bound_w, const int upper_bound_h, - const int pad_x, const int pad_y, const int stride_x, const int stride_y) +inline void scale_vector_q16x8(bool exclude_padding, + TVec &v, + const Coordinates &id, + int id_offset, + int step, + const int pool_size, + const int upper_bound_w, + const int upper_bound_h, + const int pad_x, + const int pad_y, + const int stride_x, + const int stride_y) { int start_x = (id.x() + id_offset) * stride_x - pad_x; int start_y = id.y() * stride_y - pad_y; const int end_y = std::min(start_y + pool_size, upper_bound_h); - if(exclude_padding) + if (exclude_padding) { start_y = std::max(0, start_y); } - std::array<T, 8> elems = - { - { - wrapper::vgetlane(v, 0), - wrapper::vgetlane(v, 1), - wrapper::vgetlane(v, 2), - wrapper::vgetlane(v, 3), - wrapper::vgetlane(v, 4), - wrapper::vgetlane(v, 5), - wrapper::vgetlane(v, 6), - wrapper::vgetlane(v, 7), - } - }; - - for(auto &el : elems) + std::array<T, 8> elems = {{ + wrapper::vgetlane(v, 0), + wrapper::vgetlane(v, 1), + wrapper::vgetlane(v, 2), + wrapper::vgetlane(v, 3), + wrapper::vgetlane(v, 4), + wrapper::vgetlane(v, 5), + wrapper::vgetlane(v, 6), + wrapper::vgetlane(v, 7), + }}; 
+ + for (auto &el : elems) { int c_start_x = start_x; const int end_x = std::min(c_start_x + pool_size, upper_bound_w); - if(exclude_padding) + if (exclude_padding) { c_start_x = std::max(0, c_start_x); } @@ -326,15 +367,16 @@ inline void scale_vector_q16x8(bool exclude_padding, TVec &v, const Coordinates } template <typename T> -auto load16_boundary_aware(int srcw, int srch, int pad_l, int pad_r, int pad_t, int pad_b, int x, int y, const T *ptr, T fval) +auto load16_boundary_aware( + int srcw, int srch, int pad_l, int pad_r, int pad_t, int pad_b, int x, int y, const T *ptr, T fval) { ARM_COMPUTE_UNUSED(pad_b, pad_r); T vec[16]; //handle reading a row out of the tensor const bool row_in_bounds((y >= pad_t) && (y < (srch + pad_t))); - for(int i = 0; i < 16; i++) + for (int i = 0; i < 16; i++) { - if(row_in_bounds && (x + i >= pad_l) && (x + i < (srcw + pad_l))) + if (row_in_bounds && (x + i >= pad_l) && (x + i < (srcw + pad_l))) { vec[i] = *(ptr + i); } @@ -349,24 +391,24 @@ auto load16_boundary_aware(int srcw, int srch, int pad_l, int pad_r, int pad_t, template <typename T, typename V, bool deinterleave> inline void write16_boundary_aware(int x, int dst_w, const V &lower, const V &upper, T *ptr) { - if(deinterleave) + if (deinterleave) { - for(int i = 0; i < 8 && (i * 2 + x) < dst_w; ++i) + for (int i = 0; i < 8 && (i * 2 + x) < dst_w; ++i) { *(ptr + i * 2) = lower[i]; } - for(int i = 0; i < 8 && (i * 2 + x + 1) < dst_w; ++i) + for (int i = 0; i < 8 && (i * 2 + x + 1) < dst_w; ++i) { *(ptr + 1 + i * 2) = upper[i]; } } else { - for(int i = 0; i < 8 && (i + x) < dst_w; ++i) + for (int i = 0; i < 8 && (i + x) < dst_w; ++i) { *(ptr + i) = lower[i]; } - for(int i = 0; i < 8 && (i + x + 8) < dst_w; ++i) + for (int i = 0; i < 8 && (i + x + 8) < dst_w; ++i) { *(ptr + i + 8) = upper[i]; } @@ -376,14 +418,19 @@ inline void write16_boundary_aware(int x, int dst_w, const V &lower, const V &up template <typename T, typename V> inline void write8_boundary_aware(int x, int dst_w, const V &v, T *ptr) { - for(int i = 0; i < 8 && (i + x) < dst_w; ++i) + for (int i = 0; i < 8 && (i + x) < dst_w; ++i) { *(ptr + i) = v[i]; } } template <typename T> -void pooling2_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) +void pooling2_quantized_neon_nchw(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) { ARM_COMPUTE_UNUSED(dst1); Iterator in(src, window_src); @@ -397,129 +444,136 @@ void pooling2_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *ds using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type; using q16x8x2_t = typename wrapper::traits::neon_vector<q16_t, 16>::type; - constexpr int pool_size = 2; - int pool_stride_x = 0; - int pool_stride_y = 0; - const int pool_pad_right = pool_info.pad_stride_info.pad_right(); - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); + constexpr int pool_size = 2; + int pool_stride_x = 0; + int pool_stride_y = 0; + const int pool_pad_right = pool_info.pad_stride_info.pad_right(); + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); std::tie(pool_stride_x, pool_stride_y) = 
pool_info.pad_stride_info.stride(); - const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right); - const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom); - const T *const src_top_ptr = reinterpret_cast<const T *>(src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)))); - const T *const src_bottom_ptr = reinterpret_cast<const T *>(src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1))); + const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right); + const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom); + const T *const src_top_ptr = reinterpret_cast<const T *>( + src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)))); + const T *const src_bottom_ptr = reinterpret_cast<const T *>( + src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1))); const int scale_step_x = (pool_stride_x == 1) ? 2 : 1; const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform(); const UniformQuantizationInfo dst_qinfo = dst0->info()->quantization_info().uniform(); const bool have_different_qinfo = src_qinfo != dst_qinfo; - const float requant_scale = dst_qinfo.scale / src_qinfo.scale; - const int32_t requant_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale); - const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset); - const int src_w = src->info()->dimension(0); - const int src_h = src->info()->dimension(1); - const int dst_w = dst0->info()->dimension(0); + const float requant_scale = dst_qinfo.scale / src_qinfo.scale; + const int32_t requant_offset = + dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale); + const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset); + const int src_w = src->info()->dimension(0); + const int src_h = src->info()->dimension(1); + const int dst_w = dst0->info()->dimension(0); const T fill_value = (pool_info.pool_type == PoolingType::MAX) ? 
std::numeric_limits<T>::min() : T(0); - execute_window_loop(window, [&](const Coordinates & id) - { - const auto x_val = id.x() * pool_stride_x; - const auto y_val_0 = id.y() * pool_stride_y; - const auto y_val_1 = (id.y() * pool_stride_y) + 1; - - auto top_data = load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom, - x_val, y_val_0, reinterpret_cast<const T *>(src_top_ptr + in.offset()), fill_value); - auto bottom_data = load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom, - x_val, y_val_1, reinterpret_cast<const T *>(src_bottom_ptr + in.offset()), fill_value); + execute_window_loop( + window, + [&](const Coordinates &id) + { + const auto x_val = id.x() * pool_stride_x; + const auto y_val_0 = id.y() * pool_stride_y; + const auto y_val_1 = (id.y() * pool_stride_y) + 1; - q8x8_t lower_res = {}; - q8x8_t upper_res = {}; + auto top_data = + load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom, x_val, + y_val_0, reinterpret_cast<const T *>(src_top_ptr + in.offset()), fill_value); + auto bottom_data = + load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom, x_val, + y_val_1, reinterpret_cast<const T *>(src_bottom_ptr + in.offset()), fill_value); - if(pool_info.pool_type != PoolingType::MAX) - { - const q16x8x2_t top_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data)) } }; - const q16x8x2_t bottom_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data)) } }; + q8x8_t lower_res = {}; + q8x8_t upper_res = {}; - // Add rows - const q16x8x2_t vrsum = + if (pool_info.pool_type != PoolingType::MAX) { - { + const q16x8x2_t top_data_q16 = { + {wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data))}}; + const q16x8x2_t bottom_data_q16 = { + {wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data))}}; + + // Add rows + const q16x8x2_t vrsum = {{ wrapper::vadd(top_data_q16.val[0], bottom_data_q16.val[0]), wrapper::vadd(top_data_q16.val[1], bottom_data_q16.val[1]), - } - }; + }}; - // Pair-wise add row data - const q16x4_t vpsum_1 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[0]), wrapper::vgethigh(vrsum.val[0])); - const q16x4_t vpsum_2 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[1]), wrapper::vgethigh(vrsum.val[1])); + // Pair-wise add row data + const q16x4_t vpsum_1 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[0]), wrapper::vgethigh(vrsum.val[0])); + const q16x4_t vpsum_2 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[1]), wrapper::vgethigh(vrsum.val[1])); - q16x8_t res_lower = wrapper::vcombine(vpsum_1, vpsum_2); + q16x8_t res_lower = wrapper::vcombine(vpsum_1, vpsum_2); - // Scale lower result - scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, res_lower, id, 0, scale_step_x, - pool_size, upper_bound_w, upper_bound_h, - pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); - lower_res = wrapper::vmovn(res_lower); + // Scale lower result + scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, res_lower, id, 0, scale_step_x, pool_size, + upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, + pool_stride_x, pool_stride_y); + lower_res = wrapper::vmovn(res_lower); - // Compute upper result for stride_x == 1 - if(pool_stride_x == 1) - { - // Shifted row sum - const q16x8x2_t vrsum_shifted = + // Compute upper result for stride_x == 1 + 
if (pool_stride_x == 1) { - { - wrapper::vext_1(vrsum.val[0], vrsum.val[1]), - wrapper::vext_1(vrsum.val[1], vrsum.val[1]) - } - }; - - // Pair-wise add shifted row - q16x8_t res_upper = wrapper::vcombine( - wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[0]), wrapper::vgethigh(vrsum_shifted.val[0])), - wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[1]), wrapper::vgethigh(vrsum_shifted.val[1]))); - - // Scale upper result - scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, res_upper, id, 1, 2, - pool_size, upper_bound_w, upper_bound_h, - pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); - upper_res = wrapper::vmovn(res_upper); + // Shifted row sum + const q16x8x2_t vrsum_shifted = { + {wrapper::vext_1(vrsum.val[0], vrsum.val[1]), wrapper::vext_1(vrsum.val[1], vrsum.val[1])}}; + + // Pair-wise add shifted row + q16x8_t res_upper = wrapper::vcombine( + wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[0]), wrapper::vgethigh(vrsum_shifted.val[0])), + wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[1]), + wrapper::vgethigh(vrsum_shifted.val[1]))); + + // Scale upper result + scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, res_upper, id, 1, 2, pool_size, + upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, + pool_stride_x, pool_stride_y); + upper_res = wrapper::vmovn(res_upper); + } } - } - else - { - const q8x16_t max_data = wrapper::vmax(top_data, bottom_data); - lower_res = wrapper::vpmax(wrapper::vgetlow(max_data), wrapper::vgethigh(max_data)); - if(pool_stride_x == 1) + else { - const q8x16_t max_data_shifted = wrapper::vext_1(max_data, max_data); - upper_res = wrapper::vpmax(wrapper::vgetlow(max_data_shifted), wrapper::vgethigh(max_data_shifted)); + const q8x16_t max_data = wrapper::vmax(top_data, bottom_data); + lower_res = wrapper::vpmax(wrapper::vgetlow(max_data), wrapper::vgethigh(max_data)); + if (pool_stride_x == 1) + { + const q8x16_t max_data_shifted = wrapper::vext_1(max_data, max_data); + upper_res = wrapper::vpmax(wrapper::vgetlow(max_data_shifted), wrapper::vgethigh(max_data_shifted)); + } } - } - if(have_different_qinfo) - { - const auto requantized_dst = vrequantize_pooling<q8x8_t, q8x16_t>(lower_res, upper_res, requant_qinfo); - lower_res = wrapper::vgetlow(requantized_dst); - upper_res = wrapper::vgethigh(requantized_dst); - } - auto out_ptr = reinterpret_cast<T *>(out.ptr()); - // Store result - if(pool_stride_x == 1) - { - write16_boundary_aware<T, q8x8_t, true>(id.x(), dst_w, lower_res, upper_res, out_ptr); - } - else - { - write8_boundary_aware<T, q8x8_t>(id.x(), dst_w, lower_res, out_ptr); - } - }, - in, out); + if (have_different_qinfo) + { + const auto requantized_dst = vrequantize_pooling<q8x8_t, q8x16_t>(lower_res, upper_res, requant_qinfo); + lower_res = wrapper::vgetlow(requantized_dst); + upper_res = wrapper::vgethigh(requantized_dst); + } + auto out_ptr = reinterpret_cast<T *>(out.ptr()); + // Store result + if (pool_stride_x == 1) + { + write16_boundary_aware<T, q8x8_t, true>(id.x(), dst_w, lower_res, upper_res, out_ptr); + } + else + { + write8_boundary_aware<T, q8x8_t>(id.x(), dst_w, lower_res, out_ptr); + } + }, + in, out); } template <typename T> -void pooling3_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) +void pooling3_quantized_neon_nchw(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) { ARM_COMPUTE_UNUSED(dst1); 
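    // 3x3 quantized NCHW pooling: three 16-element rows are loaded with out-of-bounds
    // lanes filled (T(0) for AVG, the type minimum for MAX), averages are accumulated in
    // widened 16-bit lanes, and when source and destination quantization differ the result
    // is requantized with requant_scale = dst.scale / src.scale and
    // requant_offset = dst.offset - src.offset / requant_scale.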
Iterator in(src, window_src); @@ -533,13 +587,13 @@ void pooling3_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *ds using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type; using q16x8x2_t = typename wrapper::traits::neon_vector<q16_t, 16>::type; - constexpr int pool_size = 3; - const int pool_pad_right = pool_info.pad_stride_info.pad_right(); - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); - int pool_stride_x = 0; - int pool_stride_y = 0; + constexpr int pool_size = 3; + const int pool_pad_right = pool_info.pad_stride_info.pad_right(); + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); + int pool_stride_x = 0; + int pool_stride_y = 0; std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right); const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom); @@ -547,147 +601,145 @@ void pooling3_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *ds const UniformQuantizationInfo &src_qinfo = src->info()->quantization_info().uniform(); const UniformQuantizationInfo &dst_qinfo = dst0->info()->quantization_info().uniform(); - const float requant_scale = dst_qinfo.scale / src_qinfo.scale; - const int32_t requant_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale); - const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset); + const float requant_scale = dst_qinfo.scale / src_qinfo.scale; + const int32_t requant_offset = + dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale); + const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset); - const T *const src_top_ptr = reinterpret_cast<const T *>(src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)))); - const T *const src_middle_ptr = reinterpret_cast<const T *>(src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1))); - const T *const src_bottom_ptr = reinterpret_cast<const T *>(src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2))); + const T *const src_top_ptr = reinterpret_cast<const T *>( + src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)))); + const T *const src_middle_ptr = reinterpret_cast<const T *>( + src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1))); + const T *const src_bottom_ptr = reinterpret_cast<const T *>( + src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2))); const int src_w = src->info()->dimension(0); const int src_h = src->info()->dimension(1); const T fill_value = (pool_info.pool_type == PoolingType::AVG) ? 
T(0) : std::numeric_limits<T>::min(); const int dst_w = dst0->info()->dimension(0); - execute_window_loop(window, [&](const Coordinates & id) - { - const auto x_val = id.x() * pool_stride_x; - const auto y_val_0 = id.y() * pool_stride_y; - const auto y_val_1 = (id.y() * pool_stride_y) + 1; - const auto y_val_2 = (id.y() * pool_stride_y) + 2; - - auto top_data = load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom, - x_val, y_val_0, reinterpret_cast<const T *>(src_top_ptr + in.offset()), fill_value); - auto middle_data = load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom, - x_val, y_val_1, reinterpret_cast<const T *>(src_middle_ptr + in.offset()), fill_value); - auto bottom_data = load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom, - x_val, y_val_2, reinterpret_cast<const T *>(src_bottom_ptr + in.offset()), fill_value); - - q8x8_t fres = {}; - q8x16_t fqres = {}; - - if(pool_info.pool_type == PoolingType::AVG) + execute_window_loop( + window, + [&](const Coordinates &id) { - // Convert data to u16 - const q16x8x2_t top_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data)) } }; - const q16x8x2_t middle_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(middle_data)), wrapper::vmovl(wrapper::vgethigh(middle_data)) } }; - const q16x8x2_t bottom_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data)) } }; - - // Calculate row sums - const q16x8x2_t vrsum = + const auto x_val = id.x() * pool_stride_x; + const auto y_val_0 = id.y() * pool_stride_y; + const auto y_val_1 = (id.y() * pool_stride_y) + 1; + const auto y_val_2 = (id.y() * pool_stride_y) + 2; + + auto top_data = + load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom, x_val, + y_val_0, reinterpret_cast<const T *>(src_top_ptr + in.offset()), fill_value); + auto middle_data = + load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom, x_val, + y_val_1, reinterpret_cast<const T *>(src_middle_ptr + in.offset()), fill_value); + auto bottom_data = + load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom, x_val, + y_val_2, reinterpret_cast<const T *>(src_bottom_ptr + in.offset()), fill_value); + + q8x8_t fres = {}; + q8x16_t fqres = {}; + + if (pool_info.pool_type == PoolingType::AVG) { + // Convert data to u16 + const q16x8x2_t top_data_q16 = { + {wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data))}}; + const q16x8x2_t middle_data_q16 = { + {wrapper::vmovl(wrapper::vgetlow(middle_data)), wrapper::vmovl(wrapper::vgethigh(middle_data))}}; + const q16x8x2_t bottom_data_q16 = { + {wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data))}}; + + // Calculate row sums + const q16x8x2_t vrsum = {{ + wrapper::vadd(wrapper::vadd(top_data_q16.val[0], bottom_data_q16.val[0]), middle_data_q16.val[0]), + wrapper::vadd(wrapper::vadd(top_data_q16.val[1], bottom_data_q16.val[1]), middle_data_q16.val[1]), + }}; + const q16x8x2_t vrsum_shifted_1 = { + {wrapper::vext_1(vrsum.val[0], vrsum.val[1]), wrapper::vext_1(vrsum.val[1], vrsum.val[1])}}; + const q16x8x2_t vrsum_shifted_2 = { + {wrapper::vext_2(vrsum.val[0], vrsum.val[1]), wrapper::vext_2(vrsum.val[1], vrsum.val[1])}}; + // Calculate final sum + q16x8x2_t final_sum = {{ + 
wrapper::vadd(wrapper::vadd(vrsum.val[0], vrsum_shifted_1.val[0]), vrsum_shifted_2.val[0]), + wrapper::vadd(wrapper::vadd(vrsum.val[1], vrsum_shifted_1.val[1]), vrsum_shifted_2.val[1]), + }}; + if (pool_stride_x == 2) { - wrapper::vadd(wrapper::vadd(top_data_q16.val[0], bottom_data_q16.val[0]), middle_data_q16.val[0]), - wrapper::vadd(wrapper::vadd(top_data_q16.val[1], bottom_data_q16.val[1]), middle_data_q16.val[1]), + q16x8_t res = { + wrapper::vgetlane(final_sum.val[0], 0), wrapper::vgetlane(final_sum.val[0], 2), + wrapper::vgetlane(final_sum.val[0], 4), wrapper::vgetlane(final_sum.val[0], 6), + wrapper::vgetlane(final_sum.val[1], 0), wrapper::vgetlane(final_sum.val[1], 2), + wrapper::vgetlane(final_sum.val[1], 4), wrapper::vgetlane(final_sum.val[1], 6), + }; + + scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, res, id, 0, 1, pool_size, + upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, + pool_stride_x, pool_stride_y); + fres = wrapper::vmovn(res); } - }; - const q16x8x2_t vrsum_shifted_1 = - { + else { - wrapper::vext_1(vrsum.val[0], vrsum.val[1]), - wrapper::vext_1(vrsum.val[1], vrsum.val[1]) + // Scale lower result + scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, final_sum.val[0], id, 0, 1, pool_size, + upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, + pool_stride_x, pool_stride_y); + // Scale lower result + scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, final_sum.val[1], id, 8, 1, pool_size, + upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, + pool_stride_x, pool_stride_y); + fqres = wrapper::vcombine(wrapper::vmovn(final_sum.val[0]), wrapper::vmovn(final_sum.val[1])); } - }; - const q16x8x2_t vrsum_shifted_2 = + } + else { + const q8x16_t max_data = wrapper::vmax(wrapper::vmax(top_data, bottom_data), middle_data); + const q8x16_t max_data_shift1 = wrapper::vext_1(max_data, max_data); + const q8x16_t max_data_shift2 = wrapper::vext_2(max_data, max_data); + const q8x16_t final_max = wrapper::vmax(wrapper::vmax(max_data, max_data_shift1), max_data_shift2); + + if (pool_stride_x == 2) { - wrapper::vext_2(vrsum.val[0], vrsum.val[1]), - wrapper::vext_2(vrsum.val[1], vrsum.val[1]) + const q8x8x2_t table = {{wrapper::vgetlow(final_max), wrapper::vgethigh(final_max)}}; + static const q8x8_t lookup_val = {0, 2, 4, 6, 8, 10, 12, 14}; + fres = wrapper::vtbl(table, lookup_val); } - }; - // Calculate final sum - q16x8x2_t final_sum = - { + else { - wrapper::vadd(wrapper::vadd(vrsum.val[0], vrsum_shifted_1.val[0]), vrsum_shifted_2.val[0]), - wrapper::vadd(wrapper::vadd(vrsum.val[1], vrsum_shifted_1.val[1]), vrsum_shifted_2.val[1]), + fqres = final_max; } - }; - if(pool_stride_x == 2) - { - q16x8_t res = - { - wrapper::vgetlane(final_sum.val[0], 0), - wrapper::vgetlane(final_sum.val[0], 2), - wrapper::vgetlane(final_sum.val[0], 4), - wrapper::vgetlane(final_sum.val[0], 6), - wrapper::vgetlane(final_sum.val[1], 0), - wrapper::vgetlane(final_sum.val[1], 2), - wrapper::vgetlane(final_sum.val[1], 4), - wrapper::vgetlane(final_sum.val[1], 6), - }; - - scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, res, id, 0, 1, - pool_size, upper_bound_w, upper_bound_h, - pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); - fres = wrapper::vmovn(res); } - else - { - // Scale lower result - scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, final_sum.val[0], id, 0, 1, - pool_size, upper_bound_w, upper_bound_h, - pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); - // Scale lower result - 
scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, final_sum.val[1], id, 8, 1, - pool_size, upper_bound_w, upper_bound_h, - pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); - fqres = wrapper::vcombine(wrapper::vmovn(final_sum.val[0]), wrapper::vmovn(final_sum.val[1])); - } - } - else - { - const q8x16_t max_data = wrapper::vmax(wrapper::vmax(top_data, bottom_data), middle_data); - const q8x16_t max_data_shift1 = wrapper::vext_1(max_data, max_data); - const q8x16_t max_data_shift2 = wrapper::vext_2(max_data, max_data); - const q8x16_t final_max = wrapper::vmax(wrapper::vmax(max_data, max_data_shift1), max_data_shift2); - if(pool_stride_x == 2) + // Store result + if (pool_stride_x == 1) { - const q8x8x2_t table = { { wrapper::vgetlow(final_max), wrapper::vgethigh(final_max) } }; - static const q8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 }; - fres = wrapper::vtbl(table, lookup_val); + if (src_qinfo != dst_qinfo) + { + fqres = vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(fqres), wrapper::vgethigh(fqres), + requant_qinfo); + } + write16_boundary_aware<T, q8x8_t, false>(id.x(), dst_w, wrapper::vgetlow(fqres), + wrapper::vgethigh(fqres), reinterpret_cast<T *>(out.ptr())); } else { - fqres = final_max; - } - } - - // Store result - if(pool_stride_x == 1) - { - if(src_qinfo != dst_qinfo) - { - fqres = vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(fqres), wrapper::vgethigh(fqres), requant_qinfo); - } - write16_boundary_aware<T, q8x8_t, false>(id.x(), dst_w, wrapper::vgetlow(fqres), wrapper::vgethigh(fqres), reinterpret_cast<T *>(out.ptr())); - } - else - { - if(src_qinfo != dst_qinfo) - { - fres = vrequantize_pooling<q8x8_t>(fres, requant_qinfo); + if (src_qinfo != dst_qinfo) + { + fres = vrequantize_pooling<q8x8_t>(fres, requant_qinfo); + } + write8_boundary_aware<T, q8x8_t>(id.x(), dst_w, fres, reinterpret_cast<T *>(out.ptr())); } - write8_boundary_aware<T, q8x8_t>(id.x(), dst_w, fres, reinterpret_cast<T *>(out.ptr())); - } - }, - in, out); + }, + in, out); } template <typename T> -void poolingMxN_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) +void poolingMxN_quantized_neon_nchw(const ITensor *src, + ITensor *dst0, + ITensor *dst1, + PoolingLayerInfo &pool_info, + const Window &window_src, + const Window &window) { ARM_COMPUTE_UNUSED(dst1); Iterator in(src, window_src); @@ -697,74 +749,81 @@ void poolingMxN_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor * using q16_t = typename wrapper::traits::promote_t<T>; using q32_t = typename wrapper::traits::promote_t<q16_t>; - const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width; - const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.height; - const int pool_pad_right = pool_info.pad_stride_info.pad_right(); - const int pool_pad_top = pool_info.pad_stride_info.pad_top(); - const int pool_pad_left = pool_info.pad_stride_info.pad_left(); - const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); - int pool_stride_x = 0; - int pool_stride_y = 0; + const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width; + const int pool_size_y = pool_info.is_global_pooling ? 
src->info()->tensor_shape().y() : pool_info.pool_size.height; + const int pool_pad_right = pool_info.pad_stride_info.pad_right(); + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); + int pool_stride_x = 0; + int pool_stride_y = 0; std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right); const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom); - const UniformQuantizationInfo &src_qinfo = src->info()->quantization_info().uniform(); - const UniformQuantizationInfo &dst_qinfo = dst0->info()->quantization_info().uniform(); - const int src_w = src->info()->dimension(0); - const int src_h = src->info()->dimension(1); - const T fill_value = (pool_info.pool_type == PoolingType::AVG) ? T(0) : std::numeric_limits<T>::min(); - const int stridex_in_bytes = static_cast<int>(src->info()->strides_in_bytes().x()); - const int stridey_in_bytes = static_cast<int>(src->info()->strides_in_bytes().y()); - - execute_window_loop(window, [&](const Coordinates & id) - { - T res = std::numeric_limits<T>::min(); - - if(pool_info.pool_type != PoolingType::MAX) + const UniformQuantizationInfo &src_qinfo = src->info()->quantization_info().uniform(); + const UniformQuantizationInfo &dst_qinfo = dst0->info()->quantization_info().uniform(); + const int src_w = src->info()->dimension(0); + const int src_h = src->info()->dimension(1); + const T fill_value = (pool_info.pool_type == PoolingType::AVG) ? T(0) : std::numeric_limits<T>::min(); + const int stridex_in_bytes = static_cast<int>(src->info()->strides_in_bytes().x()); + const int stridey_in_bytes = static_cast<int>(src->info()->strides_in_bytes().y()); + + execute_window_loop( + window, + [&](const Coordinates &id) { - q32_t sres = 0; - - // Calculate scale - const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, - pool_stride_y); + T res = std::numeric_limits<T>::min(); - // Perform pooling - for(int y = 0; y < pool_size_y; ++y) + if (pool_info.pool_type != PoolingType::MAX) { - for(int x = 0; x < pool_size_x; ++x) + q32_t sres = 0; + + // Calculate scale + const float scale = calculate_avg_scale_pool2d( + pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, + upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + + // Perform pooling + for (int y = 0; y < pool_size_y; ++y) { - const auto in_ptr = reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * stridex_in_bytes + (y - pool_pad_top) * stridey_in_bytes); + for (int x = 0; x < pool_size_x; ++x) + { + const auto in_ptr = reinterpret_cast<const T *>( + in.ptr() + (x - pool_pad_left) * stridex_in_bytes + (y - pool_pad_top) * stridey_in_bytes); - const int idx = x + id.x() * pool_stride_x - pool_pad_left; - const int idy = y + id.y() * pool_stride_y - pool_pad_top; - const T data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *in_ptr; - sres += data; + const int idx = x + id.x() * pool_stride_x - pool_pad_left; + const int idy = y + id.y() * pool_stride_y - pool_pad_top; + const T data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? 
fill_value : *in_ptr; + sres += data; + } } + // Divide by scale + res = static_cast<T>(support::cpp11::round(sres * scale)); } - // Divide by scale - res = static_cast<T>(support::cpp11::round(sres * scale)); - } - else - { - for(int y = 0; y < pool_size_y; ++y) + else { - for(int x = 0; x < pool_size_x; ++x) + for (int y = 0; y < pool_size_y; ++y) { - const auto in_ptr = reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * stridex_in_bytes + (y - pool_pad_top) * stridey_in_bytes); + for (int x = 0; x < pool_size_x; ++x) + { + const auto in_ptr = reinterpret_cast<const T *>( + in.ptr() + (x - pool_pad_left) * stridex_in_bytes + (y - pool_pad_top) * stridey_in_bytes); - const int idx = x + id.x() * pool_stride_x - pool_pad_left; - const int idy = y + id.y() * pool_stride_y - pool_pad_top; - const T data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *in_ptr; - res = std::max(res, data); + const int idx = x + id.x() * pool_stride_x - pool_pad_left; + const int idy = y + id.y() * pool_stride_y - pool_pad_top; + const T data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *in_ptr; + res = std::max(res, data); + } } } - } - // Store result - res = (src_qinfo != dst_qinfo) ? Qasymm8QuantizationHelper<T>::quantize(Qasymm8QuantizationHelper<T>::dequantize(res, src_qinfo), dst_qinfo) : res; - *(reinterpret_cast<T *>(out.ptr())) = res; - }, - in, out); + // Store result + res = (src_qinfo != dst_qinfo) ? Qasymm8QuantizationHelper<T>::quantize( + Qasymm8QuantizationHelper<T>::dequantize(res, src_qinfo), dst_qinfo) + : res; + *(reinterpret_cast<T *>(out.ptr())) = res; + }, + in, out); } #endif /* defined(ENABLE_NCHW_KERNELS) */ } // namespace cpu diff --git a/src/cpu/kernels/pool3d/neon/impl.h b/src/cpu/kernels/pool3d/neon/impl.h index 013e25537c..ce89199b5d 100644 --- a/src/cpu/kernels/pool3d/neon/impl.h +++ b/src/cpu/kernels/pool3d/neon/impl.h @@ -25,9 +25,10 @@ #define SRC_CORE_POOLING_3D_LAYER_IMPL_H #include "arm_compute/core/Helpers.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" + #include "src/core/helpers/PoolingHelpers.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" #include "src/cpu/kernels/pool3d/neon/quantized.h" namespace arm_compute @@ -37,8 +38,13 @@ namespace cpu namespace { template <typename T> -void max_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window_out, - const int window_start_x, const int window_end_x, const int window_step_x) +void max_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, + ITensor *dst0, + Pooling3dLayerInfo &pool_info, + const Window &window_out, + const int window_start_x, + const int window_end_x, + const int window_step_x) { using vtype = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>; @@ -71,80 +77,87 @@ void max_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3d Iterator out(dst0, window_out); vector_type vres; - execute_window_loop(window_out, [&](const Coordinates & id) - { - // Computing the theoretical input starting/ending points - const int in_idx_width = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left; - const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top; - const int in_idx_depth = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front; + execute_window_loop( + window_out, + [&](const Coordinates &id) + { + // Computing the theoretical input starting/ending points + const int 
in_idx_width = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left; + const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top; + const int in_idx_depth = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front; - const int pool_start_x = std::max(0, -in_idx_width); - const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x); - const int pool_start_y = std::max(0, -in_idx_height); - const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y); + const int pool_start_x = std::max(0, -in_idx_width); + const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x); + const int pool_start_y = std::max(0, -in_idx_height); + const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y); - const int pool_start_z = std::max(0, -in_idx_depth); - const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z); + const int pool_start_z = std::max(0, -in_idx_depth); + const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z); - // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z - const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width); - const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height); - const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth); + // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z + const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width); + const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height); + const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth); - const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride; + const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride; - int x_off = window_start_x; + int x_off = window_start_x; - for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C - { - vres = wrapper::vdup_n(static_cast<T>(-std::numeric_limits<float>::infinity()), tag_type()); - for(int z = pool_start_z; z < pool_end_z; ++z) + for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C { - const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; - for(int y = pool_start_y; y < pool_end_y; ++y) + vres = wrapper::vdup_n(static_cast<T>(-std::numeric_limits<float>::infinity()), tag_type()); + for (int z = pool_start_z; z < pool_end_z; ++z) { - const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; - for(int x = pool_start_x; x < pool_end_x; ++x) + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) { - const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; - const vector_type data = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off); - vres = wrapper::vmax(vres, data); + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const vector_type data = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off); + vres = wrapper::vmax(vres, data); + } } } + // Store result + wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, vres); } - // Store result - wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, vres); - } - // Left-overs loop - for(; x_off < window_end_x; ++x_off) - { - T res(0); 
- res = -std::numeric_limits<float>::infinity(); - for(int z = pool_start_z; z < pool_end_z; ++z) + // Left-overs loop + for (; x_off < window_end_x; ++x_off) { - const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; - for(int y = pool_start_y; y < pool_end_y; ++y) + T res(0); + res = -std::numeric_limits<float>::infinity(); + for (int z = pool_start_z; z < pool_end_z; ++z) { - const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; - for(int x = pool_start_x; x < pool_end_x; ++x) + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) { - const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; - const T data = *(reinterpret_cast<const T *>(in_ptr_x) + x_off); - res = std::max(res, data); + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const T data = *(reinterpret_cast<const T *>(in_ptr_x) + x_off); + res = std::max(res, data); + } } } + // Store result + *(reinterpret_cast<T *>(out.ptr()) + x_off) = res; } - // Store result - *(reinterpret_cast<T *>(out.ptr()) + x_off) = res; - } - }, - out); + }, + out); } template <typename T> -void avg_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, - const Window &window_out, const int window_start_x, const int window_end_x, const int window_step_x) +void avg_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, + ITensor *dst0, + Pooling3dLayerInfo &pool_info, + const Window &window_out, + const int window_start_x, + const int window_end_x, + const int window_step_x) { using vtype = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>; using vector_type = typename vtype::type; @@ -183,95 +196,103 @@ void avg_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3d Iterator out(dst0, window_out); vector_type vres; - execute_window_loop(window_out, [&](const Coordinates & id) - { - // Computing the theoretical input starting/ending points - const int in_idx_width = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left; - const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top; - const int in_idx_depth = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front; - - const int pool_start_x = std::max(0, -in_idx_width); - const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x); - const int pool_start_y = std::max(0, -in_idx_height); - const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y); - - const int pool_start_z = std::max(0, -in_idx_depth); - const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z); - - // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z - const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width); - const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height); - const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth); - - const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride; - - // Calculate scale - const float scale = calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z, upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left, - pool_pad_top, pool_pad_front, pool_stride_x, - pool_stride_y, pool_stride_z); - const vector_type scale_v = 
wrapper::vdup_n(static_cast<T>(scale), tag_type()); + execute_window_loop( + window_out, + [&](const Coordinates &id) + { + // Computing the theoretical input starting/ending points + const int in_idx_width = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left; + const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top; + const int in_idx_depth = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front; + + const int pool_start_x = std::max(0, -in_idx_width); + const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x); + const int pool_start_y = std::max(0, -in_idx_height); + const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y); + + const int pool_start_z = std::max(0, -in_idx_depth); + const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z); + + // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z + const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width); + const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height); + const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth); + + const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride; + + // Calculate scale + const float scale = + calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z, + upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left, pool_pad_top, + pool_pad_front, pool_stride_x, pool_stride_y, pool_stride_z); + const vector_type scale_v = wrapper::vdup_n(static_cast<T>(scale), tag_type()); - int x_off = window_start_x; + int x_off = window_start_x; - for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C - { - // Perform pooling - vres = wrapper::vdup_n(static_cast<T>(0.0f), tag_type()); - for(int z = pool_start_z; z < pool_end_z; ++z) + for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C { - const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; - for(int y = pool_start_y; y < pool_end_y; ++y) + // Perform pooling + vres = wrapper::vdup_n(static_cast<T>(0.0f), tag_type()); + for (int z = pool_start_z; z < pool_end_z; ++z) { - const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; - for(int x = pool_start_x; x < pool_end_x; ++x) + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) { - const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; - const vector_type data = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off); - vres = wrapper::vadd(vres, data); + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const vector_type data = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off); + vres = wrapper::vadd(vres, data); + } } } - } - // Divide by scale - vres = wrapper::vmul(vres, scale_v); + // Divide by scale + vres = wrapper::vmul(vres, scale_v); - // Store result - wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, vres); - } - - // Left-overs loop - for(; x_off < window_end_x; ++x_off) - { - T res(0); + // Store result + wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, vres); + } - for(int z = pool_start_z; z < pool_end_z; ++z) + // Left-overs loop + for (; x_off < window_end_x; ++x_off) { - const uint8_t *in_ptr_z = in_ptr_n + (z + 
in_idx_depth) * w_stride; - for(int y = pool_start_y; y < pool_end_y; ++y) + T res(0); + + for (int z = pool_start_z; z < pool_end_z; ++z) { - const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; - for(int x = pool_start_x; x < pool_end_x; ++x) + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) { - const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; - const T data = *(reinterpret_cast<const T *>(in_ptr_x) + x_off); - res += data; + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const T data = *(reinterpret_cast<const T *>(in_ptr_x) + x_off); + res += data; + } } } - } - // Divide by scale - res *= scale; + // Divide by scale + res *= scale; - // Store result - *(reinterpret_cast<T *>(out.ptr()) + x_off) = res; - } - }, - out); + // Store result + *(reinterpret_cast<T *>(out.ptr()) + x_off) = res; + } + }, + out); } template <typename T> -void l2_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, - const Window &window_out, const int window_start_x, const int window_end_x, const int window_step_x) +void l2_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, + ITensor *dst0, + Pooling3dLayerInfo &pool_info, + const Window &window_out, + const int window_start_x, + const int window_end_x, + const int window_step_x) { using vtype = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>; using vector_type = typename vtype::type; @@ -310,97 +331,100 @@ void l2_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dL Iterator out(dst0, window_out); vector_type vres; - execute_window_loop(window_out, [&](const Coordinates & id) - { - // Computing the theoretical input starting/ending points - const int in_idx_width = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left; - const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top; - const int in_idx_depth = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front; + execute_window_loop( + window_out, + [&](const Coordinates &id) + { + // Computing the theoretical input starting/ending points + const int in_idx_width = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left; + const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top; + const int in_idx_depth = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front; - const int pool_start_x = std::max(0, -in_idx_width); - const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x); - const int pool_start_y = std::max(0, -in_idx_height); - const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y); + const int pool_start_x = std::max(0, -in_idx_width); + const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x); + const int pool_start_y = std::max(0, -in_idx_height); + const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y); - const int pool_start_z = std::max(0, -in_idx_depth); - const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z); + const int pool_start_z = std::max(0, -in_idx_depth); + const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z); - // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z - 
const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width); - const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height); - const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth); + // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z + const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width); + const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height); + const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth); - const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride; + const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride; - // Calculate scale - const float scale = calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z, upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left, - pool_pad_top, pool_pad_front, pool_stride_x, - pool_stride_y, pool_stride_z); + // Calculate scale + const float scale = + calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z, + upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left, pool_pad_top, + pool_pad_front, pool_stride_x, pool_stride_y, pool_stride_z); - int x_off = window_start_x; + int x_off = window_start_x; - for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C - { - // Perform pooling - vres = wrapper::vdup_n(static_cast<T>(0.0f), tag_type()); - for(int z = pool_start_z; z < pool_end_z; ++z) + for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C { - const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; - for(int y = pool_start_y; y < pool_end_y; ++y) + // Perform pooling + vres = wrapper::vdup_n(static_cast<T>(0.0f), tag_type()); + for (int z = pool_start_z; z < pool_end_z; ++z) { - const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; - for(int x = pool_start_x; x < pool_end_x; ++x) + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) { - const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; - const vector_type data = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off); - vres = wrapper::vmla(vres, data, data); + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const vector_type data = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off); + vres = wrapper::vmla(vres, data, data); + } } } - } - - const vector_type scale_v = wrapper::vdup_n(static_cast<T>(scale), tag_type()); - // Divide by scale - vres = wrapper::vmul(vres, scale_v); + const vector_type scale_v = wrapper::vdup_n(static_cast<T>(scale), tag_type()); - // Calculate square-root - vres = wrapper::vinv(wrapper::vinvsqrt(vres)); + // Divide by scale + vres = wrapper::vmul(vres, scale_v); - // Store result - wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, vres); - } + // Calculate square-root + vres = wrapper::vinv(wrapper::vinvsqrt(vres)); - // Left-overs loop - for(; x_off < window_end_x; ++x_off) - { - T res(0); + // Store result + wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, vres); + } - for(int z = pool_start_z; z < pool_end_z; ++z) + // Left-overs loop + for (; x_off < window_end_x; ++x_off) { - const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; - for(int y = pool_start_y; y < pool_end_y; ++y) + T 
res(0); + + for (int z = pool_start_z; z < pool_end_z; ++z) { - const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; - for(int x = pool_start_x; x < pool_end_x; ++x) + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) { - const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; - const T data = *(reinterpret_cast<const T *>(in_ptr_x) + x_off); - res += data * data; + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const T data = *(reinterpret_cast<const T *>(in_ptr_x) + x_off); + res += data * data; + } } } - } - // Divide by scale - res *= scale; + // Divide by scale + res *= scale; - // Square root - res = std::sqrt(res); + // Square root + res = std::sqrt(res); - // Store result - *(reinterpret_cast<T *>(out.ptr()) + x_off) = res; - } - }, - out); + // Store result + *(reinterpret_cast<T *>(out.ptr()) + x_off) = res; + } + }, + out); } } // namespace @@ -415,16 +439,19 @@ void poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLaye // Needed to handle loop left-over window_out.set(Window::DimX, Window::Dimension(0, 1, 1)); - switch(pool_info.pool_type) + switch (pool_info.pool_type) { case PoolingType::MAX: - max_poolingMxNxD_fp_neon_ndhwc<T>(src, dst0, pool_info, window_out, window_start_x, window_end_x, window_step_x); + max_poolingMxNxD_fp_neon_ndhwc<T>(src, dst0, pool_info, window_out, window_start_x, window_end_x, + window_step_x); break; case PoolingType::AVG: - avg_poolingMxNxD_fp_neon_ndhwc<T>(src, dst0, pool_info, window_out, window_start_x, window_end_x, window_step_x); + avg_poolingMxNxD_fp_neon_ndhwc<T>(src, dst0, pool_info, window_out, window_start_x, window_end_x, + window_step_x); break; case PoolingType::L2: - l2_poolingMxNxD_fp_neon_ndhwc<T>(src, dst0, pool_info, window_out, window_start_x, window_end_x, window_step_x); + l2_poolingMxNxD_fp_neon_ndhwc<T>(src, dst0, pool_info, window_out, window_start_x, window_end_x, + window_step_x); break; default: ARM_COMPUTE_ERROR("Pool operation not supported"); @@ -440,7 +467,7 @@ void poolingMxNxD_q8_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLaye // Needed to handle loop left-over window_out.set(Window::DimX, Window::Dimension(0, 1, 1)); - switch(pool_info.pool_type) + switch (pool_info.pool_type) { case PoolingType::MAX: max_poolingMxNxD_q8_neon_ndhwc<T>(src, dst0, pool_info, window_out, window_step_x); diff --git a/src/cpu/kernels/pool3d/neon/quantized.h b/src/cpu/kernels/pool3d/neon/quantized.h index ac14f5eafa..8819907901 100644 --- a/src/cpu/kernels/pool3d/neon/quantized.h +++ b/src/cpu/kernels/pool3d/neon/quantized.h @@ -26,17 +26,18 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" -#include "src/core/NEON/wrapper/wrapper.h" + #include "src/core/helpers/PoolingHelpers.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute { namespace cpu { template <typename T> -void avg_poolingMxNxD_q8_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window_out, - const int window_step_x) +void avg_poolingMxNxD_q8_neon_ndhwc( + const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window_out, const int window_step_x) { using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type; @@ -89,144 +90,147 @@ void 
avg_poolingMxNxD_q8_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3d const float quant_rescale = dst_qinfo.scale / src_qinfo.scale; // "new_offset" doesn't have to consider the "half_scale_v" in its computation // With a requantization performed in a single step there won't be uncertainties introduced - const int32_t new_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / quant_rescale); - - execute_window_loop(window_out, [&](const Coordinates & id) - { - // Computing the theoretical input starting/ending points - const int in_idx_width = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left; - const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top; - const int in_idx_depth = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front; + const int32_t new_offset = + dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / quant_rescale); - const int pool_start_x = std::max(0, -in_idx_width); - const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x); - const int pool_start_y = std::max(0, -in_idx_height); - const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y); + execute_window_loop( + window_out, + [&](const Coordinates &id) + { + // Computing the theoretical input starting/ending points + const int in_idx_width = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left; + const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top; + const int in_idx_depth = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front; - const int pool_start_z = std::max(0, -in_idx_depth); - const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z); + const int pool_start_x = std::max(0, -in_idx_width); + const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x); + const int pool_start_y = std::max(0, -in_idx_height); + const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y); - // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z - const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width); - const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height); - const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth); + const int pool_start_z = std::max(0, -in_idx_depth); + const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z); - // Calculate scale - const float scale = calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z, upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left, - pool_pad_top, pool_pad_front, pool_stride_x, pool_stride_y, pool_stride_z); + // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z + const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width); + const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height); + const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth); - const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride; + // Calculate scale + const float scale = + calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z, + upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left, pool_pad_top, + pool_pad_front, pool_stride_x, pool_stride_y, pool_stride_z); - int x_off = window_start_x; + const uint8_t *in_ptr_n = 
in_ptr_start + id[4] * n_stride; - for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C - { - q32x4_t vres1 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{}); - q32x4_t vres2 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{}); - q32x4_t vres3 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{}); - q32x4_t vres4 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{}); + int x_off = window_start_x; - // Perform pooling - for(int z = pool_start_z; z < pool_end_z; ++z) + for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C { - const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; - for(int y = pool_start_y; y < pool_end_y; ++y) + q32x4_t vres1 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{}); + q32x4_t vres2 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{}); + q32x4_t vres3 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{}); + q32x4_t vres4 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{}); + + // Perform pooling + for (int z = pool_start_z; z < pool_end_z; ++z) { - const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; - for(int x = pool_start_x; x < pool_end_x; ++x) + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) { - const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; - const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off); - - const q16x8_t data_q16 = wrapper::vmovl(wrapper::vgetlow(data)); - const q16x8_t data2_q16 = wrapper::vmovl(wrapper::vgethigh(data)); - vres1 = wrapper::vadd(vres1, wrapper::vmovl(wrapper::vgetlow(data_q16))); - vres2 = wrapper::vadd(vres2, wrapper::vmovl(wrapper::vgethigh(data_q16))); - vres3 = wrapper::vadd(vres3, wrapper::vmovl(wrapper::vgetlow(data2_q16))); - vres4 = wrapper::vadd(vres4, wrapper::vmovl(wrapper::vgethigh(data2_q16))); + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off); + + const q16x8_t data_q16 = wrapper::vmovl(wrapper::vgetlow(data)); + const q16x8_t data2_q16 = wrapper::vmovl(wrapper::vgethigh(data)); + vres1 = wrapper::vadd(vres1, wrapper::vmovl(wrapper::vgetlow(data_q16))); + vres2 = wrapper::vadd(vres2, wrapper::vmovl(wrapper::vgethigh(data_q16))); + vres3 = wrapper::vadd(vres3, wrapper::vmovl(wrapper::vgetlow(data2_q16))); + vres4 = wrapper::vadd(vres4, wrapper::vmovl(wrapper::vgethigh(data2_q16))); + } } } - } - if(src_qinfo != dst_qinfo) - { - const float32x4x4_t vres = + if (src_qinfo != dst_qinfo) { - { + const float32x4x4_t vres = {{ vcvtq_f32_q32(vres1), vcvtq_f32_q32(vres2), vcvtq_f32_q32(vres3), vcvtq_f32_q32(vres4), - } - }; - const auto requantized_dst = vrequantize_pooling_with_scale<q8x16_t>(vres, quant_rescale, scale, new_offset); - // Store result - wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, wrapper::vgetlow(requantized_dst)); - wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, wrapper::vgethigh(requantized_dst)); - } - else - { - const float32x4_t scale_v = vdupq_n_f32(scale); - // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero - vres1 = 
vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v)); - vres2 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v)); - vres3 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v)); - vres4 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v)); - - const q8x8_t res1 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2))); - const q8x8_t res2 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4))); - // Store result - wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, res1); - wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, res2); + }}; + const auto requantized_dst = + vrequantize_pooling_with_scale<q8x16_t>(vres, quant_rescale, scale, new_offset); + // Store result + wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, wrapper::vgetlow(requantized_dst)); + wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, wrapper::vgethigh(requantized_dst)); + } + else + { + const float32x4_t scale_v = vdupq_n_f32(scale); + // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero + vres1 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v)); + vres2 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v)); + vres3 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v)); + vres4 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v)); + + const q8x8_t res1 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2))); + const q8x8_t res2 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4))); + // Store result + wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, res1); + wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, res2); + } } - } - - // Left-overs loop - for(; x_off < window_end_x; ++x_off) - { - q32_t res = static_cast<q32_t>(0.f); - // Perform pooling - for(int z = pool_start_z; z < pool_end_z; ++z) + // Left-overs loop + for (; x_off < window_end_x; ++x_off) { - const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; - for(int y = pool_start_y; y < pool_end_y; ++y) + q32_t res = static_cast<q32_t>(0.f); + + // Perform pooling + for (int z = pool_start_z; z < pool_end_z; ++z) { - const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; - for(int x = pool_start_x; x < pool_end_x; ++x) + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) { - const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; - const T data = *(reinterpret_cast<const T *>(in_ptr_x) + x_off); - res += data; + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const T data = *(reinterpret_cast<const T *>(in_ptr_x) + x_off); + res += data; + } } } - } - if(src_qinfo != dst_qinfo) - { - const float res_f = static_cast<float>(res); - const float new_scale = quant_rescale / scale; - const auto requantized_dst = quantize<T>(res_f, UniformQuantizationInfo(new_scale, new_offset)); + if (src_qinfo != dst_qinfo) + { + const float res_f = static_cast<float>(res); + const float new_scale = quant_rescale / scale; + const auto requantized_dst = quantize<T>(res_f, 
UniformQuantizationInfo(new_scale, new_offset)); - // Store result - *(reinterpret_cast<T *>(out.ptr()) + x_off) = requantized_dst; - } - else - { - // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero - res = static_cast<T>(0.5f + static_cast<float>(res) * scale); + // Store result + *(reinterpret_cast<T *>(out.ptr()) + x_off) = requantized_dst; + } + else + { + // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero + res = static_cast<T>(0.5f + static_cast<float>(res) * scale); - // Store result - *(reinterpret_cast<T *>(out.ptr()) + x_off) = res; + // Store result + *(reinterpret_cast<T *>(out.ptr()) + x_off) = res; + } } - } - }, - out); + }, + out); } template <typename T> -void max_poolingMxNxD_q8_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window_out, - const int window_step_x) +void max_poolingMxNxD_q8_neon_ndhwc( + const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window_out, const int window_step_x) { using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type; @@ -266,125 +270,130 @@ void max_poolingMxNxD_q8_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3d const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform(); const UniformQuantizationInfo dst_qinfo = dst0->info()->quantization_info().uniform(); - const float requant_scale = dst_qinfo.scale / src_qinfo.scale; - const int32_t requant_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale); - const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset); - - execute_window_loop(window_out, [&](const Coordinates & id) - { - // Computing the theoretical input starting/ending points - const int in_idx_width = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left; - const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top; - const int in_idx_depth = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front; + const float requant_scale = dst_qinfo.scale / src_qinfo.scale; + const int32_t requant_offset = + dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale); + const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset); - const int pool_start_x = std::max(0, -in_idx_width); - const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x); - const int pool_start_y = std::max(0, -in_idx_height); - const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y); + execute_window_loop( + window_out, + [&](const Coordinates &id) + { + // Computing the theoretical input starting/ending points + const int in_idx_width = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left; + const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top; + const int in_idx_depth = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front; - const int pool_start_z = std::max(0, -in_idx_depth); - const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z); + const int pool_start_x = std::max(0, -in_idx_width); + const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x); + const int pool_start_y = std::max(0, -in_idx_height); + const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y); - // The end of width to consider 
in calculation should exclude PAD_X, PAD_Y and PAD_Z - const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width); - const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height); - const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth); + const int pool_start_z = std::max(0, -in_idx_depth); + const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z); - const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride; + // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z + const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width); + const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height); + const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth); - int x_off = window_start_x; + const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride; - for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C - { - q8x16_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_128_tag{}); + int x_off = window_start_x; - // Perform pooling - for(int z = pool_start_z; z < pool_end_z; ++z) + for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C { - const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; - for(int y = pool_start_y; y < pool_end_y; ++y) + q8x16_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_128_tag{}); + + // Perform pooling + for (int z = pool_start_z; z < pool_end_z; ++z) { - const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; - for(int x = pool_start_x; x < pool_end_x; ++x) + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) { - const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; - const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off); - - vres = wrapper::vmax(vres, data); + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off); + + vres = wrapper::vmax(vres, data); + } } } - } - - // Store result - wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, (src_qinfo != dst_qinfo) ? vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(vres), wrapper::vgethigh(vres), - requant_qinfo) : - vres); - } - // Leftovers using half the window step - for(; x_off <= (window_end_x - window_half_step_x); x_off += window_half_step_x) - { - q8x8_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_64_tag{}); + // Store result + wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, + (src_qinfo != dst_qinfo) + ? 
vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(vres), + wrapper::vgethigh(vres), requant_qinfo) + : vres); + } - // Perform pooling - for(int z = pool_start_z; z < pool_end_z; ++z) + // Leftovers using half the window step + for (; x_off <= (window_end_x - window_half_step_x); x_off += window_half_step_x) { - const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; - for(int y = pool_start_y; y < pool_end_y; ++y) + q8x8_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_64_tag{}); + + // Perform pooling + for (int z = pool_start_z; z < pool_end_z; ++z) { - const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; - for(int x = pool_start_x; x < pool_end_x; ++x) + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) { - const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; - const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(in_ptr_x) + x_off); - - vres = wrapper::vmax(vres, data); + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(in_ptr_x) + x_off); + + vres = wrapper::vmax(vres, data); + } } } - } - - // Store result - wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, - (src_qinfo != dst_qinfo) ? vrequantize_pooling<q8x8_t>(vres, requant_qinfo) : vres); - } - // Left-overs loop - for(; x_off < window_end_x; ++x_off) - { - T res = std::numeric_limits<T>::min(); + // Store result + wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, + (src_qinfo != dst_qinfo) ? vrequantize_pooling<q8x8_t>(vres, requant_qinfo) : vres); + } - for(int z = pool_start_z; z < pool_end_z; ++z) + // Left-overs loop + for (; x_off < window_end_x; ++x_off) { - const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; - for(int y = pool_start_y; y < pool_end_y; ++y) + T res = std::numeric_limits<T>::min(); + + for (int z = pool_start_z; z < pool_end_z; ++z) { - const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; - for(int x = pool_start_x; x < pool_end_x; ++x) + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) { - const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; - const T data = *(reinterpret_cast<const T *>(in_ptr_x) + x_off); - - res = std::max(res, data); + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const T data = *(reinterpret_cast<const T *>(in_ptr_x) + x_off); + + res = std::max(res, data); + } } } - } - // Store result - if(src_qinfo != dst_qinfo) - { - const float res_f = static_cast<float>(res); - *(reinterpret_cast<T *>(out.ptr()) + x_off) = quantize<T>(res_f, requant_qinfo); - } - else - { - *(reinterpret_cast<T *>(out.ptr()) + x_off) = res; + // Store result + if (src_qinfo != dst_qinfo) + { + const float res_f = static_cast<float>(res); + *(reinterpret_cast<T *>(out.ptr()) + x_off) = quantize<T>(res_f, requant_qinfo); + } + else + { + *(reinterpret_cast<T *>(out.ptr()) + x_off) = res; + } } - } - }, - out); + }, + out); } } // namespace cpu } // namespace arm_compute -#endif // SRC_CORE_NEON_KERNELS_POOL3D_QUANTIZED_H
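For readers following the quantized pooling changes above: the single-step requantization the kernel performs (quant_rescale, new_offset, plus the per-window averaging scale) can be summarised with a small scalar sketch. This is a simplified, standalone illustration under assumed names (requantize_avg, pool_sum and pool_scale are not ACL symbols), not the library's implementation:

#include <algorithm>
#include <cmath>
#include <cstdint>

// Requantize an averaged pooling accumulator from the source quantization
// (src_scale, src_offset) to the destination quantization (dst_scale, dst_offset)
// in a single step, mirroring the quant_rescale / new_offset computation above.
// pool_scale is the 1/N averaging factor computed per output element.
uint8_t requantize_avg(int32_t pool_sum, float pool_scale,
                       float src_scale, int32_t src_offset,
                       float dst_scale, int32_t dst_offset)
{
    const float   quant_rescale = dst_scale / src_scale;
    const int32_t new_offset    = dst_offset -
                                  static_cast<int32_t>(static_cast<float>(src_offset) / quant_rescale);
    const float   new_scale     = quant_rescale / pool_scale;
    const int32_t q             = static_cast<int32_t>(std::lround(static_cast<float>(pool_sum) / new_scale)) + new_offset;
    return static_cast<uint8_t>(std::min(255, std::max(0, q)));
}

When source and destination quantization match, the kernel instead only applies the averaging scale and rounds to nearest, which is why the vectorised path adds a 0.5f bias before converting back to integers.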
\ No newline at end of file +#endif // SRC_CORE_NEON_KERNELS_POOL3D_QUANTIZED_H diff --git a/src/cpu/kernels/range/generic/neon/fp16.cpp b/src/cpu/kernels/range/generic/neon/fp16.cpp index 5d50dce907..505c18c27d 100644 --- a/src/cpu/kernels/range/generic/neon/fp16.cpp +++ b/src/cpu/kernels/range/generic/neon/fp16.cpp @@ -23,10 +23,10 @@ */ #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) -#include "src/cpu/kernels/range/generic/neon/impl.h" - #include "arm_compute/core/Helpers.h" + #include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/kernels/range/generic/neon/impl.h" namespace arm_compute { diff --git a/src/cpu/kernels/range/generic/neon/fp32.cpp b/src/cpu/kernels/range/generic/neon/fp32.cpp index 6044f0f886..e5e472abb5 100644 --- a/src/cpu/kernels/range/generic/neon/fp32.cpp +++ b/src/cpu/kernels/range/generic/neon/fp32.cpp @@ -22,10 +22,10 @@ * SOFTWARE. */ -#include "src/cpu/kernels/range/generic/neon/impl.h" - #include "arm_compute/core/Helpers.h" + #include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/kernels/range/generic/neon/impl.h" namespace arm_compute { diff --git a/src/cpu/kernels/range/generic/neon/impl.h b/src/cpu/kernels/range/generic/neon/impl.h index 62144e6776..f8c30d52a0 100644 --- a/src/cpu/kernels/range/generic/neon/impl.h +++ b/src/cpu/kernels/range/generic/neon/impl.h @@ -26,8 +26,9 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "src/core/NEON/wrapper/wrapper.h" + #include "src/core/common/Registrars.h" +#include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute { @@ -47,35 +48,36 @@ void neon_range_function(ITensor *output, float start, float step, const Window const auto window_end_x = static_cast<int>(window.x().end()); const int window_step_x = 16 / sizeof(T); - Window win{ window }; + Window win{window}; win.set(Window::DimX, Window::Dimension(0, 1, 1)); Iterator output_it(output, win); - execute_window_loop(win, [&](const Coordinates &) - { - int x = window_start_x; - const auto out_ptr = reinterpret_cast<T *>(output_it.ptr()); - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - for(int count = 0; count < window_step_x; ++count) + int x = window_start_x; + const auto out_ptr = reinterpret_cast<T *>(output_it.ptr()); + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - id_vec = wrapper::vsetlane(static_cast<T>(x + count), id_vec, count); - } - - // start + step * id - const auto res_vec = wrapper::vmla(start_vec, id_vec, step_vec); - wrapper::vstore(out_ptr + x, res_vec); - } + for (int count = 0; count < window_step_x; ++count) + { + id_vec = wrapper::vsetlane(static_cast<T>(x + count), id_vec, count); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto res = start + x * step; - *(out_ptr + x) = res; - } + // start + step * id + const auto res_vec = wrapper::vmla(start_vec, id_vec, step_vec); + wrapper::vstore(out_ptr + x, res_vec); + } - }, - output_it); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto res = start + x * step; + *(out_ptr + x) = res; + } + }, + output_it); } } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/range/list.h b/src/cpu/kernels/range/list.h index 25d52bfe7f..cade91e8dd 100644 --- a/src/cpu/kernels/range/list.h +++ b/src/cpu/kernels/range/list.h @@ -28,8 +28,7 @@ namespace arm_compute { namespace cpu { -#define DECLARE_RANGE_KERNEL(func_name) \ - void 
func_name(ITensor *output, float start, float step, const Window &window) +#define DECLARE_RANGE_KERNEL(func_name) void func_name(ITensor *output, float start, float step, const Window &window) DECLARE_RANGE_KERNEL(fp16_neon_range_function); DECLARE_RANGE_KERNEL(fp32_neon_range_function); diff --git a/src/cpu/kernels/roialign/generic/neon/fp16.cpp b/src/cpu/kernels/roialign/generic/neon/fp16.cpp index c265d5d4eb..cf99830562 100644 --- a/src/cpu/kernels/roialign/generic/neon/fp16.cpp +++ b/src/cpu/kernels/roialign/generic/neon/fp16.cpp @@ -29,7 +29,12 @@ namespace arm_compute { namespace cpu { -void neon_fp16_roialign(const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info) +void neon_fp16_roialign(const ITensor *input, + ITensor *output, + const ITensor *rois, + ROIPoolingLayerInfo pool_info, + const Window &window, + const ThreadInfo &info) { return roi_align<float16_t, float16_t>(input, output, rois, pool_info, window, info); } diff --git a/src/cpu/kernels/roialign/generic/neon/fp32.cpp b/src/cpu/kernels/roialign/generic/neon/fp32.cpp index 51355aaef0..c1dba99b5e 100644 --- a/src/cpu/kernels/roialign/generic/neon/fp32.cpp +++ b/src/cpu/kernels/roialign/generic/neon/fp32.cpp @@ -26,7 +26,12 @@ namespace arm_compute { namespace cpu { -void neon_fp32_roialign(const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info) +void neon_fp32_roialign(const ITensor *input, + ITensor *output, + const ITensor *rois, + ROIPoolingLayerInfo pool_info, + const Window &window, + const ThreadInfo &info) { return roi_align<float, float>(input, output, rois, pool_info, window, info); } diff --git a/src/cpu/kernels/roialign/generic/neon/impl.h b/src/cpu/kernels/roialign/generic/neon/impl.h index e5e604330a..db2f67705d 100644 --- a/src/cpu/kernels/roialign/generic/neon/impl.h +++ b/src/cpu/kernels/roialign/generic/neon/impl.h @@ -46,7 +46,7 @@ inline input_data_type roi_align_1x1(const ITensor *input, float region_end_y, int pz) { - if((region_end_x <= region_start_x) || (region_end_y <= region_start_y)) + if ((region_end_x <= region_start_x) || (region_end_y <= region_start_y)) { return input_data_type(0); } @@ -55,9 +55,9 @@ inline input_data_type roi_align_1x1(const ITensor *input, const DataLayout data_layout = input->info()->data_layout(); float avg = 0; // Iterate through the aligned pooling region - for(int iy = 0; iy < grid_size_y; ++iy) + for (int iy = 0; iy < grid_size_y; ++iy) { - for(int ix = 0; ix < grid_size_x; ++ix) + for (int ix = 0; ix < grid_size_x; ++ix) { // Align the window in the middle of every bin float y = region_start_y + (iy + 0.5) * bin_size_y / float(grid_size_y); @@ -78,20 +78,28 @@ inline input_data_type roi_align_1x1(const ITensor *input, const float w2 = hy * lx; const float w3 = ly * hx; const float w4 = ly * lx; - if(data_layout == DataLayout::NCHW) + if (data_layout == DataLayout::NCHW) { - const auto data1 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_low, y_low, pz, roi_batch))); - const auto data2 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_high, y_low, pz, roi_batch))); - const auto data3 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_low, y_high, pz, roi_batch))); - const auto data4 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_high, y_high, pz, roi_batch))); + const auto data1 
= *reinterpret_cast<const input_data_type *>( + input->ptr_to_element(Coordinates(x_low, y_low, pz, roi_batch))); + const auto data2 = *reinterpret_cast<const input_data_type *>( + input->ptr_to_element(Coordinates(x_high, y_low, pz, roi_batch))); + const auto data3 = *reinterpret_cast<const input_data_type *>( + input->ptr_to_element(Coordinates(x_low, y_high, pz, roi_batch))); + const auto data4 = *reinterpret_cast<const input_data_type *>( + input->ptr_to_element(Coordinates(x_high, y_high, pz, roi_batch))); avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4; } else { - const auto data1 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_low, y_low, roi_batch))); - const auto data2 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_high, y_low, roi_batch))); - const auto data3 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_low, y_high, roi_batch))); - const auto data4 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_high, y_high, roi_batch))); + const auto data1 = *reinterpret_cast<const input_data_type *>( + input->ptr_to_element(Coordinates(pz, x_low, y_low, roi_batch))); + const auto data2 = *reinterpret_cast<const input_data_type *>( + input->ptr_to_element(Coordinates(pz, x_high, y_low, roi_batch))); + const auto data3 = *reinterpret_cast<const input_data_type *>( + input->ptr_to_element(Coordinates(pz, x_low, y_high, roi_batch))); + const auto data4 = *reinterpret_cast<const input_data_type *>( + input->ptr_to_element(Coordinates(pz, x_high, y_high, roi_batch))); avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4; } } @@ -117,21 +125,21 @@ inline input_data_type roi_align_1x1_qasymm8(const ITensor *input, int pz, const QuantizationInfo &out_qinfo) { - if((region_end_x <= region_start_x) || (region_end_y <= region_start_y)) + if ((region_end_x <= region_start_x) || (region_end_y <= region_start_y)) { return input_data_type(out_qinfo.uniform().offset); } else { - float avg = 0; - const UniformQuantizationInfo input_qinfo = input->info()->quantization_info().uniform(); - const bool is_qasymm_signed = is_data_type_quantized_asymmetric_signed(input->info()->data_type()); - const DataLayout data_layout = input->info()->data_layout(); + float avg = 0; + const UniformQuantizationInfo input_qinfo = input->info()->quantization_info().uniform(); + const bool is_qasymm_signed = is_data_type_quantized_asymmetric_signed(input->info()->data_type()); + const DataLayout data_layout = input->info()->data_layout(); // Iterate through the aligned pooling region - for(int iy = 0; iy < grid_size_y; ++iy) + for (int iy = 0; iy < grid_size_y; ++iy) { - for(int ix = 0; ix < grid_size_x; ++ix) + for (int ix = 0; ix < grid_size_x; ++ix) { // Align the window in the middle of every bin float y = region_start_y + (iy + 0.5) * bin_size_y / float(grid_size_y); @@ -153,41 +161,89 @@ inline input_data_type roi_align_1x1_qasymm8(const ITensor *input, const float w3 = ly * hx; const float w4 = ly * lx; - if(data_layout == DataLayout::NCHW) + if (data_layout == DataLayout::NCHW) { - if(is_qasymm_signed) + if (is_qasymm_signed) { - float data1 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_low, y_low, pz, roi_batch))), input_qinfo); - float data2 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_high, y_low, pz, roi_batch))), input_qinfo); - float 
data3 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_low, y_high, pz, roi_batch))), input_qinfo); - float data4 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_high, y_high, pz, roi_batch))), input_qinfo); + float data1 = + dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element( + Coordinates(x_low, y_low, pz, roi_batch))), + input_qinfo); + float data2 = + dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element( + Coordinates(x_high, y_low, pz, roi_batch))), + input_qinfo); + float data3 = + dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element( + Coordinates(x_low, y_high, pz, roi_batch))), + input_qinfo); + float data4 = + dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element( + Coordinates(x_high, y_high, pz, roi_batch))), + input_qinfo); avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4; } else { - float data1 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_low, y_low, pz, roi_batch))), input_qinfo); - float data2 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_high, y_low, pz, roi_batch))), input_qinfo); - float data3 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_low, y_high, pz, roi_batch))), input_qinfo); - float data4 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_high, y_high, pz, roi_batch))), input_qinfo); + float data1 = + dequantize_qasymm8(*reinterpret_cast<const input_data_type *>( + input->ptr_to_element(Coordinates(x_low, y_low, pz, roi_batch))), + input_qinfo); + float data2 = + dequantize_qasymm8(*reinterpret_cast<const input_data_type *>( + input->ptr_to_element(Coordinates(x_high, y_low, pz, roi_batch))), + input_qinfo); + float data3 = + dequantize_qasymm8(*reinterpret_cast<const input_data_type *>( + input->ptr_to_element(Coordinates(x_low, y_high, pz, roi_batch))), + input_qinfo); + float data4 = + dequantize_qasymm8(*reinterpret_cast<const input_data_type *>( + input->ptr_to_element(Coordinates(x_high, y_high, pz, roi_batch))), + input_qinfo); avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4; } } else { - if(is_qasymm_signed) + if (is_qasymm_signed) { - const auto data1 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_low, y_low, roi_batch))), input_qinfo); - const auto data2 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_high, y_low, roi_batch))), input_qinfo); - const auto data3 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_low, y_high, roi_batch))), input_qinfo); - const auto data4 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_high, y_high, roi_batch))), input_qinfo); + const auto data1 = + dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element( + Coordinates(pz, x_low, y_low, roi_batch))), + input_qinfo); + const auto data2 = + dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element( + Coordinates(pz, x_high, y_low, roi_batch))), + input_qinfo); + const auto 
data3 = + dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element( + Coordinates(pz, x_low, y_high, roi_batch))), + input_qinfo); + const auto data4 = + dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element( + Coordinates(pz, x_high, y_high, roi_batch))), + input_qinfo); avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4; } else { - const auto data1 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_low, y_low, roi_batch))), input_qinfo); - const auto data2 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_high, y_low, roi_batch))), input_qinfo); - const auto data3 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_low, y_high, roi_batch))), input_qinfo); - const auto data4 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_high, y_high, roi_batch))), input_qinfo); + const auto data1 = + dequantize_qasymm8(*reinterpret_cast<const input_data_type *>( + input->ptr_to_element(Coordinates(pz, x_low, y_low, roi_batch))), + input_qinfo); + const auto data2 = + dequantize_qasymm8(*reinterpret_cast<const input_data_type *>( + input->ptr_to_element(Coordinates(pz, x_high, y_low, roi_batch))), + input_qinfo); + const auto data3 = + dequantize_qasymm8(*reinterpret_cast<const input_data_type *>( + input->ptr_to_element(Coordinates(pz, x_low, y_high, roi_batch))), + input_qinfo); + const auto data4 = + dequantize_qasymm8(*reinterpret_cast<const input_data_type *>( + input->ptr_to_element(Coordinates(pz, x_high, y_high, roi_batch))), + input_qinfo); avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4; } } @@ -197,7 +253,7 @@ inline input_data_type roi_align_1x1_qasymm8(const ITensor *input, avg /= grid_size_x * grid_size_y; input_data_type res = 0; - if(is_qasymm_signed) + if (is_qasymm_signed) { res = quantize_qasymm8_signed(avg, out_qinfo); } @@ -215,7 +271,12 @@ inline float compute_region_coordinate(int p, float bin_size, float roi_anchor, } template <typename input_data_type, typename roi_data_type> -void roi_align(const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info) +void roi_align(const ITensor *input, + ITensor *output, + const ITensor *rois, + ROIPoolingLayerInfo pool_info, + const Window &window, + const ThreadInfo &info) { ARM_COMPUTE_UNUSED(info); @@ -240,7 +301,7 @@ void roi_align(const ITensor *input, ITensor *output, const ITensor *rois, ROIPo const auto *rois_ptr = reinterpret_cast<const roi_data_type *>(rois->buffer()); const QuantizationInfo &rois_qinfo = rois->info()->quantization_info(); - for(int roi_indx = roi_list_start; roi_indx < roi_list_end; ++roi_indx) + for (int roi_indx = roi_list_start; roi_indx < roi_list_end; ++roi_indx) { const unsigned int roi_batch = rois_ptr[values_per_roi * roi_indx]; @@ -252,7 +313,7 @@ void roi_align(const ITensor *input, ITensor *output, const ITensor *rois, ROIPo float x2(qx2); float y1(qy1); float y2(qy2); - if(is_qasymm) + if (is_qasymm) { x1 = dequantize_qasymm16(qx1, rois_qinfo); x2 = dequantize_qasymm16(qx2, rois_qinfo); @@ -267,44 +328,47 @@ void roi_align(const ITensor *input, ITensor *output, const ITensor *rois, ROIPo float bin_size_y = roi_dims_y / pool_info.pooled_height(); // Iterate through all feature maps - for(int ch = 0; ch < input_chanels; ++ch) + for 
(int ch = 0; ch < input_chanels; ++ch) { // Iterate through all output pixels - for(int py = 0; py < pooled_h; ++py) + for (int py = 0; py < pooled_h; ++py) { - for(int px = 0; px < pooled_w; ++px) + for (int px = 0; px < pooled_w; ++px) { - const float region_start_x = compute_region_coordinate(px, bin_size_x, roi_anchor_x, input_width); - const float region_start_y = compute_region_coordinate(py, bin_size_y, roi_anchor_y, input_height); - const float region_end_x = compute_region_coordinate(px + 1, bin_size_x, roi_anchor_x, input_width); - const float region_end_y = compute_region_coordinate(py + 1, bin_size_y, roi_anchor_y, input_height); - const int roi_bin_grid_x = (pool_info.sampling_ratio() > 0) ? pool_info.sampling_ratio() : int(ceil(bin_size_x)); - const int roi_bin_grid_y = (pool_info.sampling_ratio() > 0) ? pool_info.sampling_ratio() : int(ceil(bin_size_y)); + const float region_start_x = compute_region_coordinate(px, bin_size_x, roi_anchor_x, input_width); + const float region_start_y = compute_region_coordinate(py, bin_size_y, roi_anchor_y, input_height); + const float region_end_x = compute_region_coordinate(px + 1, bin_size_x, roi_anchor_x, input_width); + const float region_end_y = + compute_region_coordinate(py + 1, bin_size_y, roi_anchor_y, input_height); + const int roi_bin_grid_x = + (pool_info.sampling_ratio() > 0) ? pool_info.sampling_ratio() : int(ceil(bin_size_x)); + const int roi_bin_grid_y = + (pool_info.sampling_ratio() > 0) ? pool_info.sampling_ratio() : int(ceil(bin_size_y)); input_data_type out_val(0); - if(is_qasymm) + if (is_qasymm) { out_val = roi_align_1x1_qasymm8<input_data_type>( - input, roi_batch, region_start_x, bin_size_x, - roi_bin_grid_x, region_end_x, region_start_y, bin_size_y, - roi_bin_grid_y, region_end_y, ch, output->info()->quantization_info()); + input, roi_batch, region_start_x, bin_size_x, roi_bin_grid_x, region_end_x, region_start_y, + bin_size_y, roi_bin_grid_y, region_end_y, ch, output->info()->quantization_info()); } else { - out_val = roi_align_1x1<input_data_type>( - input, roi_batch, region_start_x, bin_size_x, - roi_bin_grid_x, region_end_x, region_start_y, bin_size_y, - roi_bin_grid_y, region_end_y, ch); + out_val = roi_align_1x1<input_data_type>(input, roi_batch, region_start_x, bin_size_x, + roi_bin_grid_x, region_end_x, region_start_y, + bin_size_y, roi_bin_grid_y, region_end_y, ch); } - if(data_layout == DataLayout::NCHW) + if (data_layout == DataLayout::NCHW) { - auto out_ptr = reinterpret_cast<input_data_type *>(output->ptr_to_element(Coordinates(px, py, ch, roi_indx))); - *out_ptr = out_val; + auto out_ptr = reinterpret_cast<input_data_type *>( + output->ptr_to_element(Coordinates(px, py, ch, roi_indx))); + *out_ptr = out_val; } else { - auto out_ptr = reinterpret_cast<input_data_type *>(output->ptr_to_element(Coordinates(ch, px, py, roi_indx))); - *out_ptr = out_val; + auto out_ptr = reinterpret_cast<input_data_type *>( + output->ptr_to_element(Coordinates(ch, px, py, roi_indx))); + *out_ptr = out_val; } } } diff --git a/src/cpu/kernels/roialign/generic/neon/qasymm8.cpp b/src/cpu/kernels/roialign/generic/neon/qasymm8.cpp index d6bd9a95ce..11c5770f53 100644 --- a/src/cpu/kernels/roialign/generic/neon/qasymm8.cpp +++ b/src/cpu/kernels/roialign/generic/neon/qasymm8.cpp @@ -26,7 +26,12 @@ namespace arm_compute { namespace cpu { -void neon_qu8_roialign(const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info) +void neon_qu8_roialign(const ITensor 
*input, + ITensor *output, + const ITensor *rois, + ROIPoolingLayerInfo pool_info, + const Window &window, + const ThreadInfo &info) { return roi_align<uint8_t, uint16_t>(input, output, rois, pool_info, window, info); } diff --git a/src/cpu/kernels/roialign/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/roialign/generic/neon/qasymm8_signed.cpp index a839581aff..7f93cc87b3 100644 --- a/src/cpu/kernels/roialign/generic/neon/qasymm8_signed.cpp +++ b/src/cpu/kernels/roialign/generic/neon/qasymm8_signed.cpp @@ -26,7 +26,12 @@ namespace arm_compute { namespace cpu { -void neon_qs8_roialign(const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info) +void neon_qs8_roialign(const ITensor *input, + ITensor *output, + const ITensor *rois, + ROIPoolingLayerInfo pool_info, + const Window &window, + const ThreadInfo &info) { return roi_align<int8_t, uint16_t>(input, output, rois, pool_info, window, info); } diff --git a/src/cpu/kernels/roialign/list.h b/src/cpu/kernels/roialign/list.h index 1c71b02488..fdb3c0050d 100644 --- a/src/cpu/kernels/roialign/list.h +++ b/src/cpu/kernels/roialign/list.h @@ -27,9 +27,9 @@ namespace arm_compute { namespace cpu { -#define DECLARE_ROIALIGN_KERNEL(func_name) \ - void func_name(const ITensor *input, ITensor *output, const ITensor *rois, \ - ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info) +#define DECLARE_ROIALIGN_KERNEL(func_name) \ + void func_name(const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, \ + const Window &window, const ThreadInfo &info) DECLARE_ROIALIGN_KERNEL(neon_fp32_roialign); DECLARE_ROIALIGN_KERNEL(neon_fp16_roialign); DECLARE_ROIALIGN_KERNEL(neon_qu8_roialign); diff --git a/src/cpu/kernels/scale/neon/fp16.cpp b/src/cpu/kernels/scale/neon/fp16.cpp index 895f42215e..bd01569cc4 100644 --- a/src/cpu/kernels/scale/neon/fp16.cpp +++ b/src/cpu/kernels/scale/neon/fp16.cpp @@ -27,9 +27,10 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Window.h" + +#include "src/core/helpers/ScaleHelpers.h" #include "src/core/NEON/NEMath.h" #include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/ScaleHelpers.h" #include "src/core/utils/ScaleUtils.h" #include "support/Rounding.h" @@ -41,8 +42,12 @@ namespace arm_compute { namespace { -void fp16_neon_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets, - float sampling_offset, bool align_corners, const Window &window) +void fp16_neon_scale_nearest(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + float sampling_offset, + bool align_corners, + const Window &window) { const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; @@ -62,33 +67,46 @@ void fp16_neon_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *of const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; - execute_window_loop(win, [&](const Coordinates & id) - { - const int32_t offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; - const auto in_hi = static_cast<int>(align_corners ? 
utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr)); - const int offset_row = in_hi * in_stride_wc; - int32_t x = window_start_x; - const float16_t *in_ptr = reinterpret_cast<const float16_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]); - - for(; x <= window_end_x - window_step_x; x += window_step_x) - { - wrapper::vstore(reinterpret_cast<float16_t *>(out.ptr()) + x, - wrapper::vloadq(in_ptr + offset + offset_row + x)); - } - for(; x < window_end_x; ++x) + execute_window_loop( + win, + [&](const Coordinates &id) { - *(reinterpret_cast<float16_t *>(out.ptr()) + x) = *(in_ptr + offset + offset_row + x); - } - }, - out); + const int32_t offset = + *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; + const auto in_hi = static_cast<int>( + align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) + : std::floor((id.z() + sampling_offset) * hr)); + const int offset_row = in_hi * in_stride_wc; + int32_t x = window_start_x; + const float16_t *in_ptr = reinterpret_cast<const float16_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]); + + for (; x <= window_end_x - window_step_x; x += window_step_x) + { + wrapper::vstore(reinterpret_cast<float16_t *>(out.ptr()) + x, + wrapper::vloadq(in_ptr + offset + offset_row + x)); + } + for (; x < window_end_x; ++x) + { + *(reinterpret_cast<float16_t *>(out.ptr()) + x) = *(in_ptr + offset + offset_row + x); + } + }, + out); } -void fp16_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) +void fp16_neon_scale_bilinear(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) { // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); + const auto hr = + scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); Iterator out(dst, window); const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; @@ -103,68 +121,97 @@ void fp16_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *o win_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); Iterator in(src, win_in); - if(border_mode == BorderMode::CONSTANT) + if (border_mode == BorderMode::CONSTANT) { using ConstType = typename std::conditional<std::is_same<float16_t, float16_t>::value, half, float16_t>::type; const float16_t const_border_value = static_cast<float16_t>(constant_border_value.get<ConstType>()); - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z()))); - const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); - const float16_t *in_ptr = reinterpret_cast<const float16_t *>(in.ptr()) + offset * 
in_stride_c + in_hi * in_stride_wc; - - const auto a00 = (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value; - const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) ? *(in_ptr + in_stride_c) : const_border_value; - const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_wc) : const_border_value; - const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_c + in_stride_wc) : const_border_value; - - *reinterpret_cast<float16_t *>(out.ptr()) = static_cast<float16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); - }, - in, out); + execute_window_loop( + window, + [&](const Coordinates &id) + { + const auto offset = + *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z()))); + const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); + const float16_t *in_ptr = + reinterpret_cast<const float16_t *>(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc; + + const auto a00 = + (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value; + const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) + ? *(in_ptr + in_stride_c) + : const_border_value; + const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) + ? *(in_ptr + in_stride_wc) + : const_border_value; + const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) + ? 
*(in_ptr + in_stride_c + in_stride_wc) + : const_border_value; + + *reinterpret_cast<float16_t *>(out.ptr()) = + static_cast<float16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); + }, + in, out); } - else if(border_mode == BorderMode::REPLICATE) + else if (border_mode == BorderMode::REPLICATE) { - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z()))); - const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); - - auto clamped_w = utility::clamp<int>(offset, 0, in_dim_w - 1); - auto clamped_w1 = utility::clamp<int>(offset + 1, 0, in_dim_w - 1); - auto clamped_h = utility::clamp<int>(in_hi, 0, in_dim_h - 1); - auto clamped_h1 = utility::clamp<int>(in_hi + 1, 0, in_dim_h - 1); - - const auto a00 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc); - const auto a01 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h * in_stride_wc); - const auto a10 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h1 * in_stride_wc); - const auto a11 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h1 * in_stride_wc); - - *reinterpret_cast<float16_t *>(out.ptr()) = static_cast<float16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); - }, - in, out); + execute_window_loop( + window, + [&](const Coordinates &id) + { + const auto offset = + *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z()))); + const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); + + auto clamped_w = utility::clamp<int>(offset, 0, in_dim_w - 1); + auto clamped_w1 = utility::clamp<int>(offset + 1, 0, in_dim_w - 1); + auto clamped_h = utility::clamp<int>(in_hi, 0, in_dim_h - 1); + auto clamped_h1 = utility::clamp<int>(in_hi + 1, 0, in_dim_h - 1); + + const auto a00 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w * in_stride_c + + clamped_h * in_stride_wc); + const auto a01 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w1 * in_stride_c + + clamped_h * in_stride_wc); + const auto a10 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w * in_stride_c + + clamped_h1 * in_stride_wc); + const auto a11 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w1 * in_stride_c + + clamped_h1 * in_stride_wc); + + *reinterpret_cast<float16_t *>(out.ptr()) = + static_cast<float16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); + }, + in, out); } else { ARM_COMPUTE_ERROR("Not implemented"); } } -} +} // namespace namespace cpu { -void fp16_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) +void fp16_neon_scale(const ITensor *src, + ITensor *dst, + const ITensor *offsets, 
+ const ITensor *dx, + const ITensor *dy, + InterpolationPolicy policy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) { - if(policy == InterpolationPolicy::BILINEAR) + if (policy == InterpolationPolicy::BILINEAR) { - fp16_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window); + fp16_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, + align_corners, window); } - else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) + else if (policy == InterpolationPolicy::NEAREST_NEIGHBOR) { fp16_neon_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); } @@ -172,4 +219,4 @@ void fp16_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, c } // namespace cpu } // namespace arm_compute -#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
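The bilinear branch of the scale kernels above blends four neighbouring input samples using the fractional offsets dx and dy read from the precomputed dx/dy tensors. As a rough, standalone sketch of that weighting (plain bilinear interpolation; bilinear_blend is an illustrative name, not the scale_helpers::delta_bilinear helper itself):

// Generic bilinear blend of four neighbours a00..a11 given fractional
// offsets dx, dy in [0, 1); this is, up to rounding, the combination the
// scale kernels above delegate to their bilinear helper.
inline float bilinear_blend(float a00, float a01, float a10, float a11, float dx, float dy)
{
    const float dx1 = 1.0f - dx;
    const float dy1 = 1.0f - dy;
    return a00 * dx1 * dy1   // top-left neighbour
         + a01 * dx  * dy1   // top-right neighbour
         + a10 * dx1 * dy    // bottom-left neighbour
         + a11 * dx  * dy;   // bottom-right neighbour
}

The border handling around it (CONSTANT vs. REPLICATE) only changes where a00..a11 are read from, not the blend itself.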
\ No newline at end of file +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/scale/neon/integer.cpp b/src/cpu/kernels/scale/neon/integer.cpp index 2ab14cf83a..bbf92e0412 100644 --- a/src/cpu/kernels/scale/neon/integer.cpp +++ b/src/cpu/kernels/scale/neon/integer.cpp @@ -22,8 +22,9 @@ * SOFTWARE. */ #include "arm_compute/core/Helpers.h" -#include "src/core/NEON/wrapper/wrapper.h" + #include "src/core/helpers/ScaleHelpers.h" +#include "src/core/NEON/wrapper/wrapper.h" #include "src/core/utils/ScaleUtils.h" #include "support/Rounding.h" @@ -33,8 +34,12 @@ namespace arm_compute { namespace { -void u8_neon_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets, - float sampling_offset, bool align_corners, const Window &window) +void u8_neon_scale_nearest(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + float sampling_offset, + bool align_corners, + const Window &window) { const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; @@ -54,43 +59,58 @@ void u8_neon_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offs const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; - execute_window_loop(win, [&](const Coordinates & id) - { - const int32_t offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; - const auto in_hi = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr)); - const int offset_row = in_hi * in_stride_wc; - int32_t x = window_start_x; - const uint8_t *in_ptr = reinterpret_cast<const uint8_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]); - - for(; x <= window_end_x - window_step_x; x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &id) { - wrapper::vstore(reinterpret_cast<uint8_t *>(out.ptr()) + x, - wrapper::vloadq(in_ptr + offset + offset_row + x)); - } - for(; x < window_end_x; ++x) - { - *(reinterpret_cast<uint8_t *>(out.ptr()) + x) = *(in_ptr + offset + offset_row + x); - } - }, - out); + const int32_t offset = + *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; + const auto in_hi = static_cast<int>( + align_corners ? 
utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) + : std::floor((id.z() + sampling_offset) * hr)); + const int offset_row = in_hi * in_stride_wc; + int32_t x = window_start_x; + const uint8_t *in_ptr = reinterpret_cast<const uint8_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]); + + for (; x <= window_end_x - window_step_x; x += window_step_x) + { + wrapper::vstore(reinterpret_cast<uint8_t *>(out.ptr()) + x, + wrapper::vloadq(in_ptr + offset + offset_row + x)); + } + for (; x < window_end_x; ++x) + { + *(reinterpret_cast<uint8_t *>(out.ptr()) + x) = *(in_ptr + offset + offset_row + x); + } + }, + out); } -void u8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) +void u8_neon_scale_bilinear(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) { // Compute the ratio between source and destination dimensions - const float scale_x = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners); - const float scale_y = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); + const float scale_x = + scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners); + const float scale_y = + scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); const int input_width = src->info()->dimension(1); const int input_height = src->info()->dimension(2); - if(border_mode == BorderMode::CONSTANT) + if (border_mode == BorderMode::CONSTANT) { Iterator out(dst, window); - const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; - const int in_stride_wc = in_stride_c * (input_width + src->info()->padding().top + src->info()->padding().bottom); + const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; + const int in_stride_wc = + in_stride_c * (input_width + src->info()->padding().top + src->info()->padding().bottom); // Don't increment in Y and Z direction for the input tensor // A pointer to the start of this plane is needed as base for the precomputed offsets @@ -100,24 +120,37 @@ void u8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *off Iterator in(src, win_in); const uint8_t const_border_value = static_cast<uint8_t>(constant_border_value.get<uint8_t>()); - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z()))); - const int32_t in_hi = std::floor((id.z() + sampling_offset) * scale_y - sampling_offset); - const uint8_t *in_ptr = reinterpret_cast<const uint8_t *>(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc; - - const auto a00 = (0 <= offset && offset < input_width && 0 <= in_hi && in_hi < input_height) ? 
*in_ptr : const_border_value; - const auto a01 = (-1 <= offset && offset < input_width - 1 && 0 <= in_hi && in_hi < input_height) ? *(in_ptr + in_stride_c) : const_border_value; - const auto a10 = (0 <= offset && offset < input_width && -1 <= in_hi && in_hi < input_height - 1) ? *(in_ptr + in_stride_wc) : const_border_value; - const auto a11 = (-1 <= offset && offset < input_width - 1 && -1 <= in_hi && in_hi < input_height - 1) ? *(in_ptr + in_stride_c + in_stride_wc) : const_border_value; - - *reinterpret_cast<uint8_t *>(out.ptr()) = static_cast<uint8_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); - }, - in, out); + execute_window_loop( + window, + [&](const Coordinates &id) + { + const auto offset = + *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z()))); + const int32_t in_hi = std::floor((id.z() + sampling_offset) * scale_y - sampling_offset); + const uint8_t *in_ptr = + reinterpret_cast<const uint8_t *>(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc; + + const auto a00 = (0 <= offset && offset < input_width && 0 <= in_hi && in_hi < input_height) + ? *in_ptr + : const_border_value; + const auto a01 = (-1 <= offset && offset < input_width - 1 && 0 <= in_hi && in_hi < input_height) + ? *(in_ptr + in_stride_c) + : const_border_value; + const auto a10 = (0 <= offset && offset < input_width && -1 <= in_hi && in_hi < input_height - 1) + ? *(in_ptr + in_stride_wc) + : const_border_value; + const auto a11 = (-1 <= offset && offset < input_width - 1 && -1 <= in_hi && in_hi < input_height - 1) + ? 
*(in_ptr + in_stride_c + in_stride_wc) + : const_border_value; + + *reinterpret_cast<uint8_t *>(out.ptr()) = + static_cast<uint8_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); + }, + in, out); } - else if(border_mode == BorderMode::REPLICATE) + else if (border_mode == BorderMode::REPLICATE) { using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<float, wrapper::traits::BitWidth::W128>; @@ -152,12 +185,12 @@ void u8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *off const float fp_coord_offset_y = sampling_offset * (scale_y - 1); const float fp_coord_offset_x = sampling_offset * (scale_x - 1); - for(int bo = bo_start; bo < bo_end; bo += bo_step) + for (int bo = bo_start; bo < bo_end; bo += bo_step) { const uint8_t *in_ptr = in.ptr() + bo * in_stride_b; uint8_t *out_ptr = out.ptr() + bo * out_stride_b; - for(int yo = yo_start; yo < yo_end; yo += yo_step) + for (int yo = yo_start; yo < yo_end; yo += yo_step) { // Floating-point coordinate const float yi_f = yo * scale_y + fp_coord_offset_y; @@ -174,7 +207,7 @@ void u8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *off const uint8_t *in_ptr_yi1 = in_ptr + yi1 * in_stride_y; uint8_t *out_ptr_yo = out_ptr + yo * out_stride_y; - for(int xo = xo_start; xo < xo_end; xo += xo_step) + for (int xo = xo_start; xo < xo_end; xo += xo_step) { // Floating-point coordinate const float xi_f = xo * scale_x + fp_coord_offset_x; @@ -205,7 +238,7 @@ void u8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *off uint8_t *out_ptr_xo_yo = out_ptr_yo + xo * out_stride_x; int cout = 0; - for(; cout <= (out_dim_ch - step_cout); cout += step_cout) + for (; cout <= (out_dim_ch - step_cout); cout += step_cout) { const auto in00 = wrapper::vloadq(in_ptr_xi0_yi0 + cout * sizeof(uint8_t)); const auto in01 = wrapper::vloadq(in_ptr_xi1_yi0 + cout * sizeof(uint8_t)); @@ -270,19 +303,21 @@ void u8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *off const auto out_2_int = wrapper::vcvta<uint32_t>(out_2); const auto out_3_int = wrapper::vcvta<uint32_t>(out_3); #else // defined(__aarch64__) && !defined(BARE_METAL) - const auto out_0_int = wrapper::vcvt<uint32_t>(out_0); - const auto out_1_int = wrapper::vcvt<uint32_t>(out_1); - const auto out_2_int = wrapper::vcvt<uint32_t>(out_2); - const auto out_3_int = wrapper::vcvt<uint32_t>(out_3); + const auto out_0_int = wrapper::vcvt<uint32_t>(out_0); + const auto out_1_int = wrapper::vcvt<uint32_t>(out_1); + const auto out_2_int = wrapper::vcvt<uint32_t>(out_2); + const auto out_3_int = wrapper::vcvt<uint32_t>(out_3); #endif // defined(__aarch64__) && !defined(BARE_METAL) - const auto low_part = wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_0_int), wrapper::vqmovn(out_1_int))); - const auto high_part = wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_2_int), wrapper::vqmovn(out_3_int))); - const auto out = wrapper::vcombine(low_part, high_part); + const auto low_part = + wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_0_int), wrapper::vqmovn(out_1_int))); + const auto high_part = + wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_2_int), wrapper::vqmovn(out_3_int))); + const auto out = wrapper::vcombine(low_part, high_part); wrapper::vstore(out_ptr_xo_yo + cout * sizeof(uint8_t), out); } - for(; cout < out_dim_ch; ++cout) + for (; cout < out_dim_ch; ++cout) { const uint8_t in00 = *(in_ptr_xi0_yi0 + cout * sizeof(uint8_t)); const uint8_t in01 = *(in_ptr_xi1_yi0 + cout * sizeof(uint8_t)); @@ 
-311,18 +346,27 @@ void u8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *off } } -void s8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) +void s8_neon_scale_bilinear(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) { ARM_COMPUTE_UNUSED(dx, dy, offsets, constant_border_value); - if(border_mode == BorderMode::REPLICATE) + if (border_mode == BorderMode::REPLICATE) { using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<float, wrapper::traits::BitWidth::W128>; // Compute the ratio between source and destination dimensions - const float scale_x = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners); - const float scale_y = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); + const float scale_x = + scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners); + const float scale_y = + scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); const int in_stride_x = src->info()->strides_in_bytes()[1]; const int in_stride_y = src->info()->strides_in_bytes()[2]; @@ -356,12 +400,12 @@ void s8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *off const float fp_coord_offset_y = sampling_offset * (scale_y - 1); const float fp_coord_offset_x = sampling_offset * (scale_x - 1); - for(int bo = bo_start; bo < bo_end; bo += bo_step) + for (int bo = bo_start; bo < bo_end; bo += bo_step) { const int8_t *in_ptr = reinterpret_cast<int8_t *>(in.ptr() + bo * in_stride_b); int8_t *out_ptr = reinterpret_cast<int8_t *>(out.ptr() + bo * out_stride_b); - for(int yo = yo_start; yo < yo_end; yo += yo_step) + for (int yo = yo_start; yo < yo_end; yo += yo_step) { // Floating-point coordinate const float yi_f = yo * scale_y + fp_coord_offset_y; @@ -378,7 +422,7 @@ void s8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *off const int8_t *in_ptr_yi1 = in_ptr + yi1 * in_stride_y; int8_t *out_ptr_yo = out_ptr + yo * out_stride_y; - for(int xo = xo_start; xo < xo_end; xo += xo_step) + for (int xo = xo_start; xo < xo_end; xo += xo_step) { // Floating-point coordinate const float xi_f = xo * scale_x + fp_coord_offset_x; @@ -409,7 +453,7 @@ void s8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *off int8_t *out_ptr_xo_yo = out_ptr_yo + xo * out_stride_x; int cout = 0; - for(; cout <= (out_dim_ch - step_cout); cout += step_cout) + for (; cout <= (out_dim_ch - step_cout); cout += step_cout) { const auto in00 = wrapper::vloadq(in_ptr_xi0_yi0 + cout * sizeof(int8_t)); const auto in01 = wrapper::vloadq(in_ptr_xi1_yi0 + cout * sizeof(int8_t)); @@ -479,14 +523,16 @@ void s8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *off const auto out_2_int = wrapper::vcvt<int32_t>(out_2); const auto out_3_int = wrapper::vcvt<int32_t>(out_3); #endif // defined(__aarch64__) && !defined(BARE_METAL) - const auto low_part = wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_0_int), wrapper::vqmovn(out_1_int))); - const auto high_part = 
wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_2_int), wrapper::vqmovn(out_3_int))); - const auto out = wrapper::vcombine(low_part, high_part); + const auto low_part = + wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_0_int), wrapper::vqmovn(out_1_int))); + const auto high_part = + wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_2_int), wrapper::vqmovn(out_3_int))); + const auto out = wrapper::vcombine(low_part, high_part); wrapper::vstore(out_ptr_xo_yo + cout * sizeof(int8_t), out); } - for(; cout < out_dim_ch; ++cout) + for (; cout < out_dim_ch; ++cout) { const int8_t in00 = *(in_ptr_xi0_yi0 + cout * sizeof(int8_t)); const int8_t in01 = *(in_ptr_xi1_yi0 + cout * sizeof(int8_t)); @@ -515,8 +561,12 @@ void s8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *off } } -void s16_neon_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets, - float sampling_offset, bool align_corners, const Window &window) +void s16_neon_scale_nearest(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + float sampling_offset, + bool align_corners, + const Window &window) { const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; @@ -536,33 +586,46 @@ void s16_neon_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *off const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; - execute_window_loop(win, [&](const Coordinates & id) - { - const int32_t offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; - const auto in_hi = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr)); - const int offset_row = in_hi * in_stride_wc; - int32_t x = window_start_x; - const int16_t *in_ptr = reinterpret_cast<const int16_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]); - - for(; x <= window_end_x - window_step_x; x += window_step_x) - { - wrapper::vstore(reinterpret_cast<int16_t *>(out.ptr()) + x, - wrapper::vloadq(in_ptr + offset + offset_row + x)); - } - for(; x < window_end_x; ++x) + execute_window_loop( + win, + [&](const Coordinates &id) { - *(reinterpret_cast<int16_t *>(out.ptr()) + x) = *(in_ptr + offset + offset_row + x); - } - }, - out); + const int32_t offset = + *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; + const auto in_hi = static_cast<int>( + align_corners ? 
utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) + : std::floor((id.z() + sampling_offset) * hr)); + const int offset_row = in_hi * in_stride_wc; + int32_t x = window_start_x; + const int16_t *in_ptr = reinterpret_cast<const int16_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]); + + for (; x <= window_end_x - window_step_x; x += window_step_x) + { + wrapper::vstore(reinterpret_cast<int16_t *>(out.ptr()) + x, + wrapper::vloadq(in_ptr + offset + offset_row + x)); + } + for (; x < window_end_x; ++x) + { + *(reinterpret_cast<int16_t *>(out.ptr()) + x) = *(in_ptr + offset + offset_row + x); + } + }, + out); } -void s16_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) +void s16_neon_scale_bilinear(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) { // Compute the ratio between source height and destination height - const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); + const auto hr = + scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); Iterator out(dst, window); const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; @@ -577,64 +640,93 @@ void s16_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *of win_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); Iterator in(src, win_in); - if(border_mode == BorderMode::CONSTANT) + if (border_mode == BorderMode::CONSTANT) { const int16_t const_border_value = static_cast<int16_t>(constant_border_value.get<int16_t>()); - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z()))); - const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); - const int16_t *in_ptr = reinterpret_cast<const int16_t *>(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc; - - const auto a00 = (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value; - const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) ? *(in_ptr + in_stride_c) : const_border_value; - const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_wc) : const_border_value; - const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) ? 
*(in_ptr + in_stride_c + in_stride_wc) : const_border_value; - - *reinterpret_cast<int16_t *>(out.ptr()) = static_cast<int16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); - }, - in, out); + execute_window_loop( + window, + [&](const Coordinates &id) + { + const auto offset = + *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z()))); + const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); + const int16_t *in_ptr = + reinterpret_cast<const int16_t *>(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc; + + const auto a00 = + (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value; + const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) + ? *(in_ptr + in_stride_c) + : const_border_value; + const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) + ? *(in_ptr + in_stride_wc) + : const_border_value; + const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) + ? *(in_ptr + in_stride_c + in_stride_wc) + : const_border_value; + + *reinterpret_cast<int16_t *>(out.ptr()) = + static_cast<int16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); + }, + in, out); } - else if(border_mode == BorderMode::REPLICATE) + else if (border_mode == BorderMode::REPLICATE) { - execute_window_loop(window, [&](const Coordinates & id) - { - const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z()))); - const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z()))); - const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); - - const auto clamped_w = utility::clamp<int>(offset, 0, in_dim_w - 1); - const auto clamped_w1 = utility::clamp<int>(offset + 1, 0, in_dim_w - 1); - const auto clamped_h = utility::clamp<int>(in_hi, 0, in_dim_h - 1); - const auto clamped_h1 = utility::clamp<int>(in_hi + 1, 0, in_dim_h - 1); - - const auto a00 = *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc); - const auto a01 = *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h * in_stride_wc); - const auto a10 = *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h1 * in_stride_wc); - const auto a11 = *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h1 * in_stride_wc); - - *reinterpret_cast<int16_t *>(out.ptr()) = static_cast<int16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); - }, - in, out); + execute_window_loop( + window, + [&](const Coordinates &id) + { + const auto offset = + *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z()))); + const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); + + const auto clamped_w = 
utility::clamp<int>(offset, 0, in_dim_w - 1); + const auto clamped_w1 = utility::clamp<int>(offset + 1, 0, in_dim_w - 1); + const auto clamped_h = utility::clamp<int>(in_hi, 0, in_dim_h - 1); + const auto clamped_h1 = utility::clamp<int>(in_hi + 1, 0, in_dim_h - 1); + + const auto a00 = + *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc); + const auto a01 = *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w1 * in_stride_c + + clamped_h * in_stride_wc); + const auto a10 = *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w * in_stride_c + + clamped_h1 * in_stride_wc); + const auto a11 = *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w1 * in_stride_c + + clamped_h1 * in_stride_wc); + + *reinterpret_cast<int16_t *>(out.ptr()) = + static_cast<int16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); + }, + in, out); } else { ARM_COMPUTE_ERROR("Not implemented"); } } -} +} // namespace namespace cpu { -void s8_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) +void s8_neon_scale(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + InterpolationPolicy policy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) { - if(policy == InterpolationPolicy::BILINEAR) + if (policy == InterpolationPolicy::BILINEAR) { - s8_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window); + s8_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, + align_corners, window); } else { @@ -642,32 +734,50 @@ void s8_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, con } } -void u8_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) +void u8_neon_scale(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + InterpolationPolicy policy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) { - if(policy == InterpolationPolicy::BILINEAR) + if (policy == InterpolationPolicy::BILINEAR) { - u8_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window); + u8_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, + align_corners, window); } - else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) + else if (policy == InterpolationPolicy::NEAREST_NEIGHBOR) { u8_neon_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); } } -void s16_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) +void s16_neon_scale(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + 
InterpolationPolicy policy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) { - if(policy == InterpolationPolicy::BILINEAR) + if (policy == InterpolationPolicy::BILINEAR) { - s16_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window); + s16_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, + align_corners, window); } - else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) + else if (policy == InterpolationPolicy::NEAREST_NEIGHBOR) { s16_neon_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); } } } // namespace cpu -} // namespace arm_compute
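Note: every constant-border branch in the kernels above follows the same pattern: fetch the four neighbours a00/a01/a10/a11 (substituting the border value whenever an index falls outside the input) and blend them with scale_helpers::delta_bilinear using the fractional offsets dx/dy. A minimal scalar sketch of that blend, assuming delta_bilinear performs the standard bilinear weighting (an assumption; the exact helper is defined elsewhere in the library):

    // Sketch only: standard bilinear blend of four neighbours, with dx, dy in [0, 1).
    // a01 is the x+1 neighbour, a10 the y+1 neighbour, a11 the diagonal one.
    template <typename T>
    float delta_bilinear_sketch(T a00, T a01, T a10, T a11, float dx, float dy)
    {
        const float w00 = (1.f - dx) * (1.f - dy);
        const float w01 = dx * (1.f - dy);
        const float w10 = (1.f - dx) * dy;
        const float w11 = dx * dy;
        return a00 * w00 + a01 * w01 + a10 * w10 + a11 * w11;
    }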
\ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/kernels/scale/neon/list.h b/src/cpu/kernels/scale/neon/list.h index 28a1087224..0fe87d15a6 100644 --- a/src/cpu/kernels/scale/neon/list.h +++ b/src/cpu/kernels/scale/neon/list.h @@ -26,6 +26,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Window.h" + #include "src/core/NEON/wrapper/wrapper.h" #include "src/core/utils/ScaleUtils.h" #include "support/Rounding.h" @@ -34,10 +35,10 @@ namespace arm_compute { namespace cpu { -#define DECLARE_SCALE_KERNEL(func_name) \ - void func_name(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, \ - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, \ - bool align_corners, const Window &window) +#define DECLARE_SCALE_KERNEL(func_name) \ + void func_name(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, \ + InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, \ + float sampling_offset, bool align_corners, const Window &window) DECLARE_SCALE_KERNEL(s16_neon_scale); DECLARE_SCALE_KERNEL(u8_neon_scale); @@ -48,14 +49,20 @@ DECLARE_SCALE_KERNEL(qasymm8_signed_neon_scale); #undef DECLARE_SCALE_KERNEL template <typename T> -void nearest_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, float sampling_offset, - bool align_corners, const Window &window) +void nearest_neon_scale(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + float sampling_offset, + bool align_corners, + const Window &window) { ARM_COMPUTE_UNUSED(offsets); // Compute the ratio between source and destination dimensions - const float scale_x = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners); - const float scale_y = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); + const float scale_x = + scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners); + const float scale_y = + scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); const int in_stride_y = src->info()->strides_in_bytes()[1]; const int in_stride_z = src->info()->strides_in_bytes()[2]; @@ -84,17 +91,17 @@ void nearest_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets const int bo_end = window_execution[3].end(); const int bo_step = window_execution[3].step(); - for(int bo = bo_start; bo < bo_end; bo += bo_step) + for (int bo = bo_start; bo < bo_end; bo += bo_step) { const uint8_t *in_ptr_base = in.ptr() + bo * in_stride_w; uint8_t *out_ptr_base = out.ptr() + bo * out_stride_w; - for(int yo = yo_start; yo < yo_end; yo += yo_step) + for (int yo = yo_start; yo < yo_end; yo += yo_step) { // Floating-point coordinate float yi_f = ((yo + sampling_offset) * scale_y); int yi = 0; - if(align_corners) + if (align_corners) { yi = utils::rounding::round_half_away_from_zero(yi_f); } @@ -103,12 +110,12 @@ void nearest_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets yi = static_cast<int>(std::floor(yi_f)); } - for(int xo = xo_start; xo < xo_end; xo += xo_step) + for (int xo = xo_start; xo < xo_end; xo += xo_step) { // Floating-point coordinate float xi_f = ((xo + sampling_offset) * scale_x); int xi = 0; - if(align_corners) + if (align_corners) { xi = utils::rounding::round_half_away_from_zero(xi_f); } 
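Note: the nearest-neighbour kernels above all derive the source coordinate the same way: scale the output index (plus sampling_offset) by the resize ratio, then either round half away from zero (align_corners) or floor. A scalar sketch, assuming calculate_resize_ratio returns in_dim / out_dim, or (in_dim - 1) / (out_dim - 1) when align_corners is set (an assumption based on how it is used here):

    #include <cmath>

    // Sketch only: map an output index to the source index sampled by the
    // nearest-neighbour kernels above.
    inline int nearest_source_index(int out_idx, float scale, float sampling_offset, bool align_corners)
    {
        const float in_f = (out_idx + sampling_offset) * scale;
        // std::lround rounds halfway cases away from zero, matching round_half_away_from_zero.
        return align_corners ? static_cast<int>(std::lround(in_f)) : static_cast<int>(std::floor(in_f));
    }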
@@ -121,15 +128,15 @@ void nearest_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets uint8_t *out_ptr = out_ptr_base + xo * out_stride_y + yo * out_stride_z; int cout = 0; - for(; cout <= (out_dim_ch - step_cout); cout += step_cout) + for (; cout <= (out_dim_ch - step_cout); cout += step_cout) { auto out0 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T))); wrapper::vstore(reinterpret_cast<T *>(out_ptr + cout * sizeof(T)), out0); } - for(; cout < out_dim_ch; ++cout) + for (; cout < out_dim_ch; ++cout) { - auto out0 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T))); + auto out0 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T))); *(reinterpret_cast<T *>(out_ptr + cout * sizeof(T))) = out0; } } @@ -138,9 +145,16 @@ void nearest_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets } template <typename T> -void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) +void bilinear_neon_scale(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) { ARM_COMPUTE_UNUSED(offsets); ARM_COMPUTE_UNUSED(dx); @@ -148,8 +162,10 @@ void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offset using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>; // Compute the ratio between source and destination dimensions - const float scale_x = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners); - const float scale_y = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); + const float scale_x = + scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners); + const float scale_y = + scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); const int in_stride_y = src->info()->strides_in_bytes()[1]; const int in_stride_z = src->info()->strides_in_bytes()[2]; @@ -180,7 +196,7 @@ void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offset const int bo_end = window_execution[3].end(); const int bo_step = window_execution[3].step(); - if(border_mode == BorderMode::CONSTANT) + if (border_mode == BorderMode::CONSTANT) { #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC using ConstType = typename std::conditional<std::is_same<T, float16_t>::value, half, T>::type; @@ -189,12 +205,12 @@ void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offset #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ const T const_border_value = static_cast<T>(constant_border_value.get<ConstType>()); - for(int bo = bo_start; bo < bo_end; bo += bo_step) + for (int bo = bo_start; bo < bo_end; bo += bo_step) { const uint8_t *in_ptr_base = in.ptr() + bo * in_stride_w; uint8_t *out_ptr_base = out.ptr() + bo * out_stride_w; - for(int yo = yo_start; yo < yo_end; yo += yo_step) + for (int yo = yo_start; yo < yo_end; yo += yo_step) { // Floating-point coordinate const float yi_f = ((yo + sampling_offset) * scale_y - sampling_offset); @@ -204,7 +220,7 @@ void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor 
*offset const auto a1 = (yi_f - static_cast<float>(yi)); const auto b1 = (1.f - a1); - for(int xo = xo_start; xo < xo_end; xo += xo_step) + for (int xo = xo_start; xo < xo_end; xo += xo_step) { // Floating-point coordinate const float xi_f = ((xo + sampling_offset) * scale_x - sampling_offset); @@ -223,32 +239,35 @@ void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offset uint8_t *out_ptr = out_ptr_base + xo * out_stride_y + yo * out_stride_z; int cout = 0; - for(; cout <= (out_dim_ch - step_cout); cout += step_cout) + for (; cout <= (out_dim_ch - step_cout); cout += step_cout) { auto in00 = wrapper::vdup_n(static_cast<T>(const_border_value), ExactTagType{}); auto in01 = wrapper::vdup_n(static_cast<T>(const_border_value), ExactTagType{}); auto in10 = wrapper::vdup_n(static_cast<T>(const_border_value), ExactTagType{}); auto in11 = wrapper::vdup_n(static_cast<T>(const_border_value), ExactTagType{}); - if((yi >= 0) && (yi < in_dim_h)) + if ((yi >= 0) && (yi < in_dim_h)) { - if((xi >= 0) && (xi < in_dim_w)) + if ((xi >= 0) && (xi < in_dim_w)) { in00 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T))); } - if(((xi + 1) >= 0) && ((xi + 1) < in_dim_w)) + if (((xi + 1) >= 0) && ((xi + 1) < in_dim_w)) { - in01 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_y)); + in01 = wrapper::vloadq( + reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_y)); } } - if(((yi + 1) >= 0) && ((yi + 1) < in_dim_h)) + if (((yi + 1) >= 0) && ((yi + 1) < in_dim_h)) { - if((xi >= 0) && (xi < in_dim_w)) + if ((xi >= 0) && (xi < in_dim_w)) { - in10 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_z)); + in10 = wrapper::vloadq( + reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_z)); } - if(((xi + 1) >= 0) && ((xi + 1) < in_dim_w)) + if (((xi + 1) >= 0) && ((xi + 1) < in_dim_w)) { - in11 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_y + in_stride_z)); + in11 = wrapper::vloadq( + reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_y + in_stride_z)); } } @@ -264,32 +283,33 @@ void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offset wrapper::vstore(reinterpret_cast<T *>(out_ptr + cout * sizeof(T)), out0); } - for(; cout < out_dim_ch; ++cout) + for (; cout < out_dim_ch; ++cout) { auto in00 = static_cast<T>(const_border_value); auto in01 = static_cast<T>(const_border_value); auto in10 = static_cast<T>(const_border_value); auto in11 = static_cast<T>(const_border_value); - if((yi >= 0) && (yi < in_dim_h)) + if ((yi >= 0) && (yi < in_dim_h)) { - if((xi >= 0) && (xi < in_dim_w)) + if ((xi >= 0) && (xi < in_dim_w)) { in00 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T))); } - if(((xi + 1) >= 0) && ((xi + 1) < in_dim_w)) + if (((xi + 1) >= 0) && ((xi + 1) < in_dim_w)) { in01 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_y)); } } - if(((yi + 1) >= 0) && ((yi + 1) < in_dim_h)) + if (((yi + 1) >= 0) && ((yi + 1) < in_dim_h)) { - if((xi >= 0) && (xi < in_dim_w)) + if ((xi >= 0) && (xi < in_dim_w)) { in10 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_z)); } - if(((xi + 1) >= 0) && ((xi + 1) < in_dim_w)) + if (((xi + 1) >= 0) && ((xi + 1) < in_dim_w)) { - in11 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_y + in_stride_z)); + in11 = *( + reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_y + in_stride_z)); } } auto out0 = 
static_cast<T>(0); @@ -303,14 +323,14 @@ void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offset } } } - else if(border_mode == BorderMode::REPLICATE) + else if (border_mode == BorderMode::REPLICATE) { - for(int bo = bo_start; bo < bo_end; bo += bo_step) + for (int bo = bo_start; bo < bo_end; bo += bo_step) { const uint8_t *in_ptr = in.ptr() + bo * in_stride_w; uint8_t *out_ptr = out.ptr() + bo * out_stride_w; - for(int yo = yo_start; yo < yo_end; yo += yo_step) + for (int yo = yo_start; yo < yo_end; yo += yo_step) { // Floating-point coordinate const float yi_f = ((yo + sampling_offset) * scale_y - sampling_offset); @@ -327,7 +347,7 @@ void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offset const int yi1_offset = yi1 * in_stride_z; const int y_offset = yo * out_stride_z; - for(int xo = xo_start; xo < xo_end; xo += xo_step) + for (int xo = xo_start; xo < xo_end; xo += xo_step) { // Floating-point coordinate const float xi_f = ((xo + sampling_offset) * scale_x - sampling_offset); @@ -356,12 +376,16 @@ void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offset const int offset = xo * out_stride_y + y_offset; int cout = 0; - for(; cout <= (out_dim_ch - step_cout); cout += step_cout) + for (; cout <= (out_dim_ch - step_cout); cout += step_cout) { - const auto in00 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi0_offset + yi0_offset)); - const auto in01 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi1_offset + yi0_offset)); - const auto in10 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi0_offset + yi1_offset)); - const auto in11 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi1_offset + yi1_offset)); + const auto in00 = wrapper::vloadq( + reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi0_offset + yi0_offset)); + const auto in01 = wrapper::vloadq( + reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi1_offset + yi0_offset)); + const auto in10 = wrapper::vloadq( + reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi0_offset + yi1_offset)); + const auto in11 = wrapper::vloadq( + reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi1_offset + yi1_offset)); auto out0 = wrapper::vmul(in00, s00); out0 = wrapper::vmla(out0, in01, s01); @@ -370,12 +394,16 @@ void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offset wrapper::vstore(reinterpret_cast<T *>(out_ptr + offset + cout * sizeof(T)), out0); } - for(; cout < out_dim_ch; ++cout) + for (; cout < out_dim_ch; ++cout) { - const T in00 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi0_offset + yi0_offset)); - const T in01 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi1_offset + yi0_offset)); - const T in10 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi0_offset + yi1_offset)); - const T in11 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi1_offset + yi1_offset)); + const T in00 = + *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi0_offset + yi0_offset)); + const T in01 = + *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi1_offset + yi0_offset)); + const T in10 = + *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi0_offset + yi1_offset)); + const T in11 = + *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi1_offset + yi1_offset)); T out0 = in00 * s00_s; out0 += in01 * s01_s; @@ -394,15 +422,24 @@ void 
bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offset } template <typename T> -void common_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) +void common_neon_scale(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + InterpolationPolicy policy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) { - if(policy == InterpolationPolicy::BILINEAR) + if (policy == InterpolationPolicy::BILINEAR) { - bilinear_neon_scale<T>(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window); + bilinear_neon_scale<T>(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, + align_corners, window); } - else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) + else if (policy == InterpolationPolicy::NEAREST_NEIGHBOR) { nearest_neon_scale<T>(src, dst, offsets, sampling_offset, align_corners, window); } diff --git a/src/cpu/kernels/scale/neon/qasymm8.cpp b/src/cpu/kernels/scale/neon/qasymm8.cpp index 778459ae39..62a821daa5 100644 --- a/src/cpu/kernels/scale/neon/qasymm8.cpp +++ b/src/cpu/kernels/scale/neon/qasymm8.cpp @@ -28,9 +28,16 @@ namespace arm_compute { namespace { -void qasymm8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) +void qasymm8_neon_scale_bilinear(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) { // Data layout is NHWC const int32_t input_width = src->info()->dimension(1); @@ -40,10 +47,12 @@ void qasymm8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); // Compute the ratio between source and destination dimensions - const float scale_x = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners); - const float scale_y = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); + const float scale_x = + scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners); + const float scale_y = + scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); - if(border_mode == BorderMode::CONSTANT) + if (border_mode == BorderMode::CONSTANT) { const int32_t in_stride_y = src->info()->strides_in_bytes()[1]; const int32_t in_stride_z = src->info()->strides_in_bytes()[2]; @@ -59,7 +68,7 @@ void qasymm8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor win_in.set(1, Window::Dimension(0, 0, 0)); win_in.set(2, Window::Dimension(0, 0, 0)); - for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d) + for (size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d) { win_off.set(d, Window::Dimension(0, 0, 0)); } @@ -68,36 +77,41 @@ void qasymm8_neon_scale_bilinear(const ITensor 
*src, ITensor *dst, const ITensor Iterator out(dst, window); const uint8_t const_border_value = static_cast<uint8_t>(constant_border_value.get<uint8_t>()); - execute_window_loop(window, [&](const Coordinates & id) - { - const int32_t index_h = std::floor((id[2] + sampling_offset) * scale_y - sampling_offset); - const int32_t index_w = *(reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id[1], id[2])))); - const auto dx_val = *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[1], id[2])))); - const auto dy_val = *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[1], id[2])))); - const auto pixel_row_ptr = reinterpret_cast<const uint8_t *>(in.ptr()); - - const auto a00 = (0 <= index_w && index_w < input_width && 0 <= index_h && index_h < input_height) ? - (*(pixel_row_ptr + index_w * in_stride_y + index_h * in_stride_z)) : - const_border_value; - const auto a01 = (-1 <= index_w && index_w + 1 < input_width && 0 <= index_h && index_h < input_height) ? - (*(pixel_row_ptr + (index_w + 1) * in_stride_y + index_h * in_stride_z)) : - const_border_value; - const auto a10 = (0 <= index_w && index_w < input_width && -1 <= index_h && index_h < input_height - 1) ? - (*(pixel_row_ptr + index_w * in_stride_y + (index_h + 1) * in_stride_z)) : - const_border_value; - const auto a11 = (-1 <= index_w && index_w < input_width - 1 && -1 <= index_h && index_h < input_height - 1) ? - (*(pixel_row_ptr + (index_w + 1) * in_stride_y + (index_h + 1) * in_stride_z)) : - const_border_value; - - const float inp00 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a00, iq_info); - const float inp01 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a01, iq_info); - const float inp10 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a10, iq_info); - const float inp11 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a11, iq_info); - *reinterpret_cast<uint8_t *>(out.ptr()) = Qasymm8QuantizationHelper<uint8_t>::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info); - }, - in, out); + execute_window_loop( + window, + [&](const Coordinates &id) + { + const int32_t index_h = std::floor((id[2] + sampling_offset) * scale_y - sampling_offset); + const int32_t index_w = + *(reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id[1], id[2])))); + const auto dx_val = *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[1], id[2])))); + const auto dy_val = *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[1], id[2])))); + const auto pixel_row_ptr = reinterpret_cast<const uint8_t *>(in.ptr()); + + const auto a00 = (0 <= index_w && index_w < input_width && 0 <= index_h && index_h < input_height) + ? (*(pixel_row_ptr + index_w * in_stride_y + index_h * in_stride_z)) + : const_border_value; + const auto a01 = (-1 <= index_w && index_w + 1 < input_width && 0 <= index_h && index_h < input_height) + ? (*(pixel_row_ptr + (index_w + 1) * in_stride_y + index_h * in_stride_z)) + : const_border_value; + const auto a10 = (0 <= index_w && index_w < input_width && -1 <= index_h && index_h < input_height - 1) + ? (*(pixel_row_ptr + index_w * in_stride_y + (index_h + 1) * in_stride_z)) + : const_border_value; + const auto a11 = + (-1 <= index_w && index_w < input_width - 1 && -1 <= index_h && index_h < input_height - 1) + ? 
(*(pixel_row_ptr + (index_w + 1) * in_stride_y + (index_h + 1) * in_stride_z)) + : const_border_value; + + const float inp00 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a00, iq_info); + const float inp01 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a01, iq_info); + const float inp10 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a10, iq_info); + const float inp11 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a11, iq_info); + *reinterpret_cast<uint8_t *>(out.ptr()) = Qasymm8QuantizationHelper<uint8_t>::quantize( + scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info); + }, + in, out); } - else if(border_mode == BorderMode::REPLICATE) + else if (border_mode == BorderMode::REPLICATE) { using FloatTagType = typename wrapper::traits::neon_bitvector_tag_t<float, wrapper::traits::BitWidth::W128>; using Int32TagType = typename wrapper::traits::neon_bitvector_tag_t<int32_t, wrapper::traits::BitWidth::W128>; @@ -141,12 +155,12 @@ void qasymm8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor const float fp_coord_offset_y = sampling_offset * (scale_y - 1); const float fp_coord_offset_x = sampling_offset * (scale_x - 1); - for(int bo = bo_start; bo < bo_end; bo += bo_step) + for (int bo = bo_start; bo < bo_end; bo += bo_step) { const uint8_t *in_ptr = in.ptr() + bo * in_stride_b; uint8_t *out_ptr = out.ptr() + bo * out_stride_b; - for(int yo = yo_start; yo < yo_end; yo += yo_step) + for (int yo = yo_start; yo < yo_end; yo += yo_step) { // Floating-point coordinate const float yi_f = yo * scale_y + fp_coord_offset_y; @@ -163,7 +177,7 @@ void qasymm8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor const uint8_t *in_ptr_yi1 = in_ptr + yi1 * in_stride_y; uint8_t *out_ptr_yo = out_ptr + yo * out_stride_y; - for(int xo = xo_start; xo < xo_end; xo += xo_step) + for (int xo = xo_start; xo < xo_end; xo += xo_step) { // Floating-point coordinate const float xi_f = xo * scale_x + fp_coord_offset_x; @@ -194,7 +208,7 @@ void qasymm8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor uint8_t *out_ptr_xo_yo = out_ptr_yo + xo * out_stride_x; int cout = 0; - for(; cout <= (out_dim_ch - step_cout); cout += step_cout) + for (; cout <= (out_dim_ch - step_cout); cout += step_cout) { const auto in00 = wrapper::vloadq(in_ptr_xi0_yi0 + cout * sizeof(uint8_t)); const auto in01 = wrapper::vloadq(in_ptr_xi1_yi0 + cout * sizeof(uint8_t)); @@ -204,34 +218,82 @@ void qasymm8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor const uint16x8_t in00_low = wrapper::vmovl(wrapper::vgetlow(in00)); const uint16x8_t in00_high = wrapper::vmovl(wrapper::vgethigh(in00)); - const auto in00_0 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in00_low))), voffset_in)), vscale_in); - const auto in00_1 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in00_low))), voffset_in)), vscale_in); - const auto in00_2 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in00_high))), voffset_in)), vscale_in); - const auto in00_3 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in00_high))), voffset_in)), vscale_in); + const auto in00_0 = wrapper::vmul( + wrapper::vcvt<float>(wrapper::vsub( + wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in00_low))), voffset_in)), + vscale_in); + const auto in00_1 = wrapper::vmul( + 
wrapper::vcvt<float>(wrapper::vsub( + wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in00_low))), voffset_in)), + vscale_in); + const auto in00_2 = wrapper::vmul( + wrapper::vcvt<float>(wrapper::vsub( + wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in00_high))), voffset_in)), + vscale_in); + const auto in00_3 = wrapper::vmul( + wrapper::vcvt<float>(wrapper::vsub( + wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in00_high))), voffset_in)), + vscale_in); const uint16x8_t in01_low = wrapper::vmovl(wrapper::vgetlow(in01)); const uint16x8_t in01_high = wrapper::vmovl(wrapper::vgethigh(in01)); - const auto in01_0 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in01_low))), voffset_in)), vscale_in); - const auto in01_1 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in01_low))), voffset_in)), vscale_in); - const auto in01_2 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in01_high))), voffset_in)), vscale_in); - const auto in01_3 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in01_high))), voffset_in)), vscale_in); + const auto in01_0 = wrapper::vmul( + wrapper::vcvt<float>(wrapper::vsub( + wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in01_low))), voffset_in)), + vscale_in); + const auto in01_1 = wrapper::vmul( + wrapper::vcvt<float>(wrapper::vsub( + wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in01_low))), voffset_in)), + vscale_in); + const auto in01_2 = wrapper::vmul( + wrapper::vcvt<float>(wrapper::vsub( + wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in01_high))), voffset_in)), + vscale_in); + const auto in01_3 = wrapper::vmul( + wrapper::vcvt<float>(wrapper::vsub( + wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in01_high))), voffset_in)), + vscale_in); const uint16x8_t in10_low = wrapper::vmovl(wrapper::vgetlow(in10)); const uint16x8_t in10_high = wrapper::vmovl(wrapper::vgethigh(in10)); - const auto in10_0 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in10_low))), voffset_in)), vscale_in); - const auto in10_1 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in10_low))), voffset_in)), vscale_in); - const auto in10_2 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in10_high))), voffset_in)), vscale_in); - const auto in10_3 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in10_high))), voffset_in)), vscale_in); + const auto in10_0 = wrapper::vmul( + wrapper::vcvt<float>(wrapper::vsub( + wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in10_low))), voffset_in)), + vscale_in); + const auto in10_1 = wrapper::vmul( + wrapper::vcvt<float>(wrapper::vsub( + wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in10_low))), voffset_in)), + vscale_in); + const auto in10_2 = wrapper::vmul( + wrapper::vcvt<float>(wrapper::vsub( + wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in10_high))), voffset_in)), + vscale_in); + const auto in10_3 = wrapper::vmul( + wrapper::vcvt<float>(wrapper::vsub( + wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in10_high))), voffset_in)), + vscale_in); const uint16x8_t in11_low = wrapper::vmovl(wrapper::vgetlow(in11)); const uint16x8_t 
in11_high = wrapper::vmovl(wrapper::vgethigh(in11)); - const auto in11_0 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in11_low))), voffset_in)), vscale_in); - const auto in11_1 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in11_low))), voffset_in)), vscale_in); - const auto in11_2 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in11_high))), voffset_in)), vscale_in); - const auto in11_3 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in11_high))), voffset_in)), vscale_in); + const auto in11_0 = wrapper::vmul( + wrapper::vcvt<float>(wrapper::vsub( + wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in11_low))), voffset_in)), + vscale_in); + const auto in11_1 = wrapper::vmul( + wrapper::vcvt<float>(wrapper::vsub( + wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in11_low))), voffset_in)), + vscale_in); + const auto in11_2 = wrapper::vmul( + wrapper::vcvt<float>(wrapper::vsub( + wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in11_high))), voffset_in)), + vscale_in); + const auto in11_3 = wrapper::vmul( + wrapper::vcvt<float>(wrapper::vsub( + wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in11_high))), voffset_in)), + vscale_in); auto out_0 = wrapper::vmul(in00_0, s00); out_0 = wrapper::vmla(out_0, in01_0, s01); @@ -264,14 +326,16 @@ void qasymm8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor const auto out_2_int = wrapper::vcvt<uint32_t>(wrapper::vmla(voffset_o, out_2, invvscale_o)); const auto out_3_int = wrapper::vcvt<uint32_t>(wrapper::vmla(voffset_o, out_3, invvscale_o)); #endif // defined(__aarch64__) && !defined(BARE_METAL) - const auto low_part = wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_0_int), wrapper::vqmovn(out_1_int))); - const auto high_part = wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_2_int), wrapper::vqmovn(out_3_int))); - const auto out = wrapper::vcombine(low_part, high_part); + const auto low_part = + wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_0_int), wrapper::vqmovn(out_1_int))); + const auto high_part = + wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_2_int), wrapper::vqmovn(out_3_int))); + const auto out = wrapper::vcombine(low_part, high_part); wrapper::vstore(out_ptr_xo_yo + cout * sizeof(uint8_t), out); } - for(; cout < out_dim_ch; ++cout) + for (; cout < out_dim_ch; ++cout) { const uint8_t in00 = *(in_ptr_xi0_yi0 + cout * sizeof(uint8_t)); const uint8_t in01 = *(in_ptr_xi1_yi0 + cout * sizeof(uint8_t)); @@ -292,7 +356,8 @@ void qasymm8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor #if defined(__aarch64__) && !defined(BARE_METAL) *(out_ptr_xo_yo + cout * sizeof(uint8_t)) = quantize_qasymm8(out, oq_info); #else // defined(__aarch64__) && !defined(BARE_METAL) - *(out_ptr_xo_yo + cout * sizeof(uint8_t)) = quantize_qasymm8(out, oq_info, RoundingPolicy::TO_ZERO); + *(out_ptr_xo_yo + cout * sizeof(uint8_t)) = + quantize_qasymm8(out, oq_info, RoundingPolicy::TO_ZERO); #endif // defined(__aarch64__) && !defined(BARE_METAL) } } @@ -304,28 +369,38 @@ void qasymm8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor ARM_COMPUTE_ERROR("Not implemented"); } } -} +} // namespace namespace cpu { -void qasymm8_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - 
InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) +void qasymm8_neon_scale(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + InterpolationPolicy policy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) { - if(policy == InterpolationPolicy::BILINEAR) + if (policy == InterpolationPolicy::BILINEAR) { - if(src->info()->quantization_info() == dst->info()->quantization_info()) + if (src->info()->quantization_info() == dst->info()->quantization_info()) { - u8_neon_scale(src, dst, offsets, dx, dy, policy, border_mode, constant_border_value, sampling_offset, align_corners, window); + u8_neon_scale(src, dst, offsets, dx, dy, policy, border_mode, constant_border_value, sampling_offset, + align_corners, window); } else { - qasymm8_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window); + qasymm8_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, + align_corners, window); } } - else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) + else if (policy == InterpolationPolicy::NEAREST_NEIGHBOR) { nearest_neon_scale<uint8_t>(src, dst, offsets, sampling_offset, align_corners, window); } } } // namespace cpu -} // namespace arm_compute
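Note: the QASYMM8 bilinear path above only differs from the plain u8 path when the input and output quantization infos differ: each neighbour is dequantized, blended, and the result requantized. A scalar sketch, assuming the usual asymmetric mapping real = (q - offset) * scale; the exact rounding and saturation of the library helpers is not reproduced:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Sketch only: constant-border QASYMM8 bilinear sample with differing
    // input/output quantization, mirroring the dequantize -> blend -> quantize
    // sequence in qasymm8_neon_scale_bilinear above.
    inline uint8_t qasymm8_bilinear_sketch(uint8_t a00, uint8_t a01, uint8_t a10, uint8_t a11,
                                           float dx, float dy,
                                           float in_scale, int32_t in_offset,
                                           float out_scale, int32_t out_offset)
    {
        const auto deq = [&](uint8_t q) { return (static_cast<int32_t>(q) - in_offset) * in_scale; };
        const float v = deq(a00) * (1.f - dx) * (1.f - dy) + deq(a01) * dx * (1.f - dy) +
                        deq(a10) * (1.f - dx) * dy + deq(a11) * dx * dy;
        const int32_t q = static_cast<int32_t>(std::lround(v / out_scale)) + out_offset;
        return static_cast<uint8_t>(std::min<int32_t>(255, std::max<int32_t>(0, q)));
    }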
\ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/kernels/scale/neon/qasymm8_signed.cpp b/src/cpu/kernels/scale/neon/qasymm8_signed.cpp index cd63dfba63..5a885178a7 100644 --- a/src/cpu/kernels/scale/neon/qasymm8_signed.cpp +++ b/src/cpu/kernels/scale/neon/qasymm8_signed.cpp @@ -28,9 +28,16 @@ namespace arm_compute { namespace { -void qasymm8_signed_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) +void qasymm8_signed_neon_scale_bilinear(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) { // Data layout is NHWC const UniformQuantizationInfo iq_info = src->info()->quantization_info().uniform(); @@ -40,10 +47,12 @@ void qasymm8_signed_neon_scale_bilinear(const ITensor *src, ITensor *dst, const const int32_t input_height = src->info()->dimension(2); // Compute the ratio between source and destination dimensions - const float scale_x = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners); - const float scale_y = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); + const float scale_x = + scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners); + const float scale_y = + scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); - if(border_mode == BorderMode::CONSTANT) + if (border_mode == BorderMode::CONSTANT) { const int32_t in_stride_y = src->info()->strides_in_bytes()[1]; const int32_t in_stride_z = src->info()->strides_in_bytes()[2]; @@ -58,7 +67,7 @@ void qasymm8_signed_neon_scale_bilinear(const ITensor *src, ITensor *dst, const win_in.set(1, Window::Dimension(0, 0, 0)); win_in.set(2, Window::Dimension(0, 0, 0)); - for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d) + for (size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d) { win_off.set(d, Window::Dimension(0, 0, 0)); } @@ -67,36 +76,41 @@ void qasymm8_signed_neon_scale_bilinear(const ITensor *src, ITensor *dst, const Iterator out(dst, window); const int8_t const_border_value = static_cast<int8_t>(constant_border_value.get<int8_t>()); - execute_window_loop(window, [&](const Coordinates & id) - { - const int32_t index_h = std::floor((id[2] + sampling_offset) * scale_y - sampling_offset); - const int32_t index_w = *(reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id[1], id[2])))); - const auto dx_val = *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[1], id[2])))); - const auto dy_val = *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[1], id[2])))); - const auto pixel_row_ptr = reinterpret_cast<const int8_t *>(in.ptr()); - - const auto a00 = (0 <= index_w && index_w < input_width && 0 <= index_h && index_h < input_height) ? - (*(pixel_row_ptr + index_w * in_stride_y + index_h * in_stride_z)) : - const_border_value; - const auto a01 = (-1 <= index_w && index_w + 1 < input_width && 0 <= index_h && index_h < input_height) ? 
- (*(pixel_row_ptr + (index_w + 1) * in_stride_y + index_h * in_stride_z)) : - const_border_value; - const auto a10 = (0 <= index_w && index_w < input_width && -1 <= index_h && index_h < input_height - 1) ? - (*(pixel_row_ptr + index_w * in_stride_y + (index_h + 1) * in_stride_z)) : - const_border_value; - const auto a11 = (-1 <= index_w && index_w < input_width - 1 && -1 <= index_h && index_h < input_height - 1) ? - (*(pixel_row_ptr + (index_w + 1) * in_stride_y + (index_h + 1) * in_stride_z)) : - const_border_value; - - const float inp00 = Qasymm8QuantizationHelper<int8_t>::dequantize(a00, iq_info); - const float inp01 = Qasymm8QuantizationHelper<int8_t>::dequantize(a01, iq_info); - const float inp10 = Qasymm8QuantizationHelper<int8_t>::dequantize(a10, iq_info); - const float inp11 = Qasymm8QuantizationHelper<int8_t>::dequantize(a11, iq_info); - *reinterpret_cast<int8_t *>(out.ptr()) = Qasymm8QuantizationHelper<int8_t>::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info); - }, - in, out); + execute_window_loop( + window, + [&](const Coordinates &id) + { + const int32_t index_h = std::floor((id[2] + sampling_offset) * scale_y - sampling_offset); + const int32_t index_w = + *(reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id[1], id[2])))); + const auto dx_val = *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[1], id[2])))); + const auto dy_val = *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[1], id[2])))); + const auto pixel_row_ptr = reinterpret_cast<const int8_t *>(in.ptr()); + + const auto a00 = (0 <= index_w && index_w < input_width && 0 <= index_h && index_h < input_height) + ? (*(pixel_row_ptr + index_w * in_stride_y + index_h * in_stride_z)) + : const_border_value; + const auto a01 = (-1 <= index_w && index_w + 1 < input_width && 0 <= index_h && index_h < input_height) + ? (*(pixel_row_ptr + (index_w + 1) * in_stride_y + index_h * in_stride_z)) + : const_border_value; + const auto a10 = (0 <= index_w && index_w < input_width && -1 <= index_h && index_h < input_height - 1) + ? (*(pixel_row_ptr + index_w * in_stride_y + (index_h + 1) * in_stride_z)) + : const_border_value; + const auto a11 = + (-1 <= index_w && index_w < input_width - 1 && -1 <= index_h && index_h < input_height - 1) + ? 
(*(pixel_row_ptr + (index_w + 1) * in_stride_y + (index_h + 1) * in_stride_z)) + : const_border_value; + + const float inp00 = Qasymm8QuantizationHelper<int8_t>::dequantize(a00, iq_info); + const float inp01 = Qasymm8QuantizationHelper<int8_t>::dequantize(a01, iq_info); + const float inp10 = Qasymm8QuantizationHelper<int8_t>::dequantize(a10, iq_info); + const float inp11 = Qasymm8QuantizationHelper<int8_t>::dequantize(a11, iq_info); + *reinterpret_cast<int8_t *>(out.ptr()) = Qasymm8QuantizationHelper<int8_t>::quantize( + scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info); + }, + in, out); } - else if(border_mode == BorderMode::REPLICATE) + else if (border_mode == BorderMode::REPLICATE) { using FloatTagType = typename wrapper::traits::neon_bitvector_tag_t<float, wrapper::traits::BitWidth::W128>; using Int32TagType = typename wrapper::traits::neon_bitvector_tag_t<int32_t, wrapper::traits::BitWidth::W128>; @@ -140,12 +154,12 @@ void qasymm8_signed_neon_scale_bilinear(const ITensor *src, ITensor *dst, const const float32x4_t invvscale_o = wrapper::vdup_n(1.f / oq_info.scale, FloatTagType{}); const float32x4_t voffset_o = vdupq_n_f32(oq_info.offset); - for(int bo = bo_start; bo < bo_end; bo += bo_step) + for (int bo = bo_start; bo < bo_end; bo += bo_step) { const int8_t *in_ptr = reinterpret_cast<int8_t *>(in.ptr() + bo * in_stride_b); int8_t *out_ptr = reinterpret_cast<int8_t *>(out.ptr() + bo * out_stride_b); - for(int yo = yo_start; yo < yo_end; yo += yo_step) + for (int yo = yo_start; yo < yo_end; yo += yo_step) { // Floating-point coordinate const float yi_f = yo * scale_y + fp_coord_offset_y; @@ -162,7 +176,7 @@ void qasymm8_signed_neon_scale_bilinear(const ITensor *src, ITensor *dst, const const int8_t *in_ptr_yi1 = in_ptr + yi1 * in_stride_y; int8_t *out_ptr_yo = out_ptr + yo * out_stride_y; - for(int xo = xo_start; xo < xo_end; xo += xo_step) + for (int xo = xo_start; xo < xo_end; xo += xo_step) { // Floating-point coordinate const float xi_f = xo * scale_x + fp_coord_offset_x; @@ -193,7 +207,7 @@ void qasymm8_signed_neon_scale_bilinear(const ITensor *src, ITensor *dst, const int8_t *out_ptr_xo_yo = out_ptr_yo + xo * out_stride_x; int cout = 0; - for(; cout <= (out_dim_ch - step_cout); cout += step_cout) + for (; cout <= (out_dim_ch - step_cout); cout += step_cout) { const auto in00 = wrapper::vloadq(in_ptr_xi0_yi0 + cout * sizeof(int8_t)); const auto in01 = wrapper::vloadq(in_ptr_xi1_yi0 + cout * sizeof(int8_t)); @@ -203,34 +217,70 @@ void qasymm8_signed_neon_scale_bilinear(const ITensor *src, ITensor *dst, const const int16x8_t in00_low = wrapper::vmovl(wrapper::vgetlow(in00)); const int16x8_t in00_high = wrapper::vmovl(wrapper::vgethigh(in00)); - const auto in00_0 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in00_low)), voffset_in)), vscale_in); - const auto in00_1 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in00_low)), voffset_in)), vscale_in); - const auto in00_2 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in00_high)), voffset_in)), vscale_in); - const auto in00_3 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in00_high)), voffset_in)), vscale_in); + const auto in00_0 = wrapper::vmul( + wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in00_low)), voffset_in)), + vscale_in); + const auto in00_1 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub( + 
wrapper::vmovl(wrapper::vgethigh(in00_low)), voffset_in)), + vscale_in); + const auto in00_2 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub( + wrapper::vmovl(wrapper::vgetlow(in00_high)), voffset_in)), + vscale_in); + const auto in00_3 = + wrapper::vmul(wrapper::vcvt<float>( + wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in00_high)), voffset_in)), + vscale_in); const int16x8_t in01_low = wrapper::vmovl(wrapper::vgetlow(in01)); const int16x8_t in01_high = wrapper::vmovl(wrapper::vgethigh(in01)); - const auto in01_0 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in01_low)), voffset_in)), vscale_in); - const auto in01_1 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in01_low)), voffset_in)), vscale_in); - const auto in01_2 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in01_high)), voffset_in)), vscale_in); - const auto in01_3 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in01_high)), voffset_in)), vscale_in); + const auto in01_0 = wrapper::vmul( + wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in01_low)), voffset_in)), + vscale_in); + const auto in01_1 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub( + wrapper::vmovl(wrapper::vgethigh(in01_low)), voffset_in)), + vscale_in); + const auto in01_2 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub( + wrapper::vmovl(wrapper::vgetlow(in01_high)), voffset_in)), + vscale_in); + const auto in01_3 = + wrapper::vmul(wrapper::vcvt<float>( + wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in01_high)), voffset_in)), + vscale_in); const int16x8_t in10_low = wrapper::vmovl(wrapper::vgetlow(in10)); const int16x8_t in10_high = wrapper::vmovl(wrapper::vgethigh(in10)); - const auto in10_0 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in10_low)), voffset_in)), vscale_in); - const auto in10_1 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in10_low)), voffset_in)), vscale_in); - const auto in10_2 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in10_high)), voffset_in)), vscale_in); - const auto in10_3 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in10_high)), voffset_in)), vscale_in); + const auto in10_0 = wrapper::vmul( + wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in10_low)), voffset_in)), + vscale_in); + const auto in10_1 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub( + wrapper::vmovl(wrapper::vgethigh(in10_low)), voffset_in)), + vscale_in); + const auto in10_2 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub( + wrapper::vmovl(wrapper::vgetlow(in10_high)), voffset_in)), + vscale_in); + const auto in10_3 = + wrapper::vmul(wrapper::vcvt<float>( + wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in10_high)), voffset_in)), + vscale_in); const int16x8_t in11_low = wrapper::vmovl(wrapper::vgetlow(in11)); const int16x8_t in11_high = wrapper::vmovl(wrapper::vgethigh(in11)); - const auto in11_0 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in11_low)), voffset_in)), vscale_in); - const auto in11_1 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in11_low)), voffset_in)), vscale_in); - const auto in11_2 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in11_high)), voffset_in)), vscale_in); - const auto in11_3 = 
wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in11_high)), voffset_in)), vscale_in); + const auto in11_0 = wrapper::vmul( + wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in11_low)), voffset_in)), + vscale_in); + const auto in11_1 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub( + wrapper::vmovl(wrapper::vgethigh(in11_low)), voffset_in)), + vscale_in); + const auto in11_2 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub( + wrapper::vmovl(wrapper::vgetlow(in11_high)), voffset_in)), + vscale_in); + const auto in11_3 = + wrapper::vmul(wrapper::vcvt<float>( + wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in11_high)), voffset_in)), + vscale_in); auto out_0 = wrapper::vmul(in00_0, s00); out_0 = wrapper::vmla(out_0, in01_0, s01); @@ -263,14 +313,16 @@ void qasymm8_signed_neon_scale_bilinear(const ITensor *src, ITensor *dst, const const auto out_2_int = wrapper::vcvt<int32_t>(wrapper::vmla(voffset_o, out_2, invvscale_o)); const auto out_3_int = wrapper::vcvt<int32_t>(wrapper::vmla(voffset_o, out_3, invvscale_o)); #endif // defined(__aarch64__) && !defined(BARE_METAL) - const auto low_part = wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_0_int), wrapper::vqmovn(out_1_int))); - const auto high_part = wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_2_int), wrapper::vqmovn(out_3_int))); - const auto out = wrapper::vcombine(low_part, high_part); + const auto low_part = + wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_0_int), wrapper::vqmovn(out_1_int))); + const auto high_part = + wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_2_int), wrapper::vqmovn(out_3_int))); + const auto out = wrapper::vcombine(low_part, high_part); wrapper::vstore(out_ptr_xo_yo + cout * sizeof(int8_t), out); } - for(; cout < out_dim_ch; ++cout) + for (; cout < out_dim_ch; ++cout) { const int8_t in00 = *(in_ptr_xi0_yi0 + cout * sizeof(int8_t)); const int8_t in01 = *(in_ptr_xi1_yi0 + cout * sizeof(int8_t)); @@ -291,7 +343,8 @@ void qasymm8_signed_neon_scale_bilinear(const ITensor *src, ITensor *dst, const #if defined(__aarch64__) && !defined(BARE_METAL) *(out_ptr_xo_yo + cout * sizeof(int8_t)) = quantize_qasymm8_signed(out, oq_info); #else // defined(__aarch64__) && !defined(BARE_METAL) - *(out_ptr_xo_yo + cout * sizeof(int8_t)) = quantize_qasymm8_signed(out, oq_info, RoundingPolicy::TO_ZERO); + *(out_ptr_xo_yo + cout * sizeof(int8_t)) = + quantize_qasymm8_signed(out, oq_info, RoundingPolicy::TO_ZERO); #endif // defined(__aarch64__) && !defined(BARE_METAL) } } @@ -303,28 +356,39 @@ void qasymm8_signed_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ARM_COMPUTE_ERROR("Not implemented"); } } -} +} // namespace namespace cpu { -void qasymm8_signed_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) +void qasymm8_signed_neon_scale(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + InterpolationPolicy policy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) { - if(policy == InterpolationPolicy::BILINEAR) + if (policy == InterpolationPolicy::BILINEAR) { - if(src->info()->quantization_info() == dst->info()->quantization_info() && border_mode == BorderMode::REPLICATE) + if 
(src->info()->quantization_info() == dst->info()->quantization_info() && + border_mode == BorderMode::REPLICATE) { - s8_neon_scale(src, dst, offsets, dx, dy, policy, border_mode, constant_border_value, sampling_offset, align_corners, window); + s8_neon_scale(src, dst, offsets, dx, dy, policy, border_mode, constant_border_value, sampling_offset, + align_corners, window); } else { - qasymm8_signed_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window); + qasymm8_signed_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, + sampling_offset, align_corners, window); } } - else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) + else if (policy == InterpolationPolicy::NEAREST_NEIGHBOR) { nearest_neon_scale<int8_t>(src, dst, offsets, sampling_offset, align_corners, window); } } } // namespace cpu -} // namespace arm_compute
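Editorial note (not part of the diff): the bilinear QASYMM8_SIGNED path above dequantizes the four neighbouring pixels, blends them by the fractional offsets (dx, dy), and requantizes the result. A minimal scalar sketch of that per-pixel step follows; the helper names and quantization structs here are illustrative assumptions, not the library's API, and rounding is simplified.

#include <algorithm>
#include <cmath>
#include <cstdint>

struct QuantInfo
{
    float   scale;
    int32_t offset;
};

inline float dequantize_s8(int8_t q, const QuantInfo &qi)
{
    return (static_cast<int32_t>(q) - qi.offset) * qi.scale;
}

inline int8_t quantize_s8(float v, const QuantInfo &qi)
{
    const int32_t q = static_cast<int32_t>(std::lround(v / qi.scale)) + qi.offset;
    return static_cast<int8_t>(std::min(127, std::max(-128, q)));
}

// Dequantize the four neighbours a00..a11, blend with weights dx, dy in [0, 1],
// then requantize with the output scale/offset.
inline int8_t scale_bilinear_pixel(int8_t a00, int8_t a01, int8_t a10, int8_t a11,
                                   float dx, float dy,
                                   const QuantInfo &iq, const QuantInfo &oq)
{
    const float f00 = dequantize_s8(a00, iq);
    const float f01 = dequantize_s8(a01, iq);
    const float f10 = dequantize_s8(a10, iq);
    const float f11 = dequantize_s8(a11, iq);

    const float top    = f00 + dx * (f01 - f00);    // interpolate along x on the top row
    const float bottom = f10 + dx * (f11 - f10);    // interpolate along x on the bottom row
    const float value  = top + dy * (bottom - top); // interpolate along y

    return quantize_s8(value, oq);
}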
\ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/kernels/scale/sve/fp16.cpp b/src/cpu/kernels/scale/sve/fp16.cpp index ceda19f366..cb28f4cb1c 100644 --- a/src/cpu/kernels/scale/sve/fp16.cpp +++ b/src/cpu/kernels/scale/sve/fp16.cpp @@ -27,9 +27,10 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Window.h" + +#include "src/core/helpers/ScaleHelpers.h" #include "src/core/NEON/NEMath.h" #include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/ScaleHelpers.h" #include "src/core/utils/ScaleUtils.h" #include "support/Rounding.h" @@ -41,8 +42,12 @@ namespace arm_compute { namespace { -void fp16_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets, - float sampling_offset, bool align_corners, const Window &window) +void fp16_sve_scale_nearest(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + float sampling_offset, + bool align_corners, + const Window &window) { const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; @@ -61,38 +66,50 @@ void fp16_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *off const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; - execute_window_loop(win, [&](const Coordinates & id) - { - const int32_t offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; - const auto in_hi = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr)); - const int offset_row = in_hi * in_stride_wc; - const auto in_ptr = reinterpret_cast<const float16_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]); - const auto out_ptr = reinterpret_cast<float16_t *>(out.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - svbool_t pg = svwhilelt_b16(x, window_end_x); - do + execute_window_loop( + win, + [&](const Coordinates &id) { - // Store results - svst1_f16(pg, out_ptr + x, svld1_f16(pg, in_ptr + offset + offset_row + x)); + const int32_t offset = + *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; + const auto in_hi = static_cast<int>( + align_corners ? 
utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) + : std::floor((id.z() + sampling_offset) * hr)); + const int offset_row = in_hi * in_stride_wc; + const auto in_ptr = reinterpret_cast<const float16_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]); + const auto out_ptr = reinterpret_cast<float16_t *>(out.ptr()); - x += svcntw(); - pg = svwhilelt_b16(x, window_end_x); - } - while(svptest_any(svptrue_b16(), pg)); - }, - out); -} + // Compute S elements per iteration + int x = window_start_x; + svbool_t pg = svwhilelt_b16(x, window_end_x); + do + { + // Store results + svst1_f16(pg, out_ptr + x, svld1_f16(pg, in_ptr + offset + offset_row + x)); + + x += svcntw(); + pg = svwhilelt_b16(x, window_end_x); + } while (svptest_any(svptrue_b16(), pg)); + }, + out); } +} // namespace namespace cpu { -void fp16_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) +void fp16_sve_scale(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + InterpolationPolicy policy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) { ARM_COMPUTE_UNUSED(dx, dy, border_mode, constant_border_value); - if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) + if (policy == InterpolationPolicy::NEAREST_NEIGHBOR) { fp16_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); } @@ -103,4 +120,4 @@ void fp16_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, co } } // namespace cpu } // namespace arm_compute -#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
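Editorial note (not part of the diff): the *_sve_scale_nearest kernels above all follow the same predicated SVE loop shape — build a while-lt predicate for the remaining elements, load/store under that predicate, advance by the vector length, and stop once the predicate is all-false. A minimal standalone sketch of that shape, assuming a compiler with SVE enabled (e.g. -march=armv8.2-a+sve):

#include <arm_sve.h>
#include <cstdint>

void copy_row_f32_sve(const float *src, float *dst, int64_t num_elements)
{
    int64_t  x  = 0;
    svbool_t pg = svwhilelt_b32(x, num_elements);
    do
    {
        // Inactive lanes are neither read nor written thanks to the predicate.
        svst1_f32(pg, dst + x, svld1_f32(pg, src + x));

        x += svcntw(); // number of 32-bit lanes per vector
        pg = svwhilelt_b32(x, num_elements);
    } while (svptest_any(svptrue_b32(), pg));
}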
\ No newline at end of file +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/scale/sve/fp32.cpp b/src/cpu/kernels/scale/sve/fp32.cpp index f3472f1efd..cbb345edbb 100644 --- a/src/cpu/kernels/scale/sve/fp32.cpp +++ b/src/cpu/kernels/scale/sve/fp32.cpp @@ -25,23 +25,27 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Window.h" + +#include "src/core/helpers/ScaleHelpers.h" #include "src/core/NEON/NEMath.h" #include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/ScaleHelpers.h" #include "src/core/utils/ScaleUtils.h" #include "support/Rounding.h" +#include <arm_sve.h> #include <cmath> #include <cstddef> -#include <arm_sve.h> - namespace arm_compute { namespace { -void fp32_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets, - float sampling_offset, bool align_corners, const Window &window) +void fp32_sve_scale_nearest(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + float sampling_offset, + bool align_corners, + const Window &window) { const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; @@ -60,38 +64,50 @@ void fp32_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *off const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; - execute_window_loop(win, [&](const Coordinates & id) - { - const int32_t offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; - const auto in_hi = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr)); - const int offset_row = in_hi * in_stride_wc; - const auto in_ptr = reinterpret_cast<const float *>(in_ptr_start + in_stride_bytes_hwc * id[3]); - const auto out_ptr = reinterpret_cast<float *>(out.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - svbool_t pg = svwhilelt_b32(x, window_end_x); - do + execute_window_loop( + win, + [&](const Coordinates &id) { - // Store results - svst1_f32(pg, out_ptr + x, svld1_f32(pg, in_ptr + offset + offset_row + x)); + const int32_t offset = + *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; + const auto in_hi = static_cast<int>( + align_corners ? 
utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) + : std::floor((id.z() + sampling_offset) * hr)); + const int offset_row = in_hi * in_stride_wc; + const auto in_ptr = reinterpret_cast<const float *>(in_ptr_start + in_stride_bytes_hwc * id[3]); + const auto out_ptr = reinterpret_cast<float *>(out.ptr()); - x += svcntw(); - pg = svwhilelt_b32(x, window_end_x); - } - while(svptest_any(svptrue_b32(), pg)); - }, - out); -} + // Compute S elements per iteration + int x = window_start_x; + svbool_t pg = svwhilelt_b32(x, window_end_x); + do + { + // Store results + svst1_f32(pg, out_ptr + x, svld1_f32(pg, in_ptr + offset + offset_row + x)); + + x += svcntw(); + pg = svwhilelt_b32(x, window_end_x); + } while (svptest_any(svptrue_b32(), pg)); + }, + out); } +} // namespace namespace cpu { -void fp32_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) +void fp32_sve_scale(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + InterpolationPolicy policy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) { ARM_COMPUTE_UNUSED(dx, dy, border_mode, constant_border_value); - if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) + if (policy == InterpolationPolicy::NEAREST_NEIGHBOR) { fp32_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); } diff --git a/src/cpu/kernels/scale/sve/integer.cpp b/src/cpu/kernels/scale/sve/integer.cpp index 82c70ee360..df950b1789 100644 --- a/src/cpu/kernels/scale/sve/integer.cpp +++ b/src/cpu/kernels/scale/sve/integer.cpp @@ -25,9 +25,10 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Window.h" + +#include "src/core/helpers/ScaleHelpers.h" #include "src/core/NEON/NEMath.h" #include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/ScaleHelpers.h" #include "src/core/utils/ScaleUtils.h" #include "support/Rounding.h" @@ -39,8 +40,12 @@ namespace arm_compute { namespace { -void u8_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets, - float sampling_offset, bool align_corners, const Window &window) +void u8_sve_scale_nearest(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + float sampling_offset, + bool align_corners, + const Window &window) { const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; @@ -59,32 +64,40 @@ void u8_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offse const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; - execute_window_loop(win, [&](const Coordinates & id) - { - const int32_t offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; - const auto in_hi = static_cast<int>(align_corners ? 
utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr)); - const int offset_row = in_hi * in_stride_wc; - const auto in_ptr = reinterpret_cast<const uint8_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]); - const auto out_ptr = reinterpret_cast<uint8_t *>(out.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - svbool_t pg = svwhilelt_b8(x, window_end_x); - do + execute_window_loop( + win, + [&](const Coordinates &id) { - // Store results - svst1_u8(pg, out_ptr + x, svld1_u8(pg, in_ptr + offset + offset_row + x)); - - x += svcntw(); - pg = svwhilelt_b8(x, window_end_x); - } - while(svptest_any(svptrue_b8(), pg)); - }, - out); + const int32_t offset = + *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; + const auto in_hi = static_cast<int>( + align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) + : std::floor((id.z() + sampling_offset) * hr)); + const int offset_row = in_hi * in_stride_wc; + const auto in_ptr = reinterpret_cast<const uint8_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]); + const auto out_ptr = reinterpret_cast<uint8_t *>(out.ptr()); + + // Compute S elements per iteration + int x = window_start_x; + svbool_t pg = svwhilelt_b8(x, window_end_x); + do + { + // Store results + svst1_u8(pg, out_ptr + x, svld1_u8(pg, in_ptr + offset + offset_row + x)); + + x += svcntw(); + pg = svwhilelt_b8(x, window_end_x); + } while (svptest_any(svptrue_b8(), pg)); + }, + out); } -void s16_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets, - float sampling_offset, bool align_corners, const Window &window) +void s16_sve_scale_nearest(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + float sampling_offset, + bool align_corners, + const Window &window) { const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; @@ -103,38 +116,50 @@ void s16_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offs const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; - execute_window_loop(win, [&](const Coordinates & id) - { - const int32_t offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; - const auto in_hi = static_cast<int>(align_corners ? 
utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr)); - const int offset_row = in_hi * in_stride_wc; - const auto in_ptr = reinterpret_cast<const int16_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]); - const auto out_ptr = reinterpret_cast<int16_t *>(out.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - svbool_t pg = svwhilelt_b16(x, window_end_x); - do + execute_window_loop( + win, + [&](const Coordinates &id) { - // Store results - svst1_s16(pg, out_ptr + x, svld1_s16(pg, in_ptr + offset + offset_row + x)); - - x += svcntw(); - pg = svwhilelt_b16(x, window_end_x); - } - while(svptest_any(svptrue_b16(), pg)); - }, - out); -} + const int32_t offset = + *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; + const auto in_hi = static_cast<int>( + align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) + : std::floor((id.z() + sampling_offset) * hr)); + const int offset_row = in_hi * in_stride_wc; + const auto in_ptr = reinterpret_cast<const int16_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]); + const auto out_ptr = reinterpret_cast<int16_t *>(out.ptr()); + + // Compute S elements per iteration + int x = window_start_x; + svbool_t pg = svwhilelt_b16(x, window_end_x); + do + { + // Store results + svst1_s16(pg, out_ptr + x, svld1_s16(pg, in_ptr + offset + offset_row + x)); + + x += svcntw(); + pg = svwhilelt_b16(x, window_end_x); + } while (svptest_any(svptrue_b16(), pg)); + }, + out); } +} // namespace namespace cpu { -void u8_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) +void u8_sve_scale(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + InterpolationPolicy policy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) { ARM_COMPUTE_UNUSED(dx, dy, border_mode, constant_border_value); - if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) + if (policy == InterpolationPolicy::NEAREST_NEIGHBOR) { u8_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); } @@ -144,12 +169,20 @@ void u8_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, cons } } -void s16_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) +void s16_sve_scale(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + InterpolationPolicy policy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) { ARM_COMPUTE_UNUSED(dx, dy, border_mode, constant_border_value); - if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) + if (policy == InterpolationPolicy::NEAREST_NEIGHBOR) { s16_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); } diff --git a/src/cpu/kernels/scale/sve/list.h b/src/cpu/kernels/scale/sve/list.h index b9c3a10a78..aff741a4a7 100644 --- a/src/cpu/kernels/scale/sve/list.h +++ 
b/src/cpu/kernels/scale/sve/list.h @@ -28,10 +28,10 @@ namespace arm_compute { namespace cpu { -#define DECLARE_SCALE_KERNEL(func_name) \ - void func_name(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, \ - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, \ - bool align_corners, const Window &window) +#define DECLARE_SCALE_KERNEL(func_name) \ + void func_name(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, \ + InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, \ + float sampling_offset, bool align_corners, const Window &window) DECLARE_SCALE_KERNEL(fp16_sve_scale); DECLARE_SCALE_KERNEL(fp32_sve_scale); diff --git a/src/cpu/kernels/scale/sve/qasymm8.cpp b/src/cpu/kernels/scale/sve/qasymm8.cpp index d45a69e43b..0fc794c6c2 100644 --- a/src/cpu/kernels/scale/sve/qasymm8.cpp +++ b/src/cpu/kernels/scale/sve/qasymm8.cpp @@ -25,10 +25,10 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Window.h" + +#include "src/core/helpers/ScaleHelpers.h" #include "src/core/NEON/NEMath.h" #include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/ScaleHelpers.h" -#include "src/core/helpers/ScaleHelpers.h" #include "src/core/utils/ScaleUtils.h" #include "support/Rounding.h" @@ -40,8 +40,12 @@ namespace arm_compute { namespace { -void qasymm8_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets, - float sampling_offset, bool align_corners, const Window &window) +void qasymm8_sve_scale_nearest(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + float sampling_offset, + bool align_corners, + const Window &window) { const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; @@ -60,38 +64,50 @@ void qasymm8_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor * const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; - execute_window_loop(win, [&](const Coordinates & id) - { - const int32_t offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; - const auto in_hi = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr)); - const int offset_row = in_hi * in_stride_wc; - const auto in_ptr = reinterpret_cast<const uint8_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]); - const auto out_ptr = reinterpret_cast<uint8_t *>(out.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - svbool_t pg = svwhilelt_b8(x, window_end_x); - do + execute_window_loop( + win, + [&](const Coordinates &id) { - // Store results - svst1_u8(pg, out_ptr + x, svld1_u8(pg, in_ptr + offset + offset_row + x)); + const int32_t offset = + *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; + const auto in_hi = static_cast<int>( + align_corners ? 
utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) + : std::floor((id.z() + sampling_offset) * hr)); + const int offset_row = in_hi * in_stride_wc; + const auto in_ptr = reinterpret_cast<const uint8_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]); + const auto out_ptr = reinterpret_cast<uint8_t *>(out.ptr()); - x += svcntw(); - pg = svwhilelt_b8(x, window_end_x); - } - while(svptest_any(svptrue_b8(), pg)); - }, - out); -} + // Compute S elements per iteration + int x = window_start_x; + svbool_t pg = svwhilelt_b8(x, window_end_x); + do + { + // Store results + svst1_u8(pg, out_ptr + x, svld1_u8(pg, in_ptr + offset + offset_row + x)); + + x += svcntw(); + pg = svwhilelt_b8(x, window_end_x); + } while (svptest_any(svptrue_b8(), pg)); + }, + out); } +} // namespace namespace cpu { -void qasymm8_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) +void qasymm8_sve_scale(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + InterpolationPolicy policy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) { ARM_COMPUTE_UNUSED(dx, dy, border_mode, constant_border_value); - if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) + if (policy == InterpolationPolicy::NEAREST_NEIGHBOR) { qasymm8_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); } diff --git a/src/cpu/kernels/scale/sve/qasymm8_signed.cpp b/src/cpu/kernels/scale/sve/qasymm8_signed.cpp index 67bca65f58..68ea01e29e 100644 --- a/src/cpu/kernels/scale/sve/qasymm8_signed.cpp +++ b/src/cpu/kernels/scale/sve/qasymm8_signed.cpp @@ -25,10 +25,10 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Window.h" + +#include "src/core/helpers/ScaleHelpers.h" #include "src/core/NEON/NEMath.h" #include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/ScaleHelpers.h" -#include "src/core/helpers/ScaleHelpers.h" #include "src/core/utils/ScaleUtils.h" #include "support/Rounding.h" @@ -40,8 +40,12 @@ namespace arm_compute { namespace { -void qasymm8_signed_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets, - float sampling_offset, bool align_corners, const Window &window) +void qasymm8_signed_sve_scale_nearest(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + float sampling_offset, + bool align_corners, + const Window &window) { const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; @@ -60,38 +64,50 @@ void qasymm8_signed_sve_scale_nearest(const ITensor *src, ITensor *dst, const IT const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; - execute_window_loop(win, [&](const Coordinates & id) - { - const int32_t offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; - const auto in_hi = static_cast<int>(align_corners ? 
utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr)); - const int offset_row = in_hi * in_stride_wc; - const auto in_ptr = reinterpret_cast<const int8_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]); - const auto out_ptr = reinterpret_cast<int8_t *>(out.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - svbool_t pg = svwhilelt_b8(x, window_end_x); - do + execute_window_loop( + win, + [&](const Coordinates &id) { - // Store results - svst1_s8(pg, out_ptr + x, svld1_s8(pg, in_ptr + offset + offset_row + x)); + const int32_t offset = + *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; + const auto in_hi = static_cast<int>( + align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) + : std::floor((id.z() + sampling_offset) * hr)); + const int offset_row = in_hi * in_stride_wc; + const auto in_ptr = reinterpret_cast<const int8_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]); + const auto out_ptr = reinterpret_cast<int8_t *>(out.ptr()); - x += svcntw(); - pg = svwhilelt_b8(x, window_end_x); - } - while(svptest_any(svptrue_b8(), pg)); - }, - out); -} + // Compute S elements per iteration + int x = window_start_x; + svbool_t pg = svwhilelt_b8(x, window_end_x); + do + { + // Store results + svst1_s8(pg, out_ptr + x, svld1_s8(pg, in_ptr + offset + offset_row + x)); + + x += svcntw(); + pg = svwhilelt_b8(x, window_end_x); + } while (svptest_any(svptrue_b8(), pg)); + }, + out); } +} // namespace namespace cpu { -void qasymm8_signed_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, - InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, - bool align_corners, const Window &window) +void qasymm8_signed_sve_scale(const ITensor *src, + ITensor *dst, + const ITensor *offsets, + const ITensor *dx, + const ITensor *dy, + InterpolationPolicy policy, + BorderMode border_mode, + PixelValue constant_border_value, + float sampling_offset, + bool align_corners, + const Window &window) { ARM_COMPUTE_UNUSED(dx, dy, border_mode, constant_border_value); - if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) + if (policy == InterpolationPolicy::NEAREST_NEIGHBOR) { qasymm8_signed_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); } diff --git a/src/cpu/kernels/select/generic/neon/fp16.cpp b/src/cpu/kernels/select/generic/neon/fp16.cpp index b460213c72..38a58099bd 100644 --- a/src/cpu/kernels/select/generic/neon/fp16.cpp +++ b/src/cpu/kernels/select/generic/neon/fp16.cpp @@ -23,20 +23,22 @@ */ #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) -#include "src/cpu/kernels/select/generic/neon/impl.h" - #include "arm_compute/core/Helpers.h" + #include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/kernels/select/generic/neon/impl.h" namespace arm_compute { namespace cpu { -void neon_f16_select_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) +void neon_f16_select_same_rank( + const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) { return select_op_16<float16_t, uint16x8_t>(c, x, y, output, window); } -void neon_f16_select_not_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) +void neon_f16_select_not_same_rank( + const ITensor 
*c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) { return select_op_not_same_rank<float16_t>(c, x, y, output, window); } @@ -45,4 +47,4 @@ void neon_f16_select_not_same_rank(const ITensor *c, const ITensor *x, const ITe } // namespace arm_compute -#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
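Editorial note (not part of the diff): the NEON select kernels above widen the uint8_t condition into a full-width lane mask and bit-select between the two inputs, with a scalar tail for leftovers. A minimal sketch of that pattern using raw NEON intrinsics instead of the library's wrapper:: layer; names are illustrative, and the GCC/Clang braced vector initializer is assumed.

#include <arm_neon.h>
#include <cstdint>

void select_f32_neon(const uint8_t *cond, const float *x, const float *y, float *out, int num)
{
    int i = 0;
    for (; i <= num - 4; i += 4)
    {
        // Widen four condition bytes and compare against zero to get an all-ones/all-zeros mask per lane.
        const uint32x4_t c32  = {cond[i + 0], cond[i + 1], cond[i + 2], cond[i + 3]};
        const uint32x4_t mask = vcgtq_u32(c32, vdupq_n_u32(0));
        vst1q_f32(out + i, vbslq_f32(mask, vld1q_f32(x + i), vld1q_f32(y + i)));
    }
    for (; i < num; ++i)
    {
        out[i] = cond[i] ? x[i] : y[i]; // scalar tail
    }
}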
\ No newline at end of file +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/select/generic/neon/fp32.cpp b/src/cpu/kernels/select/generic/neon/fp32.cpp index 63fd594901..50a80cb338 100644 --- a/src/cpu/kernels/select/generic/neon/fp32.cpp +++ b/src/cpu/kernels/select/generic/neon/fp32.cpp @@ -22,20 +22,22 @@ * SOFTWARE. */ -#include "src/cpu/kernels/select/generic/neon/impl.h" - #include "arm_compute/core/Helpers.h" + #include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/kernels/select/generic/neon/impl.h" namespace arm_compute { namespace cpu { -void neon_f32_select_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) +void neon_f32_select_same_rank( + const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) { return select_op_32<float, uint32x4_t>(c, x, y, output, window); } -void neon_f32_select_not_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) +void neon_f32_select_not_same_rank( + const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) { return select_op_not_same_rank<float>(c, x, y, output, window); } diff --git a/src/cpu/kernels/select/generic/neon/impl.h b/src/cpu/kernels/select/generic/neon/impl.h index 6a6d9969f8..7ce640b6ff 100644 --- a/src/cpu/kernels/select/generic/neon/impl.h +++ b/src/cpu/kernels/select/generic/neon/impl.h @@ -25,6 +25,7 @@ #define ACL_SRC_CPU_KERNELS_SELECT_GENERIC_NEON_IMPL_H #include "arm_compute/core/TensorInfo.h" + #include "src/core/NEON/NEAsymm.h" #include "src/cpu/kernels/select/generic/neon/impl.h" @@ -37,8 +38,16 @@ namespace arm_compute namespace cpu { template <typename ScalarType, typename VectorType> -void select_op(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, - const int window_step_x, const int window_start_x, const int window_end_x, const int limit, VectorType (*condition_conversion)(const uint8_t *)) +void select_op(const ITensor *cond, + const ITensor *in1, + const ITensor *in2, + ITensor *out, + const Window &window, + const int window_step_x, + const int window_start_x, + const int window_end_x, + const int limit, + VectorType (*condition_conversion)(const uint8_t *)) { Window win = window; win.set(Window::DimX, Window::Dimension(0, 1, 1)); @@ -48,30 +57,32 @@ void select_op(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITen Iterator input2(in2, win); Iterator output(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr()); - const auto condition_ptr = reinterpret_cast<const uint8_t *>(condition.ptr()); - const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr()); - const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr()); - - int x = window_start_x; - for(; x <= limit; x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const auto c = (*condition_conversion)(condition_ptr + x); - const auto a = wrapper::vloadq(input1_ptr + x); - const auto b = wrapper::vloadq(input2_ptr + x); - wrapper::vstore(output_ptr + x, wrapper::vbsl(c, a, b)); - } - for(; x < window_end_x; ++x) - { - const auto c = *(condition_ptr + x); - const auto a = *(input1_ptr + x); - const auto b = *(input2_ptr + x); - *(output_ptr + x) = static_cast<bool>(c) ? 
a : b; - } - }, - condition, input1, input2, output); + auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr()); + const auto condition_ptr = reinterpret_cast<const uint8_t *>(condition.ptr()); + const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr()); + + int x = window_start_x; + for (; x <= limit; x += window_step_x) + { + const auto c = (*condition_conversion)(condition_ptr + x); + const auto a = wrapper::vloadq(input1_ptr + x); + const auto b = wrapper::vloadq(input2_ptr + x); + wrapper::vstore(output_ptr + x, wrapper::vbsl(c, a, b)); + } + for (; x < window_end_x; ++x) + { + const auto c = *(condition_ptr + x); + const auto a = *(input1_ptr + x); + const auto b = *(input2_ptr + x); + *(output_ptr + x) = static_cast<bool>(c) ? a : b; + } + }, + condition, input1, input2, output); } template <typename ScalarType, typename VectorType> @@ -81,11 +92,14 @@ void select_op_8(const ITensor *cond, const ITensor *in1, const ITensor *in2, IT const auto window_start_x = static_cast<int>(window.x().start()); const auto window_end_x = static_cast<int>(window.x().end()); - select_op<ScalarType, VectorType>(cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, [](const uint8_t *condition_ptr) -> VectorType - { - static const auto zero = wrapper::vdup_n(static_cast<uint8_t>(0), arm_compute::wrapper::traits::vector_128_tag()); - return wrapper::vcgt(wrapper::vloadq(condition_ptr), zero); - }); + select_op<ScalarType, VectorType>( + cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, + [](const uint8_t *condition_ptr) -> VectorType + { + static const auto zero = + wrapper::vdup_n(static_cast<uint8_t>(0), arm_compute::wrapper::traits::vector_128_tag()); + return wrapper::vcgt(wrapper::vloadq(condition_ptr), zero); + }); } template <typename ScalarType, typename VectorType> @@ -95,11 +109,14 @@ void select_op_16(const ITensor *cond, const ITensor *in1, const ITensor *in2, I const auto window_start_x = static_cast<int>(window.x().start()); const auto window_end_x = static_cast<int>(window.x().end()); - select_op<ScalarType, VectorType>(cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, [](const uint8_t *condition_ptr) -> VectorType - { - static const auto zero = wrapper::vdup_n(static_cast<uint16_t>(0), arm_compute::wrapper::traits::vector_128_tag()); - return wrapper::vcgt(wrapper::vmovl(wrapper::vload(condition_ptr)), zero); - }); + select_op<ScalarType, VectorType>( + cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, + [](const uint8_t *condition_ptr) -> VectorType + { + static const auto zero = + wrapper::vdup_n(static_cast<uint16_t>(0), arm_compute::wrapper::traits::vector_128_tag()); + return wrapper::vcgt(wrapper::vmovl(wrapper::vload(condition_ptr)), zero); + }); } template <typename ScalarType, typename VectorType> @@ -109,15 +126,19 @@ void select_op_32(const ITensor *cond, const ITensor *in1, const ITensor *in2, I const auto window_start_x = static_cast<int>(window.x().start()); const auto window_end_x = static_cast<int>(window.x().end()); - select_op<ScalarType, VectorType>(cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, [](const uint8_t *condition_ptr) -> VectorType - { - static const auto zero = 
wrapper::vdup_n(static_cast<uint32_t>(0), arm_compute::wrapper::traits::vector_128_tag()); - return wrapper::vcgt(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vload(condition_ptr)))), zero); - }); + select_op<ScalarType, VectorType>( + cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, + [](const uint8_t *condition_ptr) -> VectorType + { + static const auto zero = + wrapper::vdup_n(static_cast<uint32_t>(0), arm_compute::wrapper::traits::vector_128_tag()); + return wrapper::vcgt(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vload(condition_ptr)))), zero); + }); } template <typename ScalarType> -void select_op_not_same_rank(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +void select_op_not_same_rank( + const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) { ARM_COMPUTE_UNUSED(window); @@ -131,20 +152,20 @@ void select_op_not_same_rank(const ITensor *cond, const ITensor *in1, const ITen int offset = 0; const int step = 16 / in1->info()->element_size(); - for(int i = 0; i < outer_size; ++i) + for (int i = 0; i < outer_size; ++i) { int x = offset; const auto input_ptr = static_cast<bool>(*(condition_ptr + i)) ? input1_ptr : input2_ptr; - for(; x <= offset + inner_size - step; x += step) + for (; x <= offset + inner_size - step; x += step) { wrapper::vstore(output_ptr + x, wrapper::vloadq(input_ptr + x)); } - if(x <= offset + inner_size - (step / 2)) + if (x <= offset + inner_size - (step / 2)) { wrapper::vstore(output_ptr + x, wrapper::vload(input_ptr + x)); x += step / 2; } - for(; x < offset + inner_size; ++x) + for (; x < offset + inner_size; ++x) { *(output_ptr + x) = *(input_ptr + x); } diff --git a/src/cpu/kernels/select/generic/neon/integer.cpp b/src/cpu/kernels/select/generic/neon/integer.cpp index 71b2f0b933..135087c261 100644 --- a/src/cpu/kernels/select/generic/neon/integer.cpp +++ b/src/cpu/kernels/select/generic/neon/integer.cpp @@ -25,59 +25,71 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include <arm_neon.h> - #include "src/cpu/kernels/select/generic/neon/impl.h" +#include <arm_neon.h> + namespace arm_compute { namespace cpu { -void neon_s8_select_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) +void neon_s8_select_same_rank( + const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) { return select_op_8<int8_t, uint8x16_t>(c, x, y, output, window); } -void neon_s16_select_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) +void neon_s16_select_same_rank( + const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) { return select_op_16<int16_t, uint16x8_t>(c, x, y, output, window); } -void neon_s32_select_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) +void neon_s32_select_same_rank( + const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) { return select_op_32<int32_t, uint32x4_t>(c, x, y, output, window); } -void neon_s8_select_not_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) +void neon_s8_select_not_same_rank( + const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) { return select_op_not_same_rank<int8_t>(c, x, y, 
output, window); } -void neon_s16_select_not_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) +void neon_s16_select_not_same_rank( + const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) { return select_op_not_same_rank<int16_t>(c, x, y, output, window); } -void neon_s32_select_not_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) +void neon_s32_select_not_same_rank( + const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) { return select_op_not_same_rank<int32_t>(c, x, y, output, window); } -void neon_u8_select_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) +void neon_u8_select_same_rank( + const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) { return select_op_8<uint8_t, uint8x16_t>(c, x, y, output, window); } -void neon_u16_select_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) +void neon_u16_select_same_rank( + const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) { return select_op_16<uint16_t, uint16x8_t>(c, x, y, output, window); } -void neon_u32_select_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) +void neon_u32_select_same_rank( + const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) { return select_op_32<uint32_t, uint32x4_t>(c, x, y, output, window); } -void neon_u8_select_not_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) +void neon_u8_select_not_same_rank( + const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) { return select_op_not_same_rank<uint8_t>(c, x, y, output, window); } -void neon_u16_select_not_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) +void neon_u16_select_not_same_rank( + const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) { return select_op_not_same_rank<uint16_t>(c, x, y, output, window); } -void neon_u32_select_not_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) +void neon_u32_select_not_same_rank( + const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window) { return select_op_not_same_rank<uint32_t>(c, x, y, output, window); } diff --git a/src/cpu/kernels/softmax/generic/neon/fp16.cpp b/src/cpu/kernels/softmax/generic/neon/fp16.cpp index f6556696b0..2e2adf33e0 100644 --- a/src/cpu/kernels/softmax/generic/neon/fp16.cpp +++ b/src/cpu/kernels/softmax/generic/neon/fp16.cpp @@ -23,6 +23,7 @@ */ #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) #include "arm_compute/core/Helpers.h" + #include "src/cpu/CpuTypes.h" #include "src/cpu/kernels/softmax/generic/neon/impl.h" @@ -30,8 +31,13 @@ namespace arm_compute { namespace cpu { -void neon_fp16_softmax(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window) +void neon_fp16_softmax(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + const float beta, + bool is_log, + const Window &window) { return neon_softmax_logits_1d_float<float16_t>(in, max, tmp, out, beta, is_log, window); } @@ 
-40,6 +46,6 @@ void neon_fp16_logits(const ITensor *in, ITensor *out, const Window &window) { return neon_logits_1d_max<float16_t>(in, out, window); } -} +} // namespace cpu } // namespace arm_compute #endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) diff --git a/src/cpu/kernels/softmax/generic/neon/fp32.cpp b/src/cpu/kernels/softmax/generic/neon/fp32.cpp index ddd270ae70..61df40c1b5 100644 --- a/src/cpu/kernels/softmax/generic/neon/fp32.cpp +++ b/src/cpu/kernels/softmax/generic/neon/fp32.cpp @@ -22,14 +22,20 @@ * SOFTWARE. */ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/softmax/generic/neon/impl.h" namespace arm_compute { namespace cpu { -void neon_fp32_softmax(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window) +void neon_fp32_softmax(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + const float beta, + bool is_log, + const Window &window) { return neon_softmax_logits_1d_float<float>(in, max, tmp, out, beta, is_log, window); } @@ -38,5 +44,5 @@ void neon_fp32_logits(const ITensor *in, ITensor *out, const Window &window) { return neon_logits_1d_max<float>(in, out, window); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/neon/impl.cpp b/src/cpu/kernels/softmax/generic/neon/impl.cpp index f07fd2fb27..5d6e6a4f80 100644 --- a/src/cpu/kernels/softmax/generic/neon/impl.cpp +++ b/src/cpu/kernels/softmax/generic/neon/impl.cpp @@ -22,6 +22,7 @@ * SOFTWARE. */ #include "src/cpu/kernels/softmax/generic/neon/impl.h" + #include "support/SaturateCast.h" namespace arm_compute @@ -32,11 +33,10 @@ template void neon_logits_1d_max<qasymm8_signed_t>(const ITensor *in, ITensor *o template void neon_logits_1d_max<qasymm8_t>(const ITensor *in, ITensor *out, const Window &window); template <typename T> -void neon_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, float beta, bool is_log, const Window &window) +void neon_softmax_logits_1d_quantized( + const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, float beta, bool is_log, const Window &window) { - static_assert(std::is_same<T, qasymm8_t>::value - || std::is_same<T, qasymm8_signed_t>::value, + static_assert(std::is_same<T, qasymm8_t>::value || std::is_same<T, qasymm8_signed_t>::value, "quantized type should be either qasymm8_t or qasymm8_signed_t."); const int start_x = in->info()->valid_region().anchor.x(); @@ -50,163 +50,174 @@ void neon_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, voi Iterator out_it(out, window); constexpr int vec_size = 16; - execute_window_loop(window, [&](const Coordinates &) - { - /* Get pointers */ - const auto in_ptr = reinterpret_cast<const T *>(in_it.ptr()) + start_x; - const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x; - const auto tmp_ptr = reinterpret_cast<float *>(tmp); - - float sum{}; - float sum_inversed{}; - - /* Compute exponentials and sum */ + execute_window_loop( + window, + [&](const Coordinates &) { - /* Get max value */ - const auto max_val = *reinterpret_cast<const T *>(max_it.ptr()); - const auto vec_max = wrapper::vdup_n(max_val, wrapper::traits::vector_128_tag{}); + /* Get pointers */ + const auto in_ptr = reinterpret_cast<const T *>(in_it.ptr()) + start_x; + const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x; + const auto tmp_ptr = reinterpret_cast<float *>(tmp); - /* Init sum to zero */ - float32x4x4_t vec_sum = - { 
- vdupq_n_f32(0.f), - vdupq_n_f32(0.f), - vdupq_n_f32(0.f), - vdupq_n_f32(0.f), - }; - - /* Loop over row and compute exponentials and sum */ - int x = 0; - for(; x <= (input_width - vec_size); x += vec_size) - { - auto vec_elements = wrapper::vloadq(in_ptr + x); - vec_elements = wrapper::vqsub(vec_max, vec_elements); - auto vec_elements_flt = convert_int_to_float<float32x4x4_t>(vec_elements); + float sum{}; + float sum_inversed{}; - if(is_log) - { - vec_elements_flt.val[0] = vmulq_f32(vec_elements_flt.val[0], scale_beta_vec); - vec_elements_flt.val[1] = vmulq_f32(vec_elements_flt.val[1], scale_beta_vec); - vec_elements_flt.val[2] = vmulq_f32(vec_elements_flt.val[2], scale_beta_vec); - vec_elements_flt.val[3] = vmulq_f32(vec_elements_flt.val[3], scale_beta_vec); - vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vexpq_f32(vec_elements_flt.val[0])); - vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vexpq_f32(vec_elements_flt.val[1])); - vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vexpq_f32(vec_elements_flt.val[2])); - vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vexpq_f32(vec_elements_flt.val[3])); - } - else + /* Compute exponentials and sum */ + { + /* Get max value */ + const auto max_val = *reinterpret_cast<const T *>(max_it.ptr()); + const auto vec_max = wrapper::vdup_n(max_val, wrapper::traits::vector_128_tag{}); + + /* Init sum to zero */ + float32x4x4_t vec_sum = { + vdupq_n_f32(0.f), + vdupq_n_f32(0.f), + vdupq_n_f32(0.f), + vdupq_n_f32(0.f), + }; + + /* Loop over row and compute exponentials and sum */ + int x = 0; + for (; x <= (input_width - vec_size); x += vec_size) { - vec_elements_flt.val[0] = vexpq_f32(vmulq_f32(vec_elements_flt.val[0], scale_beta_vec)); - vec_elements_flt.val[1] = vexpq_f32(vmulq_f32(vec_elements_flt.val[1], scale_beta_vec)); - vec_elements_flt.val[2] = vexpq_f32(vmulq_f32(vec_elements_flt.val[2], scale_beta_vec)); - vec_elements_flt.val[3] = vexpq_f32(vmulq_f32(vec_elements_flt.val[3], scale_beta_vec)); - vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vec_elements_flt.val[0]); - vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vec_elements_flt.val[1]); - vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vec_elements_flt.val[2]); - vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vec_elements_flt.val[3]); + auto vec_elements = wrapper::vloadq(in_ptr + x); + vec_elements = wrapper::vqsub(vec_max, vec_elements); + auto vec_elements_flt = convert_int_to_float<float32x4x4_t>(vec_elements); + + if (is_log) + { + vec_elements_flt.val[0] = vmulq_f32(vec_elements_flt.val[0], scale_beta_vec); + vec_elements_flt.val[1] = vmulq_f32(vec_elements_flt.val[1], scale_beta_vec); + vec_elements_flt.val[2] = vmulq_f32(vec_elements_flt.val[2], scale_beta_vec); + vec_elements_flt.val[3] = vmulq_f32(vec_elements_flt.val[3], scale_beta_vec); + vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vexpq_f32(vec_elements_flt.val[0])); + vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vexpq_f32(vec_elements_flt.val[1])); + vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vexpq_f32(vec_elements_flt.val[2])); + vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vexpq_f32(vec_elements_flt.val[3])); + } + else + { + vec_elements_flt.val[0] = vexpq_f32(vmulq_f32(vec_elements_flt.val[0], scale_beta_vec)); + vec_elements_flt.val[1] = vexpq_f32(vmulq_f32(vec_elements_flt.val[1], scale_beta_vec)); + vec_elements_flt.val[2] = vexpq_f32(vmulq_f32(vec_elements_flt.val[2], scale_beta_vec)); + vec_elements_flt.val[3] = vexpq_f32(vmulq_f32(vec_elements_flt.val[3], scale_beta_vec)); + vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vec_elements_flt.val[0]); + 
vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vec_elements_flt.val[1]); + vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vec_elements_flt.val[2]); + vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vec_elements_flt.val[3]); + } + + vst4q_f32(tmp_ptr + x, vec_elements_flt); } - vst4q_f32(tmp_ptr + x, vec_elements_flt); - } + /* Reduce sum */ + const auto sum_16_byte = + vaddq_f32(vaddq_f32(vec_sum.val[0], vec_sum.val[1]), vaddq_f32(vec_sum.val[2], vec_sum.val[3])); + auto sum_res = vpadd_f32(vget_high_f32(sum_16_byte), vget_low_f32(sum_16_byte)); + sum_res = vpadd_f32(sum_res, sum_res); + sum = wrapper::vgetlane(sum_res, 0); - /* Reduce sum */ - const auto sum_16_byte = vaddq_f32(vaddq_f32(vec_sum.val[0], vec_sum.val[1]), vaddq_f32(vec_sum.val[2], vec_sum.val[3])); - auto sum_res = vpadd_f32(vget_high_f32(sum_16_byte), vget_low_f32(sum_16_byte)); - sum_res = vpadd_f32(sum_res, sum_res); - sum = wrapper::vgetlane(sum_res, 0); + /* Run remaining elements */ + for (; x < input_width; ++x) + { + float element{}; + if (is_log) + { + element = (max_val - in_ptr[x]) * scale_beta; + sum += std::exp(element); + } + else + { + element = std::exp((max_val - in_ptr[x]) * scale_beta); + sum += element; + } - /* Run remaining elements */ - for(; x < input_width; ++x) - { - float element{}; - if(is_log) + tmp_ptr[x] = element; + } + + if (!is_log) { - element = (max_val - in_ptr[x]) * scale_beta; - sum += std::exp(element); + sum_inversed = 256.f / sum; } else { - element = std::exp((max_val - in_ptr[x]) * scale_beta); - sum += element; + sum = std::log(sum); } - - tmp_ptr[x] = element; } - if(!is_log) - { - sum_inversed = 256.f / sum; - } - else + /* Normalize exponentials */ { - sum = std::log(sum); - } - } - - /* Normalize exponentials */ - { - constexpr bool is_qasymm8_signed = std::is_same<T, qasymm8_signed_t>::value; - /* Loop over row and compute softmax */ - int x = 0; - for(; x <= (input_width - vec_size); x += vec_size) - { - using int_vec_type = wrapper::traits::neon_vector_t<T, 16>; - float32x4x4_t vec_in = vld4q_f32(tmp_ptr + x); - int_vec_type normalized_value{}; - if(is_log) + constexpr bool is_qasymm8_signed = std::is_same<T, qasymm8_signed_t>::value; + /* Loop over row and compute softmax */ + int x = 0; + for (; x <= (input_width - vec_size); x += vec_size) { - const float32x4x4_t sub = + using int_vec_type = wrapper::traits::neon_vector_t<T, 16>; + float32x4x4_t vec_in = vld4q_f32(tmp_ptr + x); + int_vec_type normalized_value{}; + if (is_log) { - vsubq_f32(vec_in.val[0], vdupq_n_f32(sum)), - vsubq_f32(vec_in.val[1], vdupq_n_f32(sum)), - vsubq_f32(vec_in.val[2], vdupq_n_f32(sum)), - vsubq_f32(vec_in.val[3], vdupq_n_f32(sum)), - }; - normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(sub); + const float32x4x4_t sub = { + vsubq_f32(vec_in.val[0], vdupq_n_f32(sum)), + vsubq_f32(vec_in.val[1], vdupq_n_f32(sum)), + vsubq_f32(vec_in.val[2], vdupq_n_f32(sum)), + vsubq_f32(vec_in.val[3], vdupq_n_f32(sum)), + }; + normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(sub); + } + else + { + float32x4x4_t mul = { + vmulq_f32(vec_in.val[0], vdupq_n_f32(sum_inversed)), + vmulq_f32(vec_in.val[1], vdupq_n_f32(sum_inversed)), + vmulq_f32(vec_in.val[2], vdupq_n_f32(sum_inversed)), + vmulq_f32(vec_in.val[3], vdupq_n_f32(sum_inversed)), + }; + + if (is_qasymm8_signed) + { + const auto offset_vec = wrapper::vdup_n(128.f, wrapper::traits::vector_128_tag{}); + mul.val[0] = wrapper::vsub(mul.val[0], offset_vec); + mul.val[1] = wrapper::vsub(mul.val[1], offset_vec); + mul.val[2] = 
wrapper::vsub(mul.val[2], offset_vec); + mul.val[3] = wrapper::vsub(mul.val[3], offset_vec); + } + + normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(mul); + } + wrapper::vstore(out_ptr + x, normalized_value); } - else + /* Run remaining elements */ + for (; x < input_width; ++x) { - float32x4x4_t mul = + if (is_log) { - vmulq_f32(vec_in.val[0], vdupq_n_f32(sum_inversed)), - vmulq_f32(vec_in.val[1], vdupq_n_f32(sum_inversed)), - vmulq_f32(vec_in.val[2], vdupq_n_f32(sum_inversed)), - vmulq_f32(vec_in.val[3], vdupq_n_f32(sum_inversed)), - }; - - if(is_qasymm8_signed) + out_ptr[x] = utils::cast::saturate_cast<T>(tmp_ptr[x] - sum); + } + else { - const auto offset_vec = wrapper::vdup_n(128.f, wrapper::traits::vector_128_tag{}); - mul.val[0] = wrapper::vsub(mul.val[0], offset_vec); - mul.val[1] = wrapper::vsub(mul.val[1], offset_vec); - mul.val[2] = wrapper::vsub(mul.val[2], offset_vec); - mul.val[3] = wrapper::vsub(mul.val[3], offset_vec); + out_ptr[x] = utils::cast::saturate_cast<T>((tmp_ptr[x] * sum_inversed) - + (is_qasymm8_signed ? 128.f : 0)); } - - normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(mul); - } - wrapper::vstore(out_ptr + x, normalized_value); - } - /* Run remaining elements */ - for(; x < input_width; ++x) - { - if(is_log) - { - out_ptr[x] = utils::cast::saturate_cast<T>(tmp_ptr[x] - sum); - } - else - { - out_ptr[x] = utils::cast::saturate_cast<T>((tmp_ptr[x] * sum_inversed) - (is_qasymm8_signed ? 128.f : 0)); } } - } - }, - in_it, max_it, out_it); + }, + in_it, max_it, out_it); } -template void neon_softmax_logits_1d_quantized<qasymm8_signed_t>(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, float beta, bool is_log, const Window &window); -template void neon_softmax_logits_1d_quantized<qasymm8_t>(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, float beta, bool is_log, const Window &window); +template void neon_softmax_logits_1d_quantized<qasymm8_signed_t>(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + float beta, + bool is_log, + const Window &window); +template void neon_softmax_logits_1d_quantized<qasymm8_t>(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + float beta, + bool is_log, + const Window &window); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/neon/impl.h b/src/cpu/kernels/softmax/generic/neon/impl.h index 206d36a2e0..4d9b789297 100644 --- a/src/cpu/kernels/softmax/generic/neon/impl.h +++ b/src/cpu/kernels/softmax/generic/neon/impl.h @@ -25,6 +25,7 @@ #define SRC_CORE_NEON_KERNELS_SOFTMAX_IMPL_H #include "arm_compute/core/Helpers.h" + #include "src/core/NEON/NEMath.h" #include "src/core/NEON/wrapper/wrapper.h" @@ -42,53 +43,65 @@ void neon_logits_1d_max(const ITensor *in, ITensor *out, const Window &window) const auto window_start_x = static_cast<int>(window.x().start()); const auto window_end_x = static_cast<int>(window.x().end()); - Window win{ window }; + Window win{window}; win.set(Window::DimX, Window::Dimension(0, 1, 1)); Iterator input(in, win); Iterator output(out, win); const int sum_stages = log2(window_step_x / 2); - execute_window_loop(win, [&](const Coordinates &) - { - // Get pointers - const auto in_ptr = reinterpret_cast<const T *>(input.ptr()); - const auto out_ptr = reinterpret_cast<T *>(output.ptr()); + execute_window_loop( + win, + [&](const Coordinates &) + { + // Get pointers + const auto in_ptr = reinterpret_cast<const T *>(input.ptr()); + const auto 
out_ptr = reinterpret_cast<T *>(output.ptr()); - // Init max value - auto vec_max = wrapper::vdup_n(support::cpp11::lowest<T>(), ExactTagType{}); - int x = window_start_x; + // Init max value + auto vec_max = wrapper::vdup_n(support::cpp11::lowest<T>(), ExactTagType{}); + int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto current_value = wrapper::vloadq(in_ptr + x); - vec_max = wrapper::vmax(vec_max, current_value); - } - auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max)); + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto current_value = wrapper::vloadq(in_ptr + x); + vec_max = wrapper::vmax(vec_max, current_value); + } + auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max)); - for(int i = 0; i < sum_stages; ++i) - { - carry_max = wrapper::vpmax(carry_max, carry_max); - } - T max_val = wrapper::vgetlane(carry_max, 0); + for (int i = 0; i < sum_stages; ++i) + { + carry_max = wrapper::vpmax(carry_max, carry_max); + } + T max_val = wrapper::vgetlane(carry_max, 0); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - max_val = *(in_ptr + x) > max_val ? *(in_ptr + x) : max_val; - } + // Compute left-over elements + for (; x < window_end_x; ++x) + { + max_val = *(in_ptr + x) > max_val ? *(in_ptr + x) : max_val; + } - *out_ptr = max_val; - }, - input, output); + *out_ptr = max_val; + }, + input, output); } template <typename T> -void neon_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, float beta, bool is_log, const Window &window); +void neon_softmax_logits_1d_quantized(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + float beta, + bool is_log, + const Window &window); template <typename T> -void neon_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window) +void neon_softmax_logits_1d_float(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + const float beta, + bool is_log, + const Window &window) { const int start_x = in->info()->valid_region().anchor.x(); const int input_width = in->info()->valid_region().shape.x(); @@ -103,113 +116,118 @@ void neon_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *c constexpr int vec_size = 16 / sizeof(T); const int sum_stages = log2(vec_size / 2); - execute_window_loop(window, [&](const Coordinates &) - { - /* Get pointers */ - const auto in_ptr = reinterpret_cast<const T *>(in_it.ptr()) + start_x; - const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x; - const auto tmp_ptr = reinterpret_cast<T *>(tmp); - - T sum{}; - T sum_inversed{}; - - /* Compute exponentials and sum */ + execute_window_loop( + window, + [&](const Coordinates &) { - /* Get max value */ - const auto max_val = *reinterpret_cast<const T *>(max_it.ptr()); - const auto vec_max = wrapper::vdup_n(max_val, ExactTagType{}); - - /* Init sum to zero */ - auto vec_sum = wrapper::vdup_n(static_cast<T>(0), ExactTagType{}); + /* Get pointers */ + const auto in_ptr = reinterpret_cast<const T *>(in_it.ptr()) + start_x; + const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x; + const auto tmp_ptr = reinterpret_cast<T *>(tmp); - /* Loop over row and compute exponentials and sum */ - int x = 0; - for(; x <= (input_width - vec_size); x += vec_size) - { - auto vec_elements = wrapper::vloadq(in_ptr 
+ x); - vec_elements = wrapper::vsub(vec_elements, vec_max); - if(is_log) - { - vec_elements = wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast<T>(beta), ExactTagType{})); - vec_sum = wrapper::vadd(vec_sum, wrapper::vexpq(vec_elements)); - } - else - { - vec_elements = wrapper::vexpq(wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast<T>(beta), ExactTagType{}))); - vec_sum = wrapper::vadd(vec_sum, vec_elements); - } - wrapper::vstore(tmp_ptr + x, vec_elements); - } + T sum{}; + T sum_inversed{}; - /* Reduce sum */ - auto sum_res = wrapper::vpadd(wrapper::vgethigh(vec_sum), wrapper::vgetlow(vec_sum)); - for(int i = 0; i < sum_stages; ++i) + /* Compute exponentials and sum */ { - sum_res = wrapper::vpadd(sum_res, sum_res); - } - sum = wrapper::vgetlane(sum_res, 0); + /* Get max value */ + const auto max_val = *reinterpret_cast<const T *>(max_it.ptr()); + const auto vec_max = wrapper::vdup_n(max_val, ExactTagType{}); - /* Run remaining elements */ - for(; x < input_width; ++x) - { - T element{}; + /* Init sum to zero */ + auto vec_sum = wrapper::vdup_n(static_cast<T>(0), ExactTagType{}); - if(is_log) + /* Loop over row and compute exponentials and sum */ + int x = 0; + for (; x <= (input_width - vec_size); x += vec_size) { - element = (in_ptr[x] - max_val) * beta; - sum += std::exp(element); + auto vec_elements = wrapper::vloadq(in_ptr + x); + vec_elements = wrapper::vsub(vec_elements, vec_max); + if (is_log) + { + vec_elements = + wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast<T>(beta), ExactTagType{})); + vec_sum = wrapper::vadd(vec_sum, wrapper::vexpq(vec_elements)); + } + else + { + vec_elements = wrapper::vexpq( + wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast<T>(beta), ExactTagType{}))); + vec_sum = wrapper::vadd(vec_sum, vec_elements); + } + wrapper::vstore(tmp_ptr + x, vec_elements); } - else + + /* Reduce sum */ + auto sum_res = wrapper::vpadd(wrapper::vgethigh(vec_sum), wrapper::vgetlow(vec_sum)); + for (int i = 0; i < sum_stages; ++i) { - element = std::exp((in_ptr[x] - max_val) * beta); - sum += element; + sum_res = wrapper::vpadd(sum_res, sum_res); } - tmp_ptr[x] = element; - } + sum = wrapper::vgetlane(sum_res, 0); - if(!is_log) - { - sum_inversed = T(1) / sum; - } - else - { - sum = static_cast<T>(std::log(sum)); - } - } + /* Run remaining elements */ + for (; x < input_width; ++x) + { + T element{}; + + if (is_log) + { + element = (in_ptr[x] - max_val) * beta; + sum += std::exp(element); + } + else + { + element = std::exp((in_ptr[x] - max_val) * beta); + sum += element; + } + tmp_ptr[x] = element; + } - /* Normalize exponentials */ - { - /* Loop over row and compute softmax */ - int x = 0; - for(; x <= (input_width - vec_size); x += vec_size) - { - auto vec_in = wrapper::vloadq(tmp_ptr + x); - auto normalized_value = wrapper::vdup_n(static_cast<T>(0), ExactTagType{}); - if(is_log) + if (!is_log) { - normalized_value = wrapper::vsub(vec_in, wrapper::vdup_n(static_cast<T>(sum), ExactTagType{})); + sum_inversed = T(1) / sum; } else { - normalized_value = wrapper::vmul(vec_in, wrapper::vdup_n(static_cast<T>(sum_inversed), ExactTagType{})); + sum = static_cast<T>(std::log(sum)); } - wrapper::vstore(out_ptr + x, normalized_value); } - /* Run remaining elements */ - for(; x < input_width; ++x) + + /* Normalize exponentials */ { - if(is_log) + /* Loop over row and compute softmax */ + int x = 0; + for (; x <= (input_width - vec_size); x += vec_size) { - out_ptr[x] = tmp_ptr[x] - sum; + auto vec_in = wrapper::vloadq(tmp_ptr + x); + auto normalized_value = 
wrapper::vdup_n(static_cast<T>(0), ExactTagType{}); + if (is_log) + { + normalized_value = wrapper::vsub(vec_in, wrapper::vdup_n(static_cast<T>(sum), ExactTagType{})); + } + else + { + normalized_value = + wrapper::vmul(vec_in, wrapper::vdup_n(static_cast<T>(sum_inversed), ExactTagType{})); + } + wrapper::vstore(out_ptr + x, normalized_value); } - else + /* Run remaining elements */ + for (; x < input_width; ++x) { - out_ptr[x] = tmp_ptr[x] * sum_inversed; + if (is_log) + { + out_ptr[x] = tmp_ptr[x] - sum; + } + else + { + out_ptr[x] = tmp_ptr[x] * sum_inversed; + } } } - } - }, - in_it, max_it, out_it); + }, + in_it, max_it, out_it); } } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp b/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp index a572891561..40713dc496 100644 --- a/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp +++ b/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp @@ -22,14 +22,20 @@ * SOFTWARE. */ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/softmax/generic/neon/impl.h" namespace arm_compute { namespace cpu { -void neon_qasymm8_softmax(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window) +void neon_qasymm8_softmax(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + const float beta, + bool is_log, + const Window &window) { return neon_softmax_logits_1d_quantized<qasymm8_t>(in, max, tmp, out, beta, is_log, window); } @@ -38,5 +44,5 @@ void neon_qasymm8_logits(const ITensor *in, ITensor *out, const Window &window) { return neon_logits_1d_max<qasymm8_t>(in, out, window); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp index 7d3fe6e046..2c5e284f54 100644 --- a/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp +++ b/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp @@ -22,14 +22,20 @@ * SOFTWARE. 
*/ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/softmax/generic/neon/impl.h" namespace arm_compute { namespace cpu { -void neon_qasymm8_signed_softmax(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window) +void neon_qasymm8_signed_softmax(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + const float beta, + bool is_log, + const Window &window) { return neon_softmax_logits_1d_quantized<qasymm8_signed_t>(in, max, tmp, out, beta, is_log, window); } @@ -38,5 +44,5 @@ void neon_qasymm8_singed_logits(const ITensor *in, ITensor *out, const Window &w { return neon_logits_1d_max<qasymm8_signed_t>(in, out, window); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sve/fp16.cpp b/src/cpu/kernels/softmax/generic/sve/fp16.cpp index 15a523bfc9..5e94f72faf 100644 --- a/src/cpu/kernels/softmax/generic/sve/fp16.cpp +++ b/src/cpu/kernels/softmax/generic/sve/fp16.cpp @@ -23,14 +23,20 @@ */ #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) #include "arm_compute/core/Helpers.h" + #include "src/cpu/CpuTypes.h" #include "src/cpu/kernels/softmax/generic/sve/impl.h" namespace arm_compute { namespace cpu { -void sve_fp16_softmax(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window) +void sve_fp16_softmax(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + const float beta, + bool is_log, + const Window &window) { return sve_softmax_logits_1d_float<float16_t>(in, max, tmp, out, beta, is_log, window); } @@ -39,6 +45,6 @@ void sve_fp16_logits(const ITensor *in, ITensor *out, const Window &window) { return sve_logits_1d_max<float16_t>(in, out, window); } -} +} // namespace cpu } // namespace arm_compute #endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/softmax/generic/sve/fp32.cpp b/src/cpu/kernels/softmax/generic/sve/fp32.cpp index 55c4aee426..d692cc2477 100644 --- a/src/cpu/kernels/softmax/generic/sve/fp32.cpp +++ b/src/cpu/kernels/softmax/generic/sve/fp32.cpp @@ -23,14 +23,20 @@ */ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/softmax/generic/sve/impl.h" namespace arm_compute { namespace cpu { -void sve_fp32_softmax(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window) +void sve_fp32_softmax(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + const float beta, + bool is_log, + const Window &window) { return sve_softmax_logits_1d_float<float>(in, max, tmp, out, beta, is_log, window); } @@ -39,5 +45,5 @@ void sve_fp32_logits(const ITensor *in, ITensor *out, const Window &window) { return sve_logits_1d_max<float>(in, out, window); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sve/impl.cpp b/src/cpu/kernels/softmax/generic/sve/impl.cpp index 2340a31cbd..24f1bb8143 100644 --- a/src/cpu/kernels/softmax/generic/sve/impl.cpp +++ b/src/cpu/kernels/softmax/generic/sve/impl.cpp @@ -23,6 +23,7 @@ */ #include "src/cpu/kernels/softmax/generic/sve/impl.h" + #include "src/core/NEON/wrapper/intrinsics/intrinsics.h" namespace arm_compute @@ -36,42 +37,48 @@ void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window) const auto window_start_x = static_cast<int>(window.x().start()); const auto 
window_end_x = static_cast<int>(window.x().end()); - Window win{ window }; + Window win{window}; win.set(Window::DimX, Window::Dimension(0, 1, 1)); Iterator input(in, win); Iterator output(out, win); - execute_window_loop(win, [&](const Coordinates &) - { - // Get pointers - const auto in_ptr = reinterpret_cast<const ScalarType *>(input.ptr()); - const auto out_ptr = reinterpret_cast<ScalarType *>(output.ptr()); + execute_window_loop( + win, + [&](const Coordinates &) + { + // Get pointers + const auto in_ptr = reinterpret_cast<const ScalarType *>(input.ptr()); + const auto out_ptr = reinterpret_cast<ScalarType *>(output.ptr()); - // Init max value - auto vec_max = wrapper::svdup_n(support::cpp11::lowest<ScalarType>()); + // Init max value + auto vec_max = wrapper::svdup_n(support::cpp11::lowest<ScalarType>()); - int x = window_start_x; - svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x); - do - { - const auto current_value = svld1(pg, in_ptr + x); - vec_max = svmax_m(pg, vec_max, current_value); + int x = window_start_x; + svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x); + do + { + const auto current_value = svld1(pg, in_ptr + x); + vec_max = svmax_m(pg, vec_max, current_value); - x += wrapper::svcnt<ScalarType>(); - pg = wrapper::svwhilelt<ScalarType>(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); + x += wrapper::svcnt<ScalarType>(); + pg = wrapper::svwhilelt<ScalarType>(x, window_end_x); + } while (svptest_any(all_true_pg, pg)); - auto max_val = svmaxv(all_true_pg, vec_max); + auto max_val = svmaxv(all_true_pg, vec_max); - *out_ptr = max_val; - }, - input, output); + *out_ptr = max_val; + }, + input, output); } template <typename ScalarType> -void sve_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window) +void sve_softmax_logits_1d_float(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + const float beta, + bool is_log, + const Window &window) { const int start_x = in->info()->valid_region().anchor.x(); const int input_width = in->info()->valid_region().shape.x(); @@ -82,88 +89,88 @@ void sve_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *co const auto all_true_pg = wrapper::svptrue<ScalarType>(); - execute_window_loop(window, [&](const Coordinates &) - { - /* Get pointers */ - const auto in_ptr = reinterpret_cast<const ScalarType *>(in_it.ptr()) + start_x; - const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr()) + start_x; - const auto tmp_ptr = reinterpret_cast<ScalarType *>(tmp); - - ScalarType sum{ 0 }; - - /* Compute exponentials and sum */ + execute_window_loop( + window, + [&](const Coordinates &) { - /* Get max value */ - const auto max_val = *reinterpret_cast<const ScalarType *>(max_it.ptr()); - const auto vec_max = wrapper::svdup_n(max_val); - const auto vec_beta = wrapper::svdup_n(static_cast<ScalarType>(beta)); + /* Get pointers */ + const auto in_ptr = reinterpret_cast<const ScalarType *>(in_it.ptr()) + start_x; + const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr()) + start_x; + const auto tmp_ptr = reinterpret_cast<ScalarType *>(tmp); - /* Init sum to zero */ - auto vec_sum = wrapper::svdup_n(static_cast<ScalarType>(0)); + ScalarType sum{0}; - /* Loop over row and compute exponentials and sum */ - int x = 0; - svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width); - do + /* Compute exponentials and sum */ { - auto vec_elements = svld1(pg, in_ptr + x); - 
vec_elements = svmul_z(pg, svsub_z(pg, vec_elements, vec_max), vec_beta); - if(!is_log) + /* Get max value */ + const auto max_val = *reinterpret_cast<const ScalarType *>(max_it.ptr()); + const auto vec_max = wrapper::svdup_n(max_val); + const auto vec_beta = wrapper::svdup_n(static_cast<ScalarType>(beta)); + + /* Init sum to zero */ + auto vec_sum = wrapper::svdup_n(static_cast<ScalarType>(0)); + + /* Loop over row and compute exponentials and sum */ + int x = 0; + svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width); + do { - vec_elements = wrapper::svexp_z(pg, vec_elements); - vec_sum = svadd_m(pg, vec_sum, vec_elements); + auto vec_elements = svld1(pg, in_ptr + x); + vec_elements = svmul_z(pg, svsub_z(pg, vec_elements, vec_max), vec_beta); + if (!is_log) + { + vec_elements = wrapper::svexp_z(pg, vec_elements); + vec_sum = svadd_m(pg, vec_sum, vec_elements); + } + svst1(pg, tmp_ptr + x, vec_elements); + + if (is_log) + { + vec_sum = svadd_m(pg, vec_sum, wrapper::svexp_z(pg, vec_elements)); + } + + x += wrapper::svcnt<ScalarType>(); + pg = wrapper::svwhilelt<ScalarType>(x, input_width); + } while (svptest_any(all_true_pg, pg)); + + /* Reduce sum */ + sum = svaddv(all_true_pg, vec_sum); + + if (is_log) + { + sum = static_cast<ScalarType>(std::log(sum)); } - svst1(pg, tmp_ptr + x, vec_elements); - - if(is_log) + else { - vec_sum = svadd_m(pg, vec_sum, wrapper::svexp_z(pg, vec_elements)); + sum = ScalarType(1) / sum; } - - x += wrapper::svcnt<ScalarType>(); - pg = wrapper::svwhilelt<ScalarType>(x, input_width); } - while(svptest_any(all_true_pg, pg)); - /* Reduce sum */ - sum = svaddv(all_true_pg, vec_sum); - - if(is_log) - { - sum = static_cast<ScalarType>(std::log(sum)); - } - else - { - sum = ScalarType(1) / sum; - } - } - - /* Normalize exponentials */ - { - /* Loop over row and compute softmax */ - int x = 0; - svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width); - do + /* Normalize exponentials */ { - auto vec_in = svld1(pg, tmp_ptr + x); - auto normalized_value = wrapper::svdup_n(static_cast<ScalarType>(0)); - if(is_log) - { - normalized_value = svsub_z(pg, vec_in, wrapper::svdup_n(static_cast<ScalarType>(sum))); - } - else + /* Loop over row and compute softmax */ + int x = 0; + svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width); + do { - normalized_value = svmul_z(pg, vec_in, wrapper::svdup_n(static_cast<ScalarType>(sum))); - } - svst1(pg, out_ptr + x, normalized_value); - - x += wrapper::svcnt<ScalarType>(); - pg = wrapper::svwhilelt<ScalarType>(x, input_width); + auto vec_in = svld1(pg, tmp_ptr + x); + auto normalized_value = wrapper::svdup_n(static_cast<ScalarType>(0)); + if (is_log) + { + normalized_value = svsub_z(pg, vec_in, wrapper::svdup_n(static_cast<ScalarType>(sum))); + } + else + { + normalized_value = svmul_z(pg, vec_in, wrapper::svdup_n(static_cast<ScalarType>(sum))); + } + svst1(pg, out_ptr + x, normalized_value); + + x += wrapper::svcnt<ScalarType>(); + pg = wrapper::svwhilelt<ScalarType>(x, input_width); + } while (svptest_any(all_true_pg, pg)); } - while(svptest_any(all_true_pg, pg)); - } - }, - in_it, max_it, out_it); + }, + in_it, max_it, out_it); } template void sve_logits_1d_max<float>(const ITensor *in, ITensor *out, const Window &window); @@ -171,9 +178,19 @@ template void sve_logits_1d_max<float16_t>(const ITensor *in, ITensor *out, cons template void sve_logits_1d_max<qasymm8_t>(const ITensor *in, ITensor *out, const Window &window); template void sve_logits_1d_max<qasymm8_signed_t>(const ITensor *in, ITensor *out, const Window 
&window); -template void sve_softmax_logits_1d_float<float>(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window); -template void sve_softmax_logits_1d_float<float16_t>(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window); +template void sve_softmax_logits_1d_float<float>(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + const float beta, + bool is_log, + const Window &window); +template void sve_softmax_logits_1d_float<float16_t>(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + const float beta, + bool is_log, + const Window &window); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sve/impl.h b/src/cpu/kernels/softmax/generic/sve/impl.h index 4f76ec6a26..89a30d042f 100644 --- a/src/cpu/kernels/softmax/generic/sve/impl.h +++ b/src/cpu/kernels/softmax/generic/sve/impl.h @@ -33,8 +33,13 @@ template <typename ScalarType> void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window); template <typename ScalarType> -void sve_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window); +void sve_softmax_logits_1d_float(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + const float beta, + bool is_log, + const Window &window); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sve/qasymm8.cpp b/src/cpu/kernels/softmax/generic/sve/qasymm8.cpp index e9044d5fc9..85e5ccfea1 100644 --- a/src/cpu/kernels/softmax/generic/sve/qasymm8.cpp +++ b/src/cpu/kernels/softmax/generic/sve/qasymm8.cpp @@ -23,6 +23,7 @@ */ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/softmax/generic/sve/impl.h" namespace arm_compute @@ -33,5 +34,5 @@ void sve_qasymm8_logits(const ITensor *in, ITensor *out, const Window &window) { return sve_logits_1d_max<qasymm8_t>(in, out, window); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp b/src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp index ab45ce598d..4be2e2eed6 100644 --- a/src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp +++ b/src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp @@ -23,6 +23,7 @@ */ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/softmax/generic/sve/impl.h" namespace arm_compute @@ -33,5 +34,5 @@ void sve_qasymm8_signed_logits(const ITensor *in, ITensor *out, const Window &wi { return sve_logits_1d_max<qasymm8_signed_t>(in, out, window); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sve2/impl.cpp b/src/cpu/kernels/softmax/generic/sve2/impl.cpp index 8f677c62d4..98b2f5117f 100644 --- a/src/cpu/kernels/softmax/generic/sve2/impl.cpp +++ b/src/cpu/kernels/softmax/generic/sve2/impl.cpp @@ -23,7 +23,9 @@ */ #include "src/cpu/kernels/softmax/generic/sve2/impl.h" + #include "arm_compute/core/Types.h" + #include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute @@ -31,8 +33,8 @@ namespace arm_compute namespace cpu { template <typename ScalarType> -void sve2_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, float beta, bool is_log, const Window &window) +void sve2_softmax_logits_1d_quantized( + const ITensor *in, const ITensor *max, void 
*const tmp, ITensor *out, float beta, bool is_log, const Window &window) { const int start_x = in->info()->valid_region().anchor.x(); const int input_width = in->info()->valid_region().shape.x(); @@ -50,162 +52,173 @@ void sve2_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, voi const int inc_2 = static_cast<int>(2 * svcntw()); const int inc_3 = static_cast<int>(3 * svcntw()); - execute_window_loop(window, [&](const Coordinates &) - { - /* Get pointers */ - const auto in_ptr = reinterpret_cast<const ScalarType *>(in_it.ptr()) + start_x; - const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr()) + start_x; - const auto tmp_ptr = reinterpret_cast<float *>(tmp); + execute_window_loop( + window, + [&](const Coordinates &) + { + /* Get pointers */ + const auto in_ptr = reinterpret_cast<const ScalarType *>(in_it.ptr()) + start_x; + const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr()) + start_x; + const auto tmp_ptr = reinterpret_cast<float *>(tmp); - float sum{}; + float sum{}; - /* Compute exponentials and sum */ - { - /* Get max value */ - const auto max_val = *reinterpret_cast<const ScalarType *>(max_it.ptr()); - const auto vec_max = wrapper::svdup_n(max_val); - - /* Init sum to zero */ - auto vec_sum_0 = svdup_n_f32(0.f); - auto vec_sum_1 = svdup_n_f32(0.f); - auto vec_sum_2 = svdup_n_f32(0.f); - auto vec_sum_3 = svdup_n_f32(0.f); - - /* Loop over row and compute exponentials and sum */ - int x = 0; - svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width); - svbool_t pg_0 = svunpklo(svunpklo(pg)); - svbool_t pg_1 = svunpkhi(svunpklo(pg)); - svbool_t pg_2 = svunpklo(svunpkhi(pg)); - svbool_t pg_3 = svunpkhi(svunpkhi(pg)); - do + /* Compute exponentials and sum */ { - const auto vec_elements = svld1(pg, in_ptr + x); - const auto vec_elements_sub = svreinterpret_u8(svsub_z(pg, vec_max, vec_elements)); + /* Get max value */ + const auto max_val = *reinterpret_cast<const ScalarType *>(max_it.ptr()); + const auto vec_max = wrapper::svdup_n(max_val); + + /* Init sum to zero */ + auto vec_sum_0 = svdup_n_f32(0.f); + auto vec_sum_1 = svdup_n_f32(0.f); + auto vec_sum_2 = svdup_n_f32(0.f); + auto vec_sum_3 = svdup_n_f32(0.f); + + /* Loop over row and compute exponentials and sum */ + int x = 0; + svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width); + svbool_t pg_0 = svunpklo(svunpklo(pg)); + svbool_t pg_1 = svunpkhi(svunpklo(pg)); + svbool_t pg_2 = svunpklo(svunpkhi(pg)); + svbool_t pg_3 = svunpkhi(svunpkhi(pg)); + do + { + const auto vec_elements = svld1(pg, in_ptr + x); + const auto vec_elements_sub = svreinterpret_u8(svsub_z(pg, vec_max, vec_elements)); + + auto vec_elements_flt_0 = svcvt_f32_z(pg_0, svunpklo(svunpklo(vec_elements_sub))); + auto vec_elements_flt_1 = svcvt_f32_z(pg_1, svunpkhi(svunpklo(vec_elements_sub))); + auto vec_elements_flt_2 = svcvt_f32_z(pg_2, svunpklo(svunpkhi(vec_elements_sub))); + auto vec_elements_flt_3 = svcvt_f32_z(pg_3, svunpkhi(svunpkhi(vec_elements_sub))); - auto vec_elements_flt_0 = svcvt_f32_z(pg_0, svunpklo(svunpklo(vec_elements_sub))); - auto vec_elements_flt_1 = svcvt_f32_z(pg_1, svunpkhi(svunpklo(vec_elements_sub))); - auto vec_elements_flt_2 = svcvt_f32_z(pg_2, svunpklo(svunpkhi(vec_elements_sub))); - auto vec_elements_flt_3 = svcvt_f32_z(pg_3, svunpkhi(svunpkhi(vec_elements_sub))); + if (is_log) + { + vec_elements_flt_0 = svmul_f32_z(pg_0, vec_elements_flt_0, scale_beta_vec); + vec_elements_flt_1 = svmul_f32_z(pg_1, vec_elements_flt_1, scale_beta_vec); + vec_elements_flt_2 = svmul_f32_z(pg_2, 
vec_elements_flt_2, scale_beta_vec); + vec_elements_flt_3 = svmul_f32_z(pg_3, vec_elements_flt_3, scale_beta_vec); + vec_sum_0 = svadd_f32_m(pg_0, vec_sum_0, svexp_f32_z(pg_0, vec_elements_flt_0)); + vec_sum_1 = svadd_f32_m(pg_1, vec_sum_1, svexp_f32_z(pg_1, vec_elements_flt_1)); + vec_sum_2 = svadd_f32_m(pg_2, vec_sum_2, svexp_f32_z(pg_2, vec_elements_flt_2)); + vec_sum_3 = svadd_f32_m(pg_3, vec_sum_3, svexp_f32_z(pg_3, vec_elements_flt_3)); + } + else + { + vec_elements_flt_0 = svexp_f32_z(pg_0, svmul_f32_z(pg_0, vec_elements_flt_0, scale_beta_vec)); + vec_elements_flt_1 = svexp_f32_z(pg_1, svmul_f32_z(pg_1, vec_elements_flt_1, scale_beta_vec)); + vec_elements_flt_2 = svexp_f32_z(pg_2, svmul_f32_z(pg_2, vec_elements_flt_2, scale_beta_vec)); + vec_elements_flt_3 = svexp_f32_z(pg_3, svmul_f32_z(pg_3, vec_elements_flt_3, scale_beta_vec)); + vec_sum_0 = svadd_f32_m(pg_0, vec_sum_0, vec_elements_flt_0); + vec_sum_1 = svadd_f32_m(pg_1, vec_sum_1, vec_elements_flt_1); + vec_sum_2 = svadd_f32_m(pg_2, vec_sum_2, vec_elements_flt_2); + vec_sum_3 = svadd_f32_m(pg_3, vec_sum_3, vec_elements_flt_3); + } - if(is_log) + svst1_f32(pg_0, tmp_ptr + x, vec_elements_flt_0); + svst1_f32(pg_1, tmp_ptr + x + inc_1, vec_elements_flt_1); + svst1_f32(pg_2, tmp_ptr + x + inc_2, vec_elements_flt_2); + svst1_f32(pg_3, tmp_ptr + x + inc_3, vec_elements_flt_3); + + x += wrapper::svcnt<ScalarType>(); + pg = wrapper::svwhilelt<ScalarType>(x, input_width); + pg_0 = svunpklo(svunpklo(pg)); + pg_1 = svunpkhi(svunpklo(pg)); + pg_2 = svunpklo(svunpkhi(pg)); + pg_3 = svunpkhi(svunpkhi(pg)); + } while (svptest_any(all_true_pg, pg)); + + /* Reduce sum */ + const auto vec_sum = svadd_f32_z(all_true_pg, svadd_f32_z(all_true_pg, vec_sum_0, vec_sum_1), + svadd_f32_z(all_true_pg, vec_sum_2, vec_sum_3)); + sum = svaddv_f32(all_true_pg, vec_sum); + + /* Run remaining elements */ + x = 0; + if (is_log) { - vec_elements_flt_0 = svmul_f32_z(pg_0, vec_elements_flt_0, scale_beta_vec); - vec_elements_flt_1 = svmul_f32_z(pg_1, vec_elements_flt_1, scale_beta_vec); - vec_elements_flt_2 = svmul_f32_z(pg_2, vec_elements_flt_2, scale_beta_vec); - vec_elements_flt_3 = svmul_f32_z(pg_3, vec_elements_flt_3, scale_beta_vec); - vec_sum_0 = svadd_f32_m(pg_0, vec_sum_0, svexp_f32_z(pg_0, vec_elements_flt_0)); - vec_sum_1 = svadd_f32_m(pg_1, vec_sum_1, svexp_f32_z(pg_1, vec_elements_flt_1)); - vec_sum_2 = svadd_f32_m(pg_2, vec_sum_2, svexp_f32_z(pg_2, vec_elements_flt_2)); - vec_sum_3 = svadd_f32_m(pg_3, vec_sum_3, svexp_f32_z(pg_3, vec_elements_flt_3)); + sum = std::log(sum); } else { - vec_elements_flt_0 = svexp_f32_z(pg_0, svmul_f32_z(pg_0, vec_elements_flt_0, scale_beta_vec)); - vec_elements_flt_1 = svexp_f32_z(pg_1, svmul_f32_z(pg_1, vec_elements_flt_1, scale_beta_vec)); - vec_elements_flt_2 = svexp_f32_z(pg_2, svmul_f32_z(pg_2, vec_elements_flt_2, scale_beta_vec)); - vec_elements_flt_3 = svexp_f32_z(pg_3, svmul_f32_z(pg_3, vec_elements_flt_3, scale_beta_vec)); - vec_sum_0 = svadd_f32_m(pg_0, vec_sum_0, vec_elements_flt_0); - vec_sum_1 = svadd_f32_m(pg_1, vec_sum_1, vec_elements_flt_1); - vec_sum_2 = svadd_f32_m(pg_2, vec_sum_2, vec_elements_flt_2); - vec_sum_3 = svadd_f32_m(pg_3, vec_sum_3, vec_elements_flt_3); + sum = 256.f / sum; } - - svst1_f32(pg_0, tmp_ptr + x, vec_elements_flt_0); - svst1_f32(pg_1, tmp_ptr + x + inc_1, vec_elements_flt_1); - svst1_f32(pg_2, tmp_ptr + x + inc_2, vec_elements_flt_2); - svst1_f32(pg_3, tmp_ptr + x + inc_3, vec_elements_flt_3); - - x += wrapper::svcnt<ScalarType>(); - pg = wrapper::svwhilelt<ScalarType>(x, 
input_width); - pg_0 = svunpklo(svunpklo(pg)); - pg_1 = svunpkhi(svunpklo(pg)); - pg_2 = svunpklo(svunpkhi(pg)); - pg_3 = svunpkhi(svunpkhi(pg)); } - while(svptest_any(all_true_pg, pg)); - /* Reduce sum */ - const auto vec_sum = svadd_f32_z(all_true_pg, svadd_f32_z(all_true_pg, vec_sum_0, vec_sum_1), svadd_f32_z(all_true_pg, vec_sum_2, vec_sum_3)); - sum = svaddv_f32(all_true_pg, vec_sum); - - /* Run remaining elements */ - x = 0; - if(is_log) - { - sum = std::log(sum); - } - else + /* Normalize exponentials */ { - sum = 256.f / sum; - } - } - - /* Normalize exponentials */ - { - constexpr bool is_qasymm8_signed = std::is_same<ScalarType, qasymm8_signed_t>::value; - /* Loop over row and compute softmax */ - int x = 0; - svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width); - svbool_t pg_0 = svunpklo(svunpklo(pg)); - svbool_t pg_1 = svunpkhi(svunpklo(pg)); - svbool_t pg_2 = svunpklo(svunpkhi(pg)); - svbool_t pg_3 = svunpkhi(svunpkhi(pg)); - do - { - auto vec_in_0 = svld1_f32(pg_0, tmp_ptr + x); - auto vec_in_1 = svld1_f32(pg_1, tmp_ptr + x + inc_1); - auto vec_in_2 = svld1_f32(pg_2, tmp_ptr + x + inc_2); - auto vec_in_3 = svld1_f32(pg_3, tmp_ptr + x + inc_3); - - svfloat32_t res_0{}; - svfloat32_t res_1{}; - svfloat32_t res_2{}; - svfloat32_t res_3{}; - - if(is_log) + constexpr bool is_qasymm8_signed = std::is_same<ScalarType, qasymm8_signed_t>::value; + /* Loop over row and compute softmax */ + int x = 0; + svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width); + svbool_t pg_0 = svunpklo(svunpklo(pg)); + svbool_t pg_1 = svunpkhi(svunpklo(pg)); + svbool_t pg_2 = svunpklo(svunpkhi(pg)); + svbool_t pg_3 = svunpkhi(svunpkhi(pg)); + do { - res_0 = svsub_f32_z(pg_0, vec_in_0, svdup_n_f32(sum)); - res_1 = svsub_f32_z(pg_1, vec_in_1, svdup_n_f32(sum)); - res_2 = svsub_f32_z(pg_2, vec_in_2, svdup_n_f32(sum)); - res_3 = svsub_f32_z(pg_3, vec_in_3, svdup_n_f32(sum)); - } - else - { - res_0 = svmul_f32_z(pg_0, vec_in_0, svdup_n_f32(sum)); - res_1 = svmul_f32_z(pg_1, vec_in_1, svdup_n_f32(sum)); - res_2 = svmul_f32_z(pg_2, vec_in_2, svdup_n_f32(sum)); - res_3 = svmul_f32_z(pg_3, vec_in_3, svdup_n_f32(sum)); + auto vec_in_0 = svld1_f32(pg_0, tmp_ptr + x); + auto vec_in_1 = svld1_f32(pg_1, tmp_ptr + x + inc_1); + auto vec_in_2 = svld1_f32(pg_2, tmp_ptr + x + inc_2); + auto vec_in_3 = svld1_f32(pg_3, tmp_ptr + x + inc_3); + + svfloat32_t res_0{}; + svfloat32_t res_1{}; + svfloat32_t res_2{}; + svfloat32_t res_3{}; - if(is_qasymm8_signed) + if (is_log) { - const auto offset_vec = svdup_n_f32(128.f); - res_0 = svsub_z(pg_0, res_0, offset_vec); - res_1 = svsub_z(pg_1, res_1, offset_vec); - res_2 = svsub_z(pg_2, res_2, offset_vec); - res_3 = svsub_z(pg_3, res_3, offset_vec); + res_0 = svsub_f32_z(pg_0, vec_in_0, svdup_n_f32(sum)); + res_1 = svsub_f32_z(pg_1, vec_in_1, svdup_n_f32(sum)); + res_2 = svsub_f32_z(pg_2, vec_in_2, svdup_n_f32(sum)); + res_3 = svsub_f32_z(pg_3, vec_in_3, svdup_n_f32(sum)); + } + else + { + res_0 = svmul_f32_z(pg_0, vec_in_0, svdup_n_f32(sum)); + res_1 = svmul_f32_z(pg_1, vec_in_1, svdup_n_f32(sum)); + res_2 = svmul_f32_z(pg_2, vec_in_2, svdup_n_f32(sum)); + res_3 = svmul_f32_z(pg_3, vec_in_3, svdup_n_f32(sum)); + + if (is_qasymm8_signed) + { + const auto offset_vec = svdup_n_f32(128.f); + res_0 = svsub_z(pg_0, res_0, offset_vec); + res_1 = svsub_z(pg_1, res_1, offset_vec); + res_2 = svsub_z(pg_2, res_2, offset_vec); + res_3 = svsub_z(pg_3, res_3, offset_vec); + } } - } - // Store value - const auto out = convert_float_to_int<SVEType>(res_0, res_1, res_2, res_3); - 
svst1(pg, out_ptr + x, out); - x += wrapper::svcnt<ScalarType>(); - pg = wrapper::svwhilelt<ScalarType>(x, input_width); - pg_0 = svunpklo(svunpklo(pg)); - pg_1 = svunpkhi(svunpklo(pg)); - pg_2 = svunpklo(svunpkhi(pg)); - pg_3 = svunpkhi(svunpkhi(pg)); + // Store value + const auto out = convert_float_to_int<SVEType>(res_0, res_1, res_2, res_3); + svst1(pg, out_ptr + x, out); + x += wrapper::svcnt<ScalarType>(); + pg = wrapper::svwhilelt<ScalarType>(x, input_width); + pg_0 = svunpklo(svunpklo(pg)); + pg_1 = svunpkhi(svunpklo(pg)); + pg_2 = svunpklo(svunpkhi(pg)); + pg_3 = svunpkhi(svunpkhi(pg)); + } while (svptest_any(all_true_pg, pg)); } - while(svptest_any(all_true_pg, pg)); - } - }, - in_it, max_it, out_it); + }, + in_it, max_it, out_it); } -template void sve2_softmax_logits_1d_quantized<qasymm8_signed_t>(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, float beta, bool is_log, const Window &window); -template void sve2_softmax_logits_1d_quantized<qasymm8_t>(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, float beta, bool is_log, const Window &window); +template void sve2_softmax_logits_1d_quantized<qasymm8_signed_t>(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + float beta, + bool is_log, + const Window &window); +template void sve2_softmax_logits_1d_quantized<qasymm8_t>(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + float beta, + bool is_log, + const Window &window); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sve2/impl.h b/src/cpu/kernels/softmax/generic/sve2/impl.h index abbcc15181..33fcc26cda 100644 --- a/src/cpu/kernels/softmax/generic/sve2/impl.h +++ b/src/cpu/kernels/softmax/generic/sve2/impl.h @@ -31,8 +31,13 @@ namespace arm_compute namespace cpu { template <typename ScalarType> -void sve2_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, float beta, bool is_log, const Window &window); +void sve2_softmax_logits_1d_quantized(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + float beta, + bool is_log, + const Window &window); } // namespace cpu } // namespace arm_compute #endif /* SRC_CORE_SVE2_KERNELS_SOFTMAX_IMPL_H */ diff --git a/src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp b/src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp index 810035eb9c..95623786b3 100644 --- a/src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp +++ b/src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp @@ -23,16 +23,22 @@ */ #include "arm_compute/core/Helpers.h" + #include "src/cpu/kernels/softmax/generic/sve2/impl.h" namespace arm_compute { namespace cpu { -void sve2_qasymm8_softmax(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window) +void sve2_qasymm8_softmax(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + const float beta, + bool is_log, + const Window &window) { return sve2_softmax_logits_1d_quantized<qasymm8_t>(in, max, tmp, out, beta, is_log, window); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp b/src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp index 283b55e9ce..c20462fcef 100644 --- a/src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp +++ b/src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp @@ -23,16 +23,22 @@ */ #include "arm_compute/core/Helpers.h" + #include 
"src/cpu/kernels/softmax/generic/sve2/impl.h" namespace arm_compute { namespace cpu { -void sve2_qasymm8_signed_softmax(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window) +void sve2_qasymm8_signed_softmax(const ITensor *in, + const ITensor *max, + void *const tmp, + ITensor *out, + const float beta, + bool is_log, + const Window &window) { return sve2_softmax_logits_1d_quantized<qasymm8_signed_t>(in, max, tmp, out, beta, is_log, window); } -} +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/list.h b/src/cpu/kernels/softmax/list.h index ed3515f417..627ce0c264 100644 --- a/src/cpu/kernels/softmax/list.h +++ b/src/cpu/kernels/softmax/list.h @@ -28,9 +28,9 @@ namespace arm_compute { namespace cpu { -#define DECLARE_SOFTMAX_KERNEL(func_name) \ - void func_name(const ITensor *in, const ITensor *max, void *const tmp, \ - ITensor *out, const float beta, bool is_log, const Window &window) +#define DECLARE_SOFTMAX_KERNEL(func_name) \ + void func_name(const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, const float beta, \ + bool is_log, const Window &window) DECLARE_SOFTMAX_KERNEL(neon_fp32_softmax); DECLARE_SOFTMAX_KERNEL(neon_fp16_softmax); @@ -43,8 +43,7 @@ DECLARE_SOFTMAX_KERNEL(sve2_qasymm8_softmax); #undef DECLARE_SOFTMAX_KERNEL -#define DECLARE_LOGITS_KERNEL(func_name) \ - void func_name(const ITensor *in, ITensor *out, const Window &window) +#define DECLARE_LOGITS_KERNEL(func_name) void func_name(const ITensor *in, ITensor *out, const Window &window) DECLARE_LOGITS_KERNEL(neon_fp32_logits); DECLARE_LOGITS_KERNEL(neon_fp16_logits); diff --git a/src/cpu/kernels/sub/neon/list.h b/src/cpu/kernels/sub/neon/list.h index f7e1a040bd..9f6c92271f 100644 --- a/src/cpu/kernels/sub/neon/list.h +++ b/src/cpu/kernels/sub/neon/list.h @@ -26,14 +26,16 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/Traits.h" + #include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute { namespace cpu { -#define DECLARE_SUB_KERNEL(func_name) \ - void func_name(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +#define DECLARE_SUB_KERNEL(func_name) \ + void func_name(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, \ + const Window &window) DECLARE_SUB_KERNEL(sub_qasymm8_neon_fixedpoint); DECLARE_SUB_KERNEL(sub_qasymm8_signed_neon_fixedpoint); @@ -44,7 +46,8 @@ DECLARE_SUB_KERNEL(sub_qsymm16_neon); #undef DECLARE_SUB_KERNEL template <typename T> -void sub_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void sub_same_neon( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { /** SIMD vector tag type. */ using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>; @@ -68,7 +71,7 @@ void sub_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const Iterator input2(src1, window.broadcast_if_dimension_le_one(src1->info()->tensor_shape())); Iterator output(dst, window); - if(is_broadcast_across_x) + if (is_broadcast_across_x) { const bool is_broadcast_input_2 = input2_win.x().step() == 0; Window broadcast_win = is_broadcast_input_2 ? 
input2_win : input1_win; @@ -84,41 +87,44 @@ void sub_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const Iterator output(dst, win); execute_window_loop( - win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast<const T *>(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast<T *>(output.ptr()); + win, + [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast<const T *>(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast<T *>(output.ptr()); - const T broadcast_value = *reinterpret_cast<const T *>(broadcast_input.ptr()); - const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{}); + const T broadcast_value = *reinterpret_cast<const T *>(broadcast_input.ptr()); + const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{}); - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x); - auto res = is_sat ? wrapper::vqsub(broadcast_value_vec, non_broadcast_v) : wrapper::vsub(broadcast_value_vec, non_broadcast_v); - if(is_broadcast_input_2) + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - res = wrapper::vmul(res, wrapper::vdup_n(static_cast<T>(-1), ExactTagType{})); + const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x); + auto res = is_sat ? wrapper::vqsub(broadcast_value_vec, non_broadcast_v) + : wrapper::vsub(broadcast_value_vec, non_broadcast_v); + if (is_broadcast_input_2) + { + res = wrapper::vmul(res, wrapper::vdup_n(static_cast<T>(-1), ExactTagType{})); + } + wrapper::vstore(output_ptr + x, res); } - wrapper::vstore(output_ptr + x, res); - } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto non_broadcast_v = *(non_broadcast_input_ptr + x); - auto res = is_sat ? wrapper::sub_sat(broadcast_value, non_broadcast_v) : broadcast_value - non_broadcast_v; - if(is_broadcast_input_2) + // Compute left-over elements + for (; x < window_end_x; ++x) { - res = static_cast<T>(-1) * res; + const auto non_broadcast_v = *(non_broadcast_input_ptr + x); + auto res = + is_sat ? wrapper::sub_sat(broadcast_value, non_broadcast_v) : broadcast_value - non_broadcast_v; + if (is_broadcast_input_2) + { + res = static_cast<T>(-1) * res; + } + + *(output_ptr + x) = res; } - - *(output_ptr + x) = res; - } - }, - broadcast_input, non_broadcast_input, output); + }, + broadcast_input, non_broadcast_input, output); } else { @@ -131,31 +137,32 @@ void sub_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const Iterator output(dst, win); execute_window_loop( - win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast<const T *>(input1.ptr()); - const auto input2_ptr = reinterpret_cast<const T *>(input2.ptr()); - const auto output_ptr = reinterpret_cast<T *>(output.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto val1 = wrapper::vloadq(input1_ptr + x); - const auto val2 = wrapper::vloadq(input2_ptr + x); - const auto res = is_sat ? 
wrapper::vqsub(val1, val2) : wrapper::vsub(val1, val2); - wrapper::vstore(output_ptr + x, res); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) + win, + [&](const Coordinates &) { - const auto val1 = *(input1_ptr + x); - const auto val2 = *(input2_ptr + x); - *(output_ptr + x) = is_sat ? wrapper::sub_sat(val1, val2) : val1 - val2; - } - }, - input1, input2, output); + const auto input1_ptr = reinterpret_cast<const T *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const T *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<T *>(output.ptr()); + + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto val1 = wrapper::vloadq(input1_ptr + x); + const auto val2 = wrapper::vloadq(input2_ptr + x); + const auto res = is_sat ? wrapper::vqsub(val1, val2) : wrapper::vsub(val1, val2); + wrapper::vstore(output_ptr + x, res); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto val1 = *(input1_ptr + x); + const auto val2 = *(input2_ptr + x); + *(output_ptr + x) = is_sat ? wrapper::sub_sat(val1, val2) : val1 - val2; + } + }, + input1, input2, output); } } } // namespace cpu diff --git a/src/cpu/kernels/sub/neon/qasymm8.cpp b/src/cpu/kernels/sub/neon/qasymm8.cpp index ea6e5826dd..b750afce6e 100644 --- a/src/cpu/kernels/sub/neon/qasymm8.cpp +++ b/src/cpu/kernels/sub/neon/qasymm8.cpp @@ -23,21 +23,24 @@ */ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" + #include "src/cpu/kernels/add/generic/neon/impl.h" namespace arm_compute { namespace cpu { -void sub_qasymm8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void sub_qasymm8_neon_fixedpoint( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { add_sub_q8_neon_fixedpoint<uint8_t>(src0, src1, dst, policy, window, false /*is_addition*/); } -void sub_qasymm8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void sub_qasymm8_neon( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { add_sub_qasymm8_neon(src0, src1, dst, policy, window, false /*is_addition*/); } } // namespace cpu -} // namespace arm_compute
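
The QASYMM8 wrappers above route into the shared add/sub kernels with is_addition set to false. As a rough per-element reference for what those paths compute, here is a minimal scalar sketch of the dequantize/subtract/requantize round trip for asymmetric 8-bit data; the struct and helper names below are illustrative only and not part of the ACL API, and the real kernels vectorize this (the fixed-point variant also avoids the float round trip).

#include <algorithm>
#include <cmath>
#include <cstdint>

// Asymmetric 8-bit quantization: real = scale * (q - offset).
struct UQuantInfo
{
    float   scale;
    int32_t offset;
};

// Scalar sketch of one element of src0 - src1 in QASYMM8.
inline uint8_t sub_qasymm8_ref(uint8_t a, uint8_t b, UQuantInfo qa, UQuantInfo qb, UQuantInfo qo)
{
    const float a_real = qa.scale * (static_cast<int32_t>(a) - qa.offset);
    const float b_real = qb.scale * (static_cast<int32_t>(b) - qb.offset);
    // Requantize the real-valued difference: round to nearest, add the output offset, saturate to [0, 255].
    const int32_t q = static_cast<int32_t>(std::lround((a_real - b_real) / qo.scale)) + qo.offset;
    return static_cast<uint8_t>(std::min<int32_t>(255, std::max<int32_t>(0, q)));
}
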
\ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/kernels/sub/neon/qasymm8_signed.cpp b/src/cpu/kernels/sub/neon/qasymm8_signed.cpp index a86c7f22f6..fb0bb62682 100644 --- a/src/cpu/kernels/sub/neon/qasymm8_signed.cpp +++ b/src/cpu/kernels/sub/neon/qasymm8_signed.cpp @@ -24,21 +24,24 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" + #include "src/cpu/kernels/add/generic/neon/impl.h" namespace arm_compute { namespace cpu { -void sub_qasymm8_signed_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void sub_qasymm8_signed_neon_fixedpoint( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { add_sub_q8_neon_fixedpoint<int8_t>(src0, src1, dst, policy, window, false /*is_addition*/); } -void sub_qasymm8_signed_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void sub_qasymm8_signed_neon( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { add_sub_qasymm8_signed_neon(src0, src1, dst, policy, window, false /*is_addition*/); } } // namespace cpu -} // namespace arm_compute
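
The signed variant directly above only changes the element type; both funnel into the sub_same_neon template shown in the list.h hunk earlier, whose broadcast path always forms (broadcast - x) and then flips the sign when the broadcast operand is src1. A scalar sketch of that ordering trick follows, with saturation and the window/iterator machinery omitted; the function name and parameters are illustrative.

#include <cstddef>

// Scalar view of the broadcast-across-X path: compute (broadcast - x) and
// negate afterwards if the broadcast operand was src1, so the result is
// always src0 - src1.
template <typename T>
void sub_broadcast_ref(const T *non_broadcast, T broadcast_value, T *dst, std::size_t n, bool is_broadcast_input_2)
{
    for (std::size_t i = 0; i < n; ++i)
    {
        T res = static_cast<T>(broadcast_value - non_broadcast[i]);
        if (is_broadcast_input_2)
        {
            res = static_cast<T>(-1) * res; // restore src0 - src1 ordering
        }
        dst[i] = res;
    }
}
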
\ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/kernels/sub/neon/qsymm16.cpp b/src/cpu/kernels/sub/neon/qsymm16.cpp index 4dfdc0e78c..23e4b03843 100644 --- a/src/cpu/kernels/sub/neon/qsymm16.cpp +++ b/src/cpu/kernels/sub/neon/qsymm16.cpp @@ -25,14 +25,16 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" + #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" namespace arm_compute { namespace cpu { -void sub_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +void sub_qsymm16_neon( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) { ARM_COMPUTE_UNUSED(policy); @@ -57,7 +59,7 @@ void sub_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, co const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale); const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale); - if(is_broadcast_across_x) + if (is_broadcast_across_x) { const bool is_broadcast_input_2 = input2_win.x().step() == 0; Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; @@ -65,7 +67,7 @@ void sub_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, co const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0; const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0; const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform(); - const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); + const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); // Clear X Dimension on execution window as we handle manually non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); @@ -74,61 +76,62 @@ void sub_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, co Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); Iterator output(dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast<const int16_t *>(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr()); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast<const int16_t *>(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr()); - const int16_t broadcast_value = *reinterpret_cast<const int16_t *>(broadcast_input.ptr()); - const int16x8_t broadcast_value_vec = vdupq_n_s16(broadcast_value); + const int16_t broadcast_value = *reinterpret_cast<const int16_t *>(broadcast_input.ptr()); + const int16x8_t broadcast_value_vec = vdupq_n_s16(broadcast_value); - const float32x4x2_t bf = - { - { - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(broadcast_value_vec))), vscale2), - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(broadcast_value_vec))), vscale2), - } - }; - const float bfs = static_cast<int32_t>(broadcast_value) * broadcast_qinfo.scale; + const float32x4x2_t bf = {{ + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(broadcast_value_vec))), vscale2), + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(broadcast_value_vec))), vscale2), + }}; + const float bfs = 
static_cast<int32_t>(broadcast_value) * broadcast_qinfo.scale; - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const int16x8_t a = vld1q_s16(non_broadcast_input_ptr + x); - const float32x4x2_t af = + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { + const int16x8_t a = vld1q_s16(non_broadcast_input_ptr + x); + const float32x4x2_t af = {{ vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1), vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1), - } - }; + }}; - const int32x4x4_t rf = - { - { + const int32x4x4_t rf = {{ #ifdef __aarch64__ - vcvtnq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), - vcvtnq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), + vcvtnq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) + : vsubq_f32(af.val[0], bf.val[0]), + invvscaleo)), + vcvtnq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) + : vsubq_f32(af.val[1], bf.val[1]), + invvscaleo)), #else //__aarch64__ - vcvtq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), - vcvtq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), + vcvtq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) + : vsubq_f32(af.val[0], bf.val[0]), + invvscaleo)), + vcvtq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) + : vsubq_f32(af.val[1], bf.val[1]), + invvscaleo)), #endif //__aarch64__ - } - }; + }}; - const int16x8_t pa = vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])); - vst1q_s16(output_ptr + x, pa); - } + const int16x8_t pa = vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])); + vst1q_s16(output_ptr + x, pa); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const float afs = static_cast<int32_t>(*(non_broadcast_input_ptr + x)) * non_broadcast_qinfo.scale; - *(output_ptr + x) = quantize_qsymm16(is_broadcast_input_2 ? (bfs - afs) : (afs - bfs), oq_info); - } - }, - broadcast_input, non_broadcast_input, output); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const float afs = static_cast<int32_t>(*(non_broadcast_input_ptr + x)) * non_broadcast_qinfo.scale; + *(output_ptr + x) = quantize_qsymm16(is_broadcast_input_2 ? 
(bfs - afs) : (afs - bfs), oq_info); + } + }, + broadcast_input, non_broadcast_input, output); } else { @@ -140,38 +143,32 @@ void sub_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, co Iterator input2(src1, input2_win); Iterator output(dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr()); - const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr()); - const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &) { - const int16x8_t a = vld1q_s16(input1_ptr + x); - const int16x8_t b = vld1q_s16(input2_ptr + x); + const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr()); - const float32x4x2_t af = + // Compute S elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - { + const int16x8_t a = vld1q_s16(input1_ptr + x); + const int16x8_t b = vld1q_s16(input2_ptr + x); + + const float32x4x2_t af = {{ vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1), vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1), - } - }; + }}; - const float32x4x2_t bf = - { - { + const float32x4x2_t bf = {{ vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(b))), vscale2), vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(b))), vscale2), - } - }; + }}; - const int32x4x2_t rf = - { - { + const int32x4x2_t rf = {{ #ifdef __aarch64__ vcvtnq_s32_f32(vmulq_f32(vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), vcvtnq_s32_f32(vmulq_f32(vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), @@ -179,23 +176,22 @@ void sub_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, co vcvtq_s32_f32(vmulq_f32(vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), vcvtq_s32_f32(vmulq_f32(vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), #endif //__aarch64__ - } - }; + }}; - const int16x8_t pa = vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])); - vst1q_s16(output_ptr + x, pa); - } + const int16x8_t pa = vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])); + vst1q_s16(output_ptr + x, pa); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const float afs = static_cast<int32_t>((*(input1_ptr + x))) * iq1_info.scale; - const float bfs = static_cast<int32_t>((*(input2_ptr + x))) * iq2_info.scale; - *(output_ptr + x) = quantize_qsymm16((afs - bfs), dst->info()->quantization_info()); - } - }, - input1, input2, output); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const float afs = static_cast<int32_t>((*(input1_ptr + x))) * iq1_info.scale; + const float bfs = static_cast<int32_t>((*(input2_ptr + x))) * iq2_info.scale; + *(output_ptr + x) = quantize_qsymm16((afs - bfs), dst->info()->quantization_info()); + } + }, + input1, input2, output); } } } // namespace cpu -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/operators/CpuActivation.cpp b/src/cpu/operators/CpuActivation.cpp index 197e9850b9..44d70cf503 100644 --- a/src/cpu/operators/CpuActivation.cpp +++ b/src/cpu/operators/CpuActivation.cpp @@ -24,6 +24,7 @@ #include "src/cpu/operators/CpuActivation.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/IOperator.h" #include "src/common/utils/LegacySupport.h" #include "src/common/utils/Log.h" @@ -42,7 +43,8 @@ void CpuActivation::configure(const ITensorInfo *input, ITensorInfo *output, con _kernel = std::move(k); } -Status CpuActivation::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &activation_info) +Status +CpuActivation::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &activation_info) { return kernels::CpuActivationKernel::validate(input, output, activation_info); } @@ -54,13 +56,17 @@ void CpuActivation::run(ITensorPack &tensors) NEScheduler::get().schedule_op(_kernel.get(), split_dimension, _kernel->window(), tensors); } -std::tuple<IOperator *, StatusCode> CpuContext::create_activation(const AclTensorDescriptor &src, const AclTensorDescriptor &dst, const AclActivationDescriptor &act, bool is_validate) +std::tuple<IOperator *, StatusCode> CpuContext::create_activation(const AclTensorDescriptor &src, + const AclTensorDescriptor &dst, + const AclActivationDescriptor &act, + bool is_validate) { TensorInfo src_info = detail::convert_to_legacy_tensor_info(src); TensorInfo dst_info = detail::convert_to_legacy_tensor_info(dst); auto info = detail::convert_to_activation_info(act); - if(is_validate && !bool(CpuActivation::validate(&src_info.set_is_resizable(false), &dst_info.set_is_resizable(false), info))) + if (is_validate && + !bool(CpuActivation::validate(&src_info.set_is_resizable(false), &dst_info.set_is_resizable(false), info))) { return std::make_tuple(nullptr, StatusCode::UnsupportedConfig); } @@ -69,7 +75,7 @@ std::tuple<IOperator *, StatusCode> CpuContext::create_activation(const AclTenso act_op->configure(&src_info, &dst_info, info); auto op = new arm_compute::IOperator(static_cast<IContext *>(this)); - if(op == nullptr) + if (op == nullptr) { ARM_COMPUTE_LOG_ERROR_ACL("Couldn't allocate internal resources"); return std::make_tuple(nullptr, StatusCode::OutOfMemory); diff --git a/src/cpu/operators/CpuActivation.h b/src/cpu/operators/CpuActivation.h index e21fc7d32c..ec442f92c8 100644 --- a/src/cpu/operators/CpuActivation.h +++ b/src/cpu/operators/CpuActivation.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_ACTIVATION_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/cpu/ICpuOperator.h" namespace arm_compute diff --git a/src/cpu/operators/CpuAdd.cpp b/src/cpu/operators/CpuAdd.cpp index 41def8e22f..53cd7fa1b7 100644 --- a/src/cpu/operators/CpuAdd.cpp +++ b/src/cpu/operators/CpuAdd.cpp @@ -23,17 +23,20 @@ */ #include "src/cpu/operators/CpuAdd.h" -#include "src/cpu/kernels/CpuAddKernel.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" #include "src/common/utils/Log.h" - -#include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/cpu/kernels/CpuAddKernel.h" namespace arm_compute { namespace cpu { -void CpuAdd::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info) +void CpuAdd::configure(const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst, + ConvertPolicy policy, + const 
ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(act_info); ARM_COMPUTE_LOG_PARAMS(src0, src1, dst, policy, act_info); @@ -42,7 +45,11 @@ void CpuAdd::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensor _kernel = std::move(k); } -Status CpuAdd::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info) +Status CpuAdd::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); return kernels::CpuAddKernel::validate(src0, src1, dst, policy); diff --git a/src/cpu/operators/CpuAdd.h b/src/cpu/operators/CpuAdd.h index db05c100cc..5f60102de2 100644 --- a/src/cpu/operators/CpuAdd.h +++ b/src/cpu/operators/CpuAdd.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_ADD_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/cpu/ICpuOperator.h" namespace arm_compute @@ -55,14 +56,22 @@ public: * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. * */ - void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref CpuAdd::configure() * * @return a status */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run(ITensorPack &tensors) override; diff --git a/src/cpu/operators/CpuAddMulAdd.cpp b/src/cpu/operators/CpuAddMulAdd.cpp index 590ee482ca..2f19f2f842 100644 --- a/src/cpu/operators/CpuAddMulAdd.cpp +++ b/src/cpu/operators/CpuAddMulAdd.cpp @@ -21,39 +21,49 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ +#include "src/cpu/operators/CpuAddMulAdd.h" + #include "arm_compute/core/experimental/Types.h" #include "arm_compute/runtime/NEON/NEScheduler.h" #include "src/common/utils/Log.h" #include "src/core/helpers/MemoryHelpers.h" #include "src/cpu/kernels/CpuAddMulAddKernel.h" -#include "src/cpu/operators/CpuAddMulAdd.h" #include "src/cpu/utils/CpuAuxTensorHandler.h" namespace arm_compute { namespace cpu { -void CpuAddMulAdd::configure(const ITensorInfo *input1, const ITensorInfo *input2, - const ITensorInfo *bn_mul, const ITensorInfo *bn_add, - ITensorInfo *add_output, ITensorInfo *final_output, - ConvertPolicy policy, const ActivationLayerInfo &act_info) +void CpuAddMulAdd::configure(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *bn_mul, + const ITensorInfo *bn_add, + ITensorInfo *add_output, + ITensorInfo *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_LOG_PARAMS(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info); auto k = std::make_unique<kernels::CpuAddMulAddKernel>(); const DataType data_type = input1->data_type(); - if(is_data_type_quantized(data_type)) + if (is_data_type_quantized(data_type)) { _dequantize_bn_mul.configure(bn_mul, &_dequantized_bn_mul); _dequantize_bn_add.configure(bn_add, &_dequantized_bn_add); - k->configure(input1, input2, &_dequantized_bn_mul, &_dequantized_bn_add, add_output, final_output, policy, act_info); + k->configure(input1, input2, &_dequantized_bn_mul, &_dequantized_bn_add, add_output, final_output, policy, + act_info); // Save auxilary memory requirements after configuration - _aux_mem[DequantizedBnMul] = experimental::MemoryInfo(offset_int_vec(DequantizedBnMul), experimental::MemoryLifetime::Temporary, _dequantized_bn_mul.total_size()); - _aux_mem[DequantizedBnAdd] = experimental::MemoryInfo(offset_int_vec(DequantizedBnAdd), experimental::MemoryLifetime::Temporary, _dequantized_bn_add.total_size()); + _aux_mem[DequantizedBnMul] = + experimental::MemoryInfo(offset_int_vec(DequantizedBnMul), experimental::MemoryLifetime::Temporary, + _dequantized_bn_mul.total_size()); + _aux_mem[DequantizedBnAdd] = + experimental::MemoryInfo(offset_int_vec(DequantizedBnAdd), experimental::MemoryLifetime::Temporary, + _dequantized_bn_add.total_size()); } else { @@ -63,13 +73,17 @@ void CpuAddMulAdd::configure(const ITensorInfo *input1, const ITensorInfo *input _kernel = std::move(k); } -Status CpuAddMulAdd::validate(const ITensorInfo *input1, const ITensorInfo *input2, - const ITensorInfo *bn_mul, const ITensorInfo *bn_add, - const ITensorInfo *add_output, const ITensorInfo *final_output, - ConvertPolicy policy, const ActivationLayerInfo &act_info) +Status CpuAddMulAdd::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *bn_mul, + const ITensorInfo *bn_add, + const ITensorInfo *add_output, + const ITensorInfo *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { const DataType data_type = input1->data_type(); - if(is_data_type_quantized(data_type)) + if (is_data_type_quantized(data_type)) { TensorInfo dequantized_bn_mul = bn_mul->clone()->set_data_type(DataType::F32); TensorInfo dequantized_bn_add = bn_add->clone()->set_data_type(DataType::F32); @@ -77,11 +91,13 @@ Status CpuAddMulAdd::validate(const ITensorInfo *input1, const ITensorInfo *inpu ARM_COMPUTE_RETURN_ON_ERROR(CpuDequantize::validate(bn_mul, &dequantized_bn_mul)); ARM_COMPUTE_RETURN_ON_ERROR(CpuDequantize::validate(bn_add, &dequantized_bn_add)); - return 
kernels::CpuAddMulAddKernel::validate(input1, input2, &dequantized_bn_mul, &dequantized_bn_add, add_output, final_output, policy, act_info); + return kernels::CpuAddMulAddKernel::validate(input1, input2, &dequantized_bn_mul, &dequantized_bn_add, + add_output, final_output, policy, act_info); } else { - return kernels::CpuAddMulAddKernel::validate(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info); + return kernels::CpuAddMulAddKernel::validate(input1, input2, bn_mul, bn_add, add_output, final_output, policy, + act_info); } } @@ -89,37 +105,32 @@ void CpuAddMulAdd::run(ITensorPack &tensors) { const DataType data_type = tensors.get_const_tensor(TensorType::ACL_SRC_0)->info()->data_type(); - if(is_data_type_quantized(data_type)) + if (is_data_type_quantized(data_type)) { const ITensor *bn_mul = tensors.get_const_tensor(TensorType::ACL_SRC_2); const ITensor *bn_add = tensors.get_const_tensor(TensorType::ACL_SRC_3); - CpuAuxTensorHandler dequantized_bn_mul_handler(offset_int_vec(DequantizedBnMul), _dequantized_bn_mul, tensors, true); - CpuAuxTensorHandler dequantized_bn_add_handler(offset_int_vec(DequantizedBnAdd), _dequantized_bn_add, tensors, true); + CpuAuxTensorHandler dequantized_bn_mul_handler(offset_int_vec(DequantizedBnMul), _dequantized_bn_mul, tensors, + true); + CpuAuxTensorHandler dequantized_bn_add_handler(offset_int_vec(DequantizedBnAdd), _dequantized_bn_add, tensors, + true); - ITensorPack dequantize_mul_pack = - { - { TensorType::ACL_SRC_0, bn_mul }, - { TensorType::ACL_DST_0, dequantized_bn_mul_handler.get() } - }; + ITensorPack dequantize_mul_pack = {{TensorType::ACL_SRC_0, bn_mul}, + {TensorType::ACL_DST_0, dequantized_bn_mul_handler.get()}}; - ITensorPack dequantize_add_pack = - { - { TensorType::ACL_SRC_0, bn_add }, - { TensorType::ACL_DST_0, dequantized_bn_add_handler.get() } - }; + ITensorPack dequantize_add_pack = {{TensorType::ACL_SRC_0, bn_add}, + {TensorType::ACL_DST_0, dequantized_bn_add_handler.get()}}; _dequantize_bn_mul.run(dequantize_mul_pack); _dequantize_bn_add.run(dequantize_add_pack); - ITensorPack add_mul_add_pack = - { - { TensorType::ACL_SRC_0, tensors.get_const_tensor(TensorType::ACL_SRC_0) }, - { TensorType::ACL_SRC_1, tensors.get_const_tensor(TensorType::ACL_SRC_1) }, - { TensorType::ACL_SRC_2, dequantized_bn_mul_handler.get() }, - { TensorType::ACL_SRC_3, dequantized_bn_add_handler.get() }, - { TensorType::ACL_DST_0, tensors.get_tensor(TensorType::ACL_DST_0) }, - { TensorType::ACL_DST_1, tensors.get_tensor(TensorType::ACL_DST_1) }, + ITensorPack add_mul_add_pack = { + {TensorType::ACL_SRC_0, tensors.get_const_tensor(TensorType::ACL_SRC_0)}, + {TensorType::ACL_SRC_1, tensors.get_const_tensor(TensorType::ACL_SRC_1)}, + {TensorType::ACL_SRC_2, dequantized_bn_mul_handler.get()}, + {TensorType::ACL_SRC_3, dequantized_bn_add_handler.get()}, + {TensorType::ACL_DST_0, tensors.get_tensor(TensorType::ACL_DST_0)}, + {TensorType::ACL_DST_1, tensors.get_tensor(TensorType::ACL_DST_1)}, }; NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), add_mul_add_pack); diff --git a/src/cpu/operators/CpuAddMulAdd.h b/src/cpu/operators/CpuAddMulAdd.h index cf1ece68f1..47db75c37e 100644 --- a/src/cpu/operators/CpuAddMulAdd.h +++ b/src/cpu/operators/CpuAddMulAdd.h @@ -42,20 +42,28 @@ public: * Similar to @ref NEAddMulAdd::configure() * */ - void configure(const ITensorInfo *input1, const ITensorInfo *input2, - const ITensorInfo *bn_mul, const ITensorInfo *bn_add, - ITensorInfo *add_output, ITensorInfo *final_output, - 
ConvertPolicy policy, const ActivationLayerInfo &act_info); + void configure(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *bn_mul, + const ITensorInfo *bn_add, + ITensorInfo *add_output, + ITensorInfo *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref CpuAddMulAdd::configure() * * @return a status */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, - const ITensorInfo *bn_mul, const ITensorInfo *bn_add, - const ITensorInfo *add_output, const ITensorInfo *final_output, - ConvertPolicy policy, const ActivationLayerInfo &act_info); + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *bn_mul, + const ITensorInfo *bn_add, + const ITensorInfo *add_output, + const ITensorInfo *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info); // Inherited methods overridden: void run(ITensorPack &tensors) override; @@ -77,7 +85,7 @@ private: TensorInfo _dequantized_bn_mul{}; TensorInfo _dequantized_bn_add{}; - experimental::MemoryRequirements _aux_mem{ Count }; + experimental::MemoryRequirements _aux_mem{Count}; }; } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/operators/CpuCast.cpp b/src/cpu/operators/CpuCast.cpp index 1cfd8c1d0e..55b9204d71 100644 --- a/src/cpu/operators/CpuCast.cpp +++ b/src/cpu/operators/CpuCast.cpp @@ -23,9 +23,8 @@ */ #include "src/cpu/operators/CpuCast.h" -#include "src/cpu/kernels/CpuCastKernel.h" - #include "src/common/utils/Log.h" +#include "src/cpu/kernels/CpuCastKernel.h" namespace arm_compute { diff --git a/src/cpu/operators/CpuConcatenate.cpp b/src/cpu/operators/CpuConcatenate.cpp index 4021fd8ded..5f517a8fcb 100644 --- a/src/cpu/operators/CpuConcatenate.cpp +++ b/src/cpu/operators/CpuConcatenate.cpp @@ -23,21 +23,20 @@ */ #include "src/cpu/operators/CpuConcatenate.h" -#include "src/cpu/kernels/CpuConcatenateBatchKernel.h" -#include "src/cpu/kernels/CpuConcatenateDepthKernel.h" -#include "src/cpu/kernels/CpuConcatenateHeightKernel.h" -#include "src/cpu/kernels/CpuConcatenateWidthKernel.h" - -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" - #include "arm_compute/core/Error.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" +#include "src/cpu/kernels/CpuConcatenateBatchKernel.h" +#include "src/cpu/kernels/CpuConcatenateDepthKernel.h" +#include "src/cpu/kernels/CpuConcatenateHeightKernel.h" +#include "src/cpu/kernels/CpuConcatenateWidthKernel.h" namespace arm_compute { @@ -59,9 +58,9 @@ void CpuConcatenate::configure(const std::vector<const ITensorInfo *> &srcs_vect unsigned int offset = 0; - for(unsigned int i = 0; i < _num_srcs; ++i) + for (unsigned int i = 0; i < _num_srcs; ++i) { - switch(axis) + switch (axis) { case Window::DimX: { @@ -98,16 +97,17 @@ void CpuConcatenate::configure(const std::vector<const ITensorInfo *> &srcs_vect } } -Status CpuConcatenate::validate(const std::vector<const ITensorInfo *> &srcs_vector, const ITensorInfo *dst, size_t axis) +Status +CpuConcatenate::validate(const std::vector<const ITensorInfo *> &srcs_vector, const 
ITensorInfo *dst, size_t axis) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst); ARM_COMPUTE_RETURN_ERROR_ON(srcs_vector.size() < 2); unsigned int offset = 0; - for(const auto &src : srcs_vector) + for (const auto &src : srcs_vector) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); - switch(axis) + switch (axis) { case Window::DimX: { @@ -135,7 +135,7 @@ Status CpuConcatenate::validate(const std::vector<const ITensorInfo *> &srcs_vec offset += src->dimension(axis); } - if(dst->total_size() != 0) + if (dst->total_size() != 0) { TensorShape dst_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(srcs_vector, axis); ARM_COMPUTE_RETURN_ERROR_ON(dst_shape.total_size() != dst->tensor_shape().total_size()); @@ -146,18 +146,18 @@ Status CpuConcatenate::validate(const std::vector<const ITensorInfo *> &srcs_vec void CpuConcatenate::run(ITensorPack &tensors) { - if(tensors.empty()) + if (tensors.empty()) { ARM_COMPUTE_ERROR("No inputs provided"); } - if(static_cast<int>(tensors.size() - 1) != static_cast<int>(_num_srcs)) + if (static_cast<int>(tensors.size() - 1) != static_cast<int>(_num_srcs)) { ARM_COMPUTE_ERROR("Configured with different number of inputs"); } int i = 0; - for(auto &k : _concat_kernels) + for (auto &k : _concat_kernels) { ITensorPack pack; pack.add_tensor(TensorType::ACL_SRC, tensors.get_const_tensor(ACL_SRC_VEC + i)); diff --git a/src/cpu/operators/CpuConcatenate.h b/src/cpu/operators/CpuConcatenate.h index eb11926b48..c36977c70f 100644 --- a/src/cpu/operators/CpuConcatenate.h +++ b/src/cpu/operators/CpuConcatenate.h @@ -68,8 +68,8 @@ public: private: std::vector<std::unique_ptr<ICPPKernel>> _concat_kernels{}; - unsigned int _num_srcs{ 0 }; - unsigned int _axis{ 0 }; + unsigned int _num_srcs{0}; + unsigned int _axis{0}; }; } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/operators/CpuConv2d.cpp b/src/cpu/operators/CpuConv2d.cpp index 16ac16b3ba..19311733db 100644 --- a/src/cpu/operators/CpuConv2d.cpp +++ b/src/cpu/operators/CpuConv2d.cpp @@ -22,8 +22,10 @@ * SOFTWARE. 
*/ #include "src/cpu/operators/CpuConv2d.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" + #include "arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/cpu/operators/CpuDirectConv2d.h" #include "src/cpu/operators/CpuGemm.h" @@ -35,26 +37,35 @@ namespace arm_compute { namespace cpu { -CpuConv2d::CpuConv2d() - : _function() +CpuConv2d::CpuConv2d() : _function() { } CpuConv2d::~CpuConv2d() = default; -void CpuConv2d::configure(ITensorInfo *input, ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info, - const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups) +void CpuConv2d::configure(ITensorInfo *input, + ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_UNUSED(num_groups); - ARM_COMPUTE_ERROR_THROW_ON(CpuConv2d::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info, - enable_fast_math, num_groups)); + ARM_COMPUTE_ERROR_THROW_ON(CpuConv2d::validate(input, weights, biases, output, conv_info, weights_info, dilation, + act_info, enable_fast_math, num_groups)); - ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups); + ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info, + enable_fast_math, num_groups); const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups); - switch(CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, enable_fast_math)) + switch (CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, + enable_fast_math)) { case ConvolutionMethod::WINOGRAD: { @@ -92,19 +103,30 @@ void CpuConv2d::configure(ITensorInfo *input, ITensorInfo *weights, const ITenso _aux_mem = _function->workspace(); } -Status CpuConv2d::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups) +Status CpuConv2d::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) { ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1), "Grouping (num_groups != 1) is not supported on Neon"); const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups); - switch(CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, enable_fast_math)) + switch (CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, + enable_fast_math)) { case ConvolutionMethod::WINOGRAD: - ARM_COMPUTE_RETURN_ON_ERROR(CpuWinogradConv2d::validate(input, weights, 
biases, output, conv_info, act_info, enable_fast_math)); + ARM_COMPUTE_RETURN_ON_ERROR( + CpuWinogradConv2d::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math)); break; case ConvolutionMethod::GEMM: - ARM_COMPUTE_RETURN_ON_ERROR(CpuGemmConv2d::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math)); + ARM_COMPUTE_RETURN_ON_ERROR(CpuGemmConv2d::validate(input, weights, biases, output, conv_info, weights_info, + dilation, act_info, enable_fast_math)); break; case ConvolutionMethod::GEMM_CONV2D: ARM_COMPUTE_RETURN_ON_ERROR(CpuGemmDirectConv2d::validate(input, weights, biases, output, info)); @@ -120,9 +142,14 @@ Status CpuConv2d::validate(const ITensorInfo *input, const ITensorInfo *weights, return Status{}; } -ConvolutionMethod CpuConv2d::get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights, - const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math) +ConvolutionMethod CpuConv2d::get_convolution_method(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, weights); ARM_COMPUTE_UNUSED(weights_info); @@ -137,35 +164,46 @@ ConvolutionMethod CpuConv2d::get_convolution_method(const ITensorInfo *input, co using ConvolutionConfiguration = std::tuple<Size2D, Size2D, Size2D, PadStrideInfo>; using ConfigurationMethod = std::pair<ConvolutionConfiguration, ConvolutionMethod>; - const std::vector<ConfigurationMethod> known_configs = - { + const std::vector<ConfigurationMethod> known_configs = { // Alexnet - ConfigurationMethod(ConvolutionConfiguration(Size2D(27U, 27U), Size2D(5U, 5U), Size2D(48U, 128U), PadStrideInfo(1U, 1U, 2U, 2U)), ConvolutionMethod::GEMM), + ConfigurationMethod(ConvolutionConfiguration(Size2D(27U, 27U), Size2D(5U, 5U), Size2D(48U, 128U), + PadStrideInfo(1U, 1U, 2U, 2U)), + ConvolutionMethod::GEMM), // VGG16 / VGG19 - ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 64U), PadStrideInfo(1U, 1U, 1U, 1U)), ConvolutionMethod::GEMM), + ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 64U), + PadStrideInfo(1U, 1U, 1U, 1U)), + ConvolutionMethod::GEMM), // Mobilenet 224 - ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR)), ConvolutionMethod::GEMM), + ConfigurationMethod( + ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), + PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR)), + ConvolutionMethod::GEMM), // Mobilenet 160 - ConfigurationMethod(ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR)), ConvolutionMethod::GEMM) - }; + ConfigurationMethod( + ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), + PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR)), + ConvolutionMethod::GEMM)}; const auto find_config = [&](ConfigurationMethod c) { const ConvolutionConfiguration config = c.first; const PadStrideInfo info = std::get<3>(config); - return std::get<0>(config) == 
Size2D(input->dimension(idx_w), input->dimension(idx_h)) && std::get<1>(config) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h)) - && std::get<2>(config) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right() - && info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && info.stride() == conv_info.stride(); + return std::get<0>(config) == Size2D(input->dimension(idx_w), input->dimension(idx_h)) && + std::get<1>(config) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h)) && + std::get<2>(config) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && + info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right() && + info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && + info.stride() == conv_info.stride(); }; std::vector<ConfigurationMethod>::const_iterator found; - if((found = std::find_if(known_configs.begin(), known_configs.end(), find_config)) != known_configs.end()) + if ((found = std::find_if(known_configs.begin(), known_configs.end(), find_config)) != known_configs.end()) { return (*found).second; } - if(dilation != Size2D(1U, 1U)) + if (dilation != Size2D(1U, 1U)) { return ConvolutionMethod::GEMM; } @@ -173,43 +211,49 @@ ConvolutionMethod CpuConv2d::get_convolution_method(const ITensorInfo *input, co { // SRGAN // Output might not be initialized when it is an internal tensor of the layer using the convolution - if(input->total_size() > 1e7 && (weights->dimension(idx_h) > 7) - && (CpuDirectConv2d::validate(input, weights, nullptr, output, conv_info, act_info))) + if (input->total_size() > 1e7 && (weights->dimension(idx_h) > 7) && + (CpuDirectConv2d::validate(input, weights, nullptr, output, conv_info, act_info))) { return ConvolutionMethod::DIRECT; } - if(input->dimension(idx_c) < 16) + if (input->dimension(idx_c) < 16) { return ConvolutionMethod::GEMM; } #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC // This heuristics only applies to F16 data type on A55r1 - if(NEScheduler::get().cpu_info().get_cpu_model() == CPUModel::A55r1 && enable_fast_math && input->data_type() == DataType::F16) + if (NEScheduler::get().cpu_info().get_cpu_model() == CPUModel::A55r1 && enable_fast_math && + input->data_type() == DataType::F16) { // Exclude known bad winograd configs (and defaults to GEMM) - const std::vector<ConvolutionConfiguration> known_bad_winograd_f16_with_fastmath_configs = - { + const std::vector<ConvolutionConfiguration> known_bad_winograd_f16_with_fastmath_configs = { // Squeezenet_V1_1 fire2 and fire3 - ConvolutionConfiguration(Size2D(56U, 56U), Size2D(3U, 3U), Size2D(16U, 64U), PadStrideInfo(1U, 1U, 1U, 1U)), + ConvolutionConfiguration(Size2D(56U, 56U), Size2D(3U, 3U), Size2D(16U, 64U), + PadStrideInfo(1U, 1U, 1U, 1U)), // Squeezenet_V1_1 fire6 and fire7 - ConvolutionConfiguration(Size2D(14U, 14U), Size2D(3U, 3U), Size2D(48U, 192U), PadStrideInfo(1U, 1U, 1U, 1U)), + ConvolutionConfiguration(Size2D(14U, 14U), Size2D(3U, 3U), Size2D(48U, 192U), + PadStrideInfo(1U, 1U, 1U, 1U)), // Squeezenet_V1_1 fire8 and fire9 - ConvolutionConfiguration(Size2D(14U, 14U), Size2D(3U, 3U), Size2D(64U, 256U), PadStrideInfo(1U, 1U, 1U, 1U)), + ConvolutionConfiguration(Size2D(14U, 14U), Size2D(3U, 3U), Size2D(64U, 256U), + PadStrideInfo(1U, 1U, 1U, 1U)), }; const auto find_conv_config = [&](ConvolutionConfiguration c) { const PadStrideInfo info = std::get<3>(c); - return std::get<0>(c) == 
Size2D(input->dimension(idx_w), input->dimension(idx_h)) && std::get<1>(c) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h)) - && std::get<2>(c) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right() - && info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && info.stride() == conv_info.stride(); + return std::get<0>(c) == Size2D(input->dimension(idx_w), input->dimension(idx_h)) && + std::get<1>(c) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h)) && + std::get<2>(c) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && + info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right() && + info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && + info.stride() == conv_info.stride(); }; - bool found_bad = std::find_if(known_bad_winograd_f16_with_fastmath_configs.begin(), known_bad_winograd_f16_with_fastmath_configs.end(), - find_conv_config) - != known_bad_winograd_f16_with_fastmath_configs.end(); - if(found_bad) + bool found_bad = std::find_if(known_bad_winograd_f16_with_fastmath_configs.begin(), + known_bad_winograd_f16_with_fastmath_configs.end(), + find_conv_config) != known_bad_winograd_f16_with_fastmath_configs.end(); + if (found_bad) { return ConvolutionMethod::GEMM; } @@ -217,16 +261,16 @@ ConvolutionMethod CpuConv2d::get_convolution_method(const ITensorInfo *input, co #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC // For 1x1 convolutions run the default GEMM - if(weights->dimension(idx_w) == 1 && weights->dimension(idx_h) == 1) + if (weights->dimension(idx_w) == 1 && weights->dimension(idx_h) == 1) { return ConvolutionMethod::GEMM; } - if(bool(CpuWinogradConv2d::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math))) + if (bool(CpuWinogradConv2d::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math))) { return ConvolutionMethod::WINOGRAD; } - if(bool(CpuGemmDirectConv2d::validate(input, weights, nullptr, output, info))) + if (bool(CpuGemmDirectConv2d::validate(input, weights, nullptr, output, info))) { return ConvolutionMethod::GEMM_CONV2D; } diff --git a/src/cpu/operators/CpuConv2d.h b/src/cpu/operators/CpuConv2d.h index 0908ac0cbb..71b9e15dc1 100644 --- a/src/cpu/operators/CpuConv2d.h +++ b/src/cpu/operators/CpuConv2d.h @@ -22,6 +22,7 @@ * SOFTWARE. */ #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuOperator.h" @@ -102,17 +103,32 @@ public: * available which may introduce a drop of accuracy as well. Default is false * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. 
num_groups != 1 is not supported */ - void configure(ITensorInfo *src, ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo(), - const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false, unsigned int num_groups = 1); + void configure(ITensorInfo *src, + ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info = WeightsInfo(), + const Size2D &dilation = Size2D(1U, 1U), + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false, + unsigned int num_groups = 1); /** Static function to check if given info will lead to a valid configuration of @ref CpuConv2d * * Similar to CpuConv2d::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false, - unsigned int num_groups = 1); + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info = WeightsInfo(), + const Size2D &dilation = Size2D(1U, 1U), + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false, + unsigned int num_groups = 1); /** Static function to check if given info will return the convolution called by @ref CpuConv2d * * @param[in] src Source tensor info. 
3 lower dimensions represent a single input [width, height, IFM], @@ -132,11 +148,17 @@ public: * * @return the Convolution Method Hint */ - static ConvolutionMethod get_convolution_method(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false); + static ConvolutionMethod get_convolution_method(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info = WeightsInfo(), + const Size2D &dilation = Size2D(1U, 1U), + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false); // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &constants) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &constants) override; experimental::MemoryRequirements workspace() const override; private: diff --git a/src/cpu/operators/CpuConvertFullyConnectedWeights.cpp b/src/cpu/operators/CpuConvertFullyConnectedWeights.cpp index 810ffb1e4e..49e31926e3 100644 --- a/src/cpu/operators/CpuConvertFullyConnectedWeights.cpp +++ b/src/cpu/operators/CpuConvertFullyConnectedWeights.cpp @@ -24,6 +24,7 @@ #include "src/cpu/operators/CpuConvertFullyConnectedWeights.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h" @@ -31,7 +32,10 @@ namespace arm_compute { namespace cpu { -void CpuConvertFullyConnectedWeights::configure(const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout) +void CpuConvertFullyConnectedWeights::configure(const ITensorInfo *src, + ITensorInfo *dst, + const TensorShape &original_src_shape, + DataLayout data_layout) { ARM_COMPUTE_LOG_PARAMS(src, dst, original_src_shape, data_layout); auto k = std::make_unique<kernels::CpuConvertFullyConnectedWeightsKernel>(); @@ -39,7 +43,10 @@ void CpuConvertFullyConnectedWeights::configure(const ITensorInfo *src, ITensorI _kernel = std::move(k); } -Status CpuConvertFullyConnectedWeights::validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout) +Status CpuConvertFullyConnectedWeights::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const TensorShape &original_src_shape, + DataLayout data_layout) { return kernels::CpuConvertFullyConnectedWeightsKernel::validate(src, dst, original_src_shape, data_layout); } @@ -48,5 +55,5 @@ void CpuConvertFullyConnectedWeights::run(ITensorPack &tensors) { NEScheduler::get().schedule_op(_kernel.get(), Window::DimZ, _kernel->window(), tensors); } -} // namesapce cpu +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/operators/CpuConvertFullyConnectedWeights.h b/src/cpu/operators/CpuConvertFullyConnectedWeights.h index ea70eee134..e208cca3a0 100644 --- a/src/cpu/operators/CpuConvertFullyConnectedWeights.h +++ b/src/cpu/operators/CpuConvertFullyConnectedWeights.h @@ -41,14 +41,18 @@ public: * @param[in] original_src_shape Shape of the original src tensor (the one entering fully connected layer). * @param[in] data_layout The data layout the weights have been trained in. 
*/ - void configure(const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout); + void + configure(const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref CpuConvertFullyConnectedWeights::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout); + static Status validate(const ITensorInfo *src, + const ITensorInfo *dst, + const TensorShape &original_src_shape, + DataLayout data_layout); // Inherited methods overridden: void run(ITensorPack &tensors) override; }; diff --git a/src/cpu/operators/CpuCopy.cpp b/src/cpu/operators/CpuCopy.cpp index 7420ff6240..92c19d4df2 100644 --- a/src/cpu/operators/CpuCopy.cpp +++ b/src/cpu/operators/CpuCopy.cpp @@ -23,9 +23,8 @@ */ #include "src/cpu/operators/CpuCopy.h" -#include "src/cpu/kernels/CpuCopyKernel.h" - #include "src/common/utils/Log.h" +#include "src/cpu/kernels/CpuCopyKernel.h" namespace arm_compute { diff --git a/src/cpu/operators/CpuDepthwiseConv2d.cpp b/src/cpu/operators/CpuDepthwiseConv2d.cpp index 884fe5c4ed..54075f2afa 100644 --- a/src/cpu/operators/CpuDepthwiseConv2d.cpp +++ b/src/cpu/operators/CpuDepthwiseConv2d.cpp @@ -24,10 +24,11 @@ #include "src/cpu/operators/CpuDepthwiseConv2d.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/InfoHelpers.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h" @@ -37,11 +38,16 @@ namespace cpu { namespace { -Status validate_arguments_optimized(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info) +Status validate_arguments_optimized(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const ConvolutionInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - if(!is_data_type_quantized_per_channel(weights->data_type())) + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); + if (!is_data_type_quantized_per_channel(weights->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); } @@ -49,14 +55,17 @@ Status validate_arguments_optimized(const ITensorInfo *src, const ITensorInfo *w ARM_COMPUTE_RETURN_ERROR_ON(info.dilation.x() < 1 || info.dilation.y() < 1); const size_t idx_w = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH); const size_t idx_h = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::HEIGHT); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (info.dilation.x() - 1) > src->dimension(idx_w) + info.pad_stride_info.pad_left() + - info.pad_stride_info.pad_right()); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (info.dilation.y() - 1) > src->dimension(idx_h) + info.pad_stride_info.pad_top() + - 
info.pad_stride_info.pad_bottom()); - - if(biases != nullptr) + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (info.dilation.x() - 1) > + src->dimension(idx_w) + info.pad_stride_info.pad_left() + + info.pad_stride_info.pad_right()); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (info.dilation.y() - 1) > + src->dimension(idx_h) + info.pad_stride_info.pad_top() + + info.pad_stride_info.pad_bottom()); + + if (biases != nullptr) { - const unsigned int channel_idx = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL); + const unsigned int channel_idx = + get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(channel_idx)); } @@ -64,7 +73,7 @@ Status validate_arguments_optimized(const ITensorInfo *src, const ITensorInfo *w ARM_COMPUTE_RETURN_ON_ERROR(CpuDepthwiseConv2dAssemblyDispatch::validate(src, weights, biases, dst, info)); // Validate Activation Layer - if(info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info)) + if (info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info)) { ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, info.act_info)); } @@ -80,8 +89,8 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::configure(ITensorI { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, (biases == nullptr) ? nullptr : biases, - dst, info)); + ARM_COMPUTE_ERROR_THROW_ON( + CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, (biases == nullptr) ? 
nullptr : biases, dst, info)); _is_quantized = is_data_type_quantized_asymmetric(src->data_type()); _has_bias = biases != nullptr; @@ -91,10 +100,11 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::configure(ITensorI _are_weights_const = weights->are_values_constant(); // Configure pipeline - _is_activationlayer_enabled = info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info); + _is_activationlayer_enabled = + info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info); _dwc_optimized_func = std::make_unique<CpuDepthwiseConv2dAssemblyDispatch>(); - if(_is_nchw) + if (_is_nchw) { _permute_input = std::make_unique<cpu::CpuPermute>(); _permute_weights = std::make_unique<cpu::CpuPermute>(); @@ -128,7 +138,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::configure(ITensorI } // Configure activation - if(_is_activationlayer_enabled) + if (_is_activationlayer_enabled) { _activationlayer_function = std::make_unique<cpu::CpuActivation>(); _activationlayer_function->configure(dst, nullptr, info.act_info); @@ -155,7 +165,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::run(ITensorPack &t auto packed_weights = tensors.get_tensor(TensorType::ACL_INT_4); // Permute input - if(_permute) + if (_permute) { ITensorPack pack; auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0); @@ -166,7 +176,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::run(ITensorPack &t } // Run assembly function - if(_is_nchw) + if (_is_nchw) { auto src_perm = tensors.get_tensor(TensorType::ACL_INT_0); auto weights_perm = tensors.get_tensor(TensorType::ACL_INT_1); @@ -198,7 +208,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::run(ITensorPack &t } // Permute output - if(_is_nchw) + if (_is_nchw) { ITensorPack pack; auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2); @@ -208,7 +218,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::run(ITensorPack &t } // Run activation - if(_is_activationlayer_enabled) + if (_is_activationlayer_enabled) { ITensorPack pack; pack.add_tensor(TensorType::ACL_SRC, dst); @@ -221,7 +231,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::prepare(ITensorPac { // if weights are not constant then we need to repack so that weights // can be updated in-place - if(!_are_weights_const) + if (!_are_weights_const) { auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2); @@ -238,14 +248,14 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::prepare(ITensorPac return; } - if(!_is_prepared) + if (!_is_prepared) { auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2); auto packed_weights = tensors.get_tensor(TensorType::ACL_INT_4); // Permute weights - if(_permute) + if (_permute) { auto permuted_weights = tensors.get_tensor(TensorType::ACL_INT_1); @@ -279,11 +289,15 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::prepare(ITensorPac } } -void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info) +void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::configure(ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ConvolutionInfo &info) { 
ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_ERROR_THROW_ON(CpuDepthwiseConv2d::validate(src, weights, (biases == nullptr) ? nullptr : biases, - dst, info)); + ARM_COMPUTE_ERROR_THROW_ON( + CpuDepthwiseConv2d::validate(src, weights, (biases == nullptr) ? nullptr : biases, dst, info)); _is_nchw = src->data_layout() == DataLayout::NCHW; _is_prepared = !_is_nchw; @@ -294,9 +308,10 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::configure(ITensorInfo *src, auto input_perm = std::make_unique<TensorInfo>(); auto weights_perm = std::make_unique<TensorInfo>(); - auto output_perm = std::make_unique<TensorInfo>(dst->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape())); + auto output_perm = std::make_unique<TensorInfo>( + dst->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape())); - if(_is_nchw) + if (_is_nchw) { _permute_input = std::make_unique<cpu::CpuPermute>(); _permute_weights = std::make_unique<cpu::CpuPermute>(); @@ -315,7 +330,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::configure(ITensorInfo *src, _depthwise_conv_kernel = std::make_unique<cpu::kernels::CpuDepthwiseConv2dNativeKernel>(); _depthwise_conv_kernel->configure(input_to_use, weights_to_use, biases, output_to_use, info); - if(_is_nchw) + if (_is_nchw) { _permute_output = std::make_unique<cpu::CpuPermute>(); _permute_output->configure(output_perm.get(), dst, PermutationVector(1U, 2U, 0U)); @@ -324,43 +339,61 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::configure(ITensorInfo *src, //Configure Activation Layer _is_activationlayer_enabled = info.act_info.enabled(); - if(_is_activationlayer_enabled) + if (_is_activationlayer_enabled) { _activationlayer_function = std::make_unique<cpu::CpuActivation>(); _activationlayer_function->configure(dst, nullptr, info.act_info); } } -Status CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, +Status CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, const ConvolutionInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); - if(src->data_layout() == DataLayout::NCHW) + if (src->data_layout() == DataLayout::NCHW) { TensorShape permuted_input_shape = src->tensor_shape(); TensorShape permuted_weights_shape = weights->tensor_shape(); - TensorShape permuted_output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info); + TensorShape permuted_output_shape = + misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info); permute(permuted_input_shape, PermutationVector(2U, 0U, 1U)); permute(permuted_weights_shape, PermutationVector(2U, 0U, 1U)); permute(permuted_output_shape, PermutationVector(2U, 0U, 1U)); - const TensorInfo permuted_input = TensorInfo(src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NHWC)); - const TensorInfo permuted_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NHWC)); - const TensorInfo permuted_output = TensorInfo(dst->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NCHW)); + const TensorInfo permuted_input = TensorInfo(src->clone() + ->set_is_resizable(true) + 
.reset_padding() + .set_tensor_shape(permuted_input_shape) + .set_data_layout(DataLayout::NHWC)); + const TensorInfo permuted_weights = TensorInfo(weights->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(permuted_weights_shape) + .set_data_layout(DataLayout::NHWC)); + const TensorInfo permuted_output = TensorInfo(dst->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(permuted_output_shape) + .set_data_layout(DataLayout::NCHW)); ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(src, &permuted_input, PermutationVector(2U, 0U, 1U))); ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(weights, &permuted_weights, PermutationVector(2U, 0U, 1U))); ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(&permuted_output, dst, PermutationVector(1U, 2U, 0U))); - ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuDepthwiseConv2dNativeKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, info)); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuDepthwiseConv2dNativeKernel::validate( + &permuted_input, &permuted_weights, biases, &permuted_output, info)); } else { - ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuDepthwiseConv2dNativeKernel::validate(src, weights, biases, dst, info)); + ARM_COMPUTE_RETURN_ON_ERROR( + cpu::kernels::CpuDepthwiseConv2dNativeKernel::validate(src, weights, biases, dst, info)); } // Validate Activation Layer - if(info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info)) + if (info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info)) { ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, info.act_info)); } @@ -375,7 +408,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::run(ITensorPack &tensors) auto biases = tensors.get_const_tensor(TensorType::ACL_SRC_2); auto dst = tensors.get_tensor(TensorType::ACL_DST_0); - if(_is_nchw) + if (_is_nchw) { prepare(tensors); auto src_perm = tensors.get_tensor(TensorType::ACL_INT_0); @@ -392,7 +425,8 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::run(ITensorPack &tensors) pack_depth.add_const_tensor(TensorType::ACL_SRC_1, weights_perm); pack_depth.add_tensor(TensorType::ACL_SRC_2, biases); pack_depth.add_tensor(TensorType::ACL_DST, dst_perm); - NEScheduler::get().schedule_op(_depthwise_conv_kernel.get(), Window::DimY, _depthwise_conv_kernel->window(), pack_depth); + NEScheduler::get().schedule_op(_depthwise_conv_kernel.get(), Window::DimY, _depthwise_conv_kernel->window(), + pack_depth); } else { @@ -401,10 +435,11 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::run(ITensorPack &tensors) pack_depth.add_tensor(TensorType::ACL_SRC_1, weights); pack_depth.add_tensor(TensorType::ACL_SRC_2, biases); pack_depth.add_tensor(TensorType::ACL_DST, dst); - NEScheduler::get().schedule_op(_depthwise_conv_kernel.get(), Window::DimY, _depthwise_conv_kernel->window(), pack_depth); + NEScheduler::get().schedule_op(_depthwise_conv_kernel.get(), Window::DimY, _depthwise_conv_kernel->window(), + pack_depth); } - if(_is_nchw) + if (_is_nchw) { ITensorPack pack; auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2); @@ -413,7 +448,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::run(ITensorPack &tensors) _permute_output->run(pack); } - if(_is_activationlayer_enabled) + if (_is_activationlayer_enabled) { ITensorPack pack; pack.add_tensor(TensorType::ACL_SRC, dst); @@ -424,7 +459,7 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::run(ITensorPack &tensors) void 
CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::prepare(ITensorPack &tensors) { - if(!_is_prepared) + if (!_is_prepared) { auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); auto weights_perm = tensors.get_tensor(TensorType::ACL_INT_1); @@ -441,12 +476,17 @@ void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::prepare(ITensorPack &tensors } } -void CpuDepthwiseConv2d::configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info) +void CpuDepthwiseConv2d::configure(ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ConvolutionInfo &info) { ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, info); - _depth_conv_func = get_depthwiseconvolution_function(src, weights, (biases != nullptr) ? biases : nullptr, dst, info); - switch(_depth_conv_func) + _depth_conv_func = + get_depthwiseconvolution_function(src, weights, (biases != nullptr) ? biases : nullptr, dst, info); + switch (_depth_conv_func) { case DepthwiseConvolutionFunction::OPTIMIZED: _func_optimized.configure(src, weights, biases, dst, info); @@ -459,10 +499,14 @@ void CpuDepthwiseConv2d::configure(ITensorInfo *src, const ITensorInfo *weights, } } -Status CpuDepthwiseConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info) +Status CpuDepthwiseConv2d::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const ConvolutionInfo &info) { DepthwiseConvolutionFunction depth_conv_func = get_depthwiseconvolution_function(src, weights, biases, dst, info); - switch(depth_conv_func) + switch (depth_conv_func) { case DepthwiseConvolutionFunction::OPTIMIZED: return CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, biases, dst, info); @@ -475,10 +519,13 @@ Status CpuDepthwiseConv2d::validate(const ITensorInfo *src, const ITensorInfo *w } } -DepthwiseConvolutionFunction CpuDepthwiseConv2d::get_depthwiseconvolution_function(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, +DepthwiseConvolutionFunction CpuDepthwiseConv2d::get_depthwiseconvolution_function(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, const ConvolutionInfo &info) { - if(bool(CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, biases, dst, info))) + if (bool(CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, biases, dst, info))) { return DepthwiseConvolutionFunction::OPTIMIZED; } @@ -490,7 +537,7 @@ DepthwiseConvolutionFunction CpuDepthwiseConv2d::get_depthwiseconvolution_functi void CpuDepthwiseConv2d::run(ITensorPack &tensors) { - switch(_depth_conv_func) + switch (_depth_conv_func) { case DepthwiseConvolutionFunction::OPTIMIZED: _func_optimized.run(tensors); @@ -505,7 +552,7 @@ void CpuDepthwiseConv2d::run(ITensorPack &tensors) void CpuDepthwiseConv2d::prepare(ITensorPack &tensors) { - switch(_depth_conv_func) + switch (_depth_conv_func) { case DepthwiseConvolutionFunction::OPTIMIZED: _func_optimized.prepare(tensors); diff --git a/src/cpu/operators/CpuDepthwiseConv2d.h b/src/cpu/operators/CpuDepthwiseConv2d.h index 3d8719ee44..7eaa0df857 100644 --- a/src/cpu/operators/CpuDepthwiseConv2d.h +++ b/src/cpu/operators/CpuDepthwiseConv2d.h @@ -24,8 +24,9 @@ #ifndef ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_H #define ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_H -#include 
"arm_compute/core/ITensorInfo.h" #include "arm_compute/core/experimental/Types.h" +#include "arm_compute/core/ITensorInfo.h" + #include "src/cpu/ICpuKernel.h" #include "src/cpu/ICpuOperator.h" #include "src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h" @@ -56,14 +57,22 @@ public: * Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED. * @param[in] info Depthwise convolution meta-data. */ - void configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info); + void configure(ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ConvolutionInfo &info); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuDepthwiseConv2d::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info); + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const ConvolutionInfo &info); /** Static function to choose the best depthwise convolution function for @ref CpuDepthwiseConv2d * * @param[in] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32 @@ -76,7 +85,10 @@ public: * * @return a Depthwise Convolution Function */ - static DepthwiseConvolutionFunction get_depthwiseconvolution_function(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, + static DepthwiseConvolutionFunction get_depthwiseconvolution_function(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, const ConvolutionInfo &info); // Inherited methods overriden: @@ -118,32 +130,40 @@ private: * @param[out] dst Destination tensor info. Data type supported: same as @p src. * @param[in] info Depthwise convolution meta-data. 
*/ - void configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info); + void configure(ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ConvolutionInfo &info); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuDepthwiseConv2dOptimizedInternal::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info); + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const ConvolutionInfo &info); // Inherited methods overriden: void run(ITensorPack &tensors) override; void prepare(ITensorPack &tensors) override; private: - std::unique_ptr<CpuDepthwiseConv2dAssemblyDispatch> _dwc_optimized_func{ nullptr }; - std::unique_ptr<CpuPermute> _permute_input{ nullptr }; - std::unique_ptr<CpuPermute> _permute_weights{ nullptr }; - std::unique_ptr<CpuPermute> _permute_output{ nullptr }; - std::unique_ptr<CpuActivation> _activationlayer_function{ nullptr }; - bool _has_bias{ false }; - bool _is_quantized{ false }; - bool _is_nchw{ true }; - bool _permute{ false }; - bool _is_activationlayer_enabled{ false }; - bool _is_prepared{ false }; - bool _are_weights_const{ true }; + std::unique_ptr<CpuDepthwiseConv2dAssemblyDispatch> _dwc_optimized_func{nullptr}; + std::unique_ptr<CpuPermute> _permute_input{nullptr}; + std::unique_ptr<CpuPermute> _permute_weights{nullptr}; + std::unique_ptr<CpuPermute> _permute_output{nullptr}; + std::unique_ptr<CpuActivation> _activationlayer_function{nullptr}; + bool _has_bias{false}; + bool _is_quantized{false}; + bool _is_nchw{true}; + bool _permute{false}; + bool _is_activationlayer_enabled{false}; + bool _is_prepared{false}; + bool _are_weights_const{true}; }; /** Basic function to execute a generic depthwise convolution. This function calls the following kernel: @@ -176,7 +196,11 @@ private: * Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED. * @param[in] info Depthwise convolution meta-data. 
*/ - void configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info); + void configure(ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ConvolutionInfo &info); /** Static function to check if given info will lead to a valid configuration * @@ -184,24 +208,28 @@ private: * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info); + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const ConvolutionInfo &info); // Inherited methods overridden: void run(ITensorPack &tensors) override; void prepare(ITensorPack &tensors) override; private: - std::unique_ptr<kernels::CpuDepthwiseConv2dNativeKernel> _depthwise_conv_kernel{ nullptr }; - std::unique_ptr<CpuPermute> _permute_input{ nullptr }; - std::unique_ptr<CpuPermute> _permute_weights{ nullptr }; - std::unique_ptr<CpuPermute> _permute_output{ nullptr }; - std::unique_ptr<CpuActivation> _activationlayer_function{ nullptr }; - bool _is_nchw{ true }; - bool _is_prepared{ false }; - bool _is_activationlayer_enabled{ false }; + std::unique_ptr<kernels::CpuDepthwiseConv2dNativeKernel> _depthwise_conv_kernel{nullptr}; + std::unique_ptr<CpuPermute> _permute_input{nullptr}; + std::unique_ptr<CpuPermute> _permute_weights{nullptr}; + std::unique_ptr<CpuPermute> _permute_output{nullptr}; + std::unique_ptr<CpuActivation> _activationlayer_function{nullptr}; + bool _is_nchw{true}; + bool _is_prepared{false}; + bool _is_activationlayer_enabled{false}; }; - DepthwiseConvolutionFunction _depth_conv_func{ DepthwiseConvolutionFunction::GENERIC }; + DepthwiseConvolutionFunction _depth_conv_func{DepthwiseConvolutionFunction::GENERIC}; CpuDepthwiseConv2dOptimizedInternal _func_optimized{}; CpuDepthwiseConv2dGeneric _func_generic{}; }; diff --git a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp index d078155155..8d3741de96 100644 --- a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp +++ b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/ITensorInfo.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" @@ -38,15 +39,14 @@ namespace cpu { struct CpuDepthwiseConv2dAssemblyDispatch::LocalImpl { - std::unique_ptr<kernels::CpuDepthwiseConv2dAssemblyWrapperKernel> asm_kernel{ nullptr }; - bool is_prepared{ false }; - bool are_weights_const{ true }; + std::unique_ptr<kernels::CpuDepthwiseConv2dAssemblyWrapperKernel> asm_kernel{nullptr}; + bool is_prepared{false}; + bool are_weights_const{true}; experimental::MemoryRequirements mem_req{}; }; #ifndef DOXYGEN_SKIP_THIS -CpuDepthwiseConv2dAssemblyDispatch::CpuDepthwiseConv2dAssemblyDispatch() - : _pImpl(std::make_unique<LocalImpl>()) +CpuDepthwiseConv2dAssemblyDispatch::CpuDepthwiseConv2dAssemblyDispatch() : _pImpl(std::make_unique<LocalImpl>()) { } #endif /* DOXYGEN_SKIP_THIS */ @@ -66,7 +66,7 @@ void CpuDepthwiseConv2dAssemblyDispatch::configure(const ITensorInfo *src, _pImpl->are_weights_const = weights->are_values_constant(); // If we don't support a combination of data types, silently return: it is the caller's responsibility to check if configure() was 
successful via is_configured() - if(!CpuDepthwiseConv2dAssemblyDispatch::validate(src, weights, bias, dst, info)) + if (!CpuDepthwiseConv2dAssemblyDispatch::validate(src, weights, bias, dst, info)) { return; } @@ -77,12 +77,16 @@ void CpuDepthwiseConv2dAssemblyDispatch::configure(const ITensorInfo *src, // Compute memory requirements for assembly kernels constexpr size_t alignment = 4096; - _pImpl->mem_req.push_back({ TensorType::ACL_INT_0, dwc_wrapper->get_working_size(num_threads), alignment }); - _pImpl->mem_req.push_back({ TensorType::ACL_INT_1, dwc_wrapper->get_storage_size(), alignment }); + _pImpl->mem_req.push_back({TensorType::ACL_INT_0, dwc_wrapper->get_working_size(num_threads), alignment}); + _pImpl->mem_req.push_back({TensorType::ACL_INT_1, dwc_wrapper->get_storage_size(), alignment}); _pImpl->asm_kernel = std::move(dwc_wrapper); } -Status CpuDepthwiseConv2dAssemblyDispatch::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const ConvolutionInfo &info) +Status CpuDepthwiseConv2dAssemblyDispatch::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *bias, + const ITensorInfo *dst, + const ConvolutionInfo &info) { return kernels::CpuDepthwiseConv2dAssemblyWrapperKernel::validate(src, weights, bias, dst, info); } @@ -111,7 +115,7 @@ void CpuDepthwiseConv2dAssemblyDispatch::prepare(ITensorPack &tensors) { const ITensor *weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); - if((!_pImpl->are_weights_const && weights != nullptr) || !_pImpl->is_prepared) + if ((!_pImpl->are_weights_const && weights != nullptr) || !_pImpl->is_prepared) { // Pack weights and bias const ITensor *bias = tensors.get_const_tensor(TensorType::ACL_SRC_2); @@ -125,11 +129,12 @@ void CpuDepthwiseConv2dAssemblyDispatch::prepare(ITensorPack &tensors) const auto weights_padding = weights->info()->padding(); const size_t ld_weights_col = weights_shape[0] + weights_padding.left + weights_padding.right; - const size_t ld_weights_row = ld_weights_col * (weights_shape[1] + weights_padding.top + weights_padding.bottom); + const size_t ld_weights_row = + ld_weights_col * (weights_shape[1] + weights_padding.top + weights_padding.bottom); _pImpl->asm_kernel->pack_parameters(parameters_ptr, bias_ptr, weights_ptr, ld_weights_col, ld_weights_row); weights->mark_as_unused(); - if(bias != nullptr) + if (bias != nullptr) { bias->mark_as_unused(); } diff --git a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h index f222ab9cf9..f1816625d2 100644 --- a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h +++ b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_ASSEMBLY_DISPATCH_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuOperator.h" @@ -53,14 +54,22 @@ public: * @param[out] dst Destination tensor info. Data type supported: same as @p src. * @param[in] info Depthwise convolution meta-data. 
*/ - void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const ConvolutionInfo &info); + void configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *bias, + ITensorInfo *dst, + const ConvolutionInfo &info); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuDepthwiseConv2dAssemblyDispatch::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const ConvolutionInfo &info); + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *bias, + const ITensorInfo *dst, + const ConvolutionInfo &info); /** Checks if activation is supported by the assembly kernels * * @param[in] activation Activation to check @@ -70,8 +79,8 @@ public: static bool is_activation_supported(const ActivationLayerInfo &activation); // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &tensors) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &tensors) override; experimental::MemoryRequirements workspace() const override; private: diff --git a/src/cpu/operators/CpuDequantize.cpp b/src/cpu/operators/CpuDequantize.cpp index 12dc136ba3..c05a23f3a7 100644 --- a/src/cpu/operators/CpuDequantize.cpp +++ b/src/cpu/operators/CpuDequantize.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/cpu/kernels/CpuDequantizeKernel.h" diff --git a/src/cpu/operators/CpuDirectConv2d.cpp b/src/cpu/operators/CpuDirectConv2d.cpp index 9cdbdb61c1..135a3bb2b9 100644 --- a/src/cpu/operators/CpuDirectConv2d.cpp +++ b/src/cpu/operators/CpuDirectConv2d.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" namespace arm_compute @@ -36,12 +37,25 @@ namespace cpu CpuDirectConv2d::~CpuDirectConv2d() = default; CpuDirectConv2d::CpuDirectConv2d(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _output_stage_kernel(), _conv_kernel(), _input_border_handler(), _activationlayer_function(), _accumulator(), _has_bias(false), - _is_activationlayer_enabled(false), _dim_split(Window::DimZ), _is_padding_required() + : _memory_group(std::move(memory_manager)), + _output_stage_kernel(), + _conv_kernel(), + _input_border_handler(), + _activationlayer_function(), + _accumulator(), + _has_bias(false), + _is_activationlayer_enabled(false), + _dim_split(Window::DimZ), + _is_padding_required() { } -void CpuDirectConv2d::configure(ITensorInfo *src, ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) +void CpuDirectConv2d::configure(ITensorInfo *src, + ITensorInfo *weights, + const ITensorInfo *bias, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN); ARM_COMPUTE_LOG_PARAMS(src, weights, bias, dst, conv_info, act_info); @@ -51,7 +65,7 @@ void CpuDirectConv2d::configure(ITensorInfo *src, ITensorInfo *weights, const IT _input_border_handler = std::make_unique<NEFillBorderKernel>(); // Free accumulator - 
if(_accumulator.buffer() != nullptr) + if (_accumulator.buffer() != nullptr) { _accumulator.allocator()->free(); } @@ -62,28 +76,33 @@ void CpuDirectConv2d::configure(ITensorInfo *src, ITensorInfo *weights, const IT _has_bias = (bias != nullptr); _conv_kernel->configure(src, weights, dst, conv_info); - if(_has_bias) + if (_has_bias) { _output_stage_kernel->configure(dst, bias); } _is_padding_required = !_conv_kernel->border_size().empty(); - if(_is_padding_required) + if (_is_padding_required) { // Add zero padding XY - _input_border_handler->configure(src, _conv_kernel->border_size(), BorderMode::CONSTANT, PixelValue(static_cast<float>(0.f))); + _input_border_handler->configure(src, _conv_kernel->border_size(), BorderMode::CONSTANT, + PixelValue(static_cast<float>(0.f))); } //Configure Activation Layer _is_activationlayer_enabled = act_info.enabled(); - if(_is_activationlayer_enabled) + if (_is_activationlayer_enabled) { _activationlayer_function = std::make_unique<CpuActivation>(); _activationlayer_function->configure(dst, dst, act_info); } } -Status CpuDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const PadStrideInfo &conv_info, +Status CpuDirectConv2d::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *bias, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); @@ -95,7 +114,7 @@ Status CpuDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weig // Validate Convolution kernel ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuDirectConv2dKernel::validate(src, weights, &accumulator, conv_info)); - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, bias); ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->dimension(0) != weights->dimension(3), @@ -106,7 +125,7 @@ Status CpuDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weig // Validate bias kernel ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuDirectConv2dOutputStageKernel::validate(&accumulator, bias, dst)); - if(act_info.enabled()) + if (act_info.enabled()) { ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, act_info)); } @@ -122,14 +141,15 @@ void CpuDirectConv2d::run(ITensorPack &tensors) auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2); auto dst = tensors.get_tensor(TensorType::ACL_DST); - if(_is_padding_required) + if (_is_padding_required) { ITensorPack pack; pack.add_tensor(TensorType::ACL_SRC_DST, src); - NEScheduler::get().schedule_op(_input_border_handler.get(), Window::DimZ, _input_border_handler->window(), pack); + NEScheduler::get().schedule_op(_input_border_handler.get(), Window::DimZ, _input_border_handler->window(), + pack); } NEScheduler::get().schedule_op(_conv_kernel.get(), _dim_split, _conv_kernel->window(), tensors); - if(_has_bias) + if (_has_bias) { ITensorPack pack; pack.add_tensor(TensorType::ACL_SRC_0, dst); @@ -138,7 +158,7 @@ void CpuDirectConv2d::run(ITensorPack &tensors) NEScheduler::get().schedule_op(_output_stage_kernel.get(), Window::DimY, _output_stage_kernel->window(), pack); } - if(_is_activationlayer_enabled) + if (_is_activationlayer_enabled) { ITensorPack pack; pack.add_tensor(TensorType::ACL_SRC, dst); diff --git a/src/cpu/operators/CpuDirectConv2d.h b/src/cpu/operators/CpuDirectConv2d.h index fa8d61e083..73c85f2dcd 100644 --- a/src/cpu/operators/CpuDirectConv2d.h +++ b/src/cpu/operators/CpuDirectConv2d.h @@ 
-24,13 +24,14 @@ #ifndef ARM_COMPUTE_CPU_DIRECTCONV2D_H #define ARM_COMPUTE_CPU_DIRECTCONV2D_H +#include "arm_compute/core/experimental/Types.h" #include "arm_compute/core/ITensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/experimental/Types.h" #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" #include "arm_compute/runtime/Tensor.h" + #include "src/core/NEON/kernels/NEFillBorderKernel.h" #include "src/cpu/ICpuKernel.h" #include "src/cpu/ICpuOperator.h" @@ -75,14 +76,23 @@ public: * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. */ - void configure(ITensorInfo *src, ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(ITensorInfo *src, + ITensorInfo *weights, + const ITensorInfo *bias, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuDirectConv2d::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const PadStrideInfo &conv_info, + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *bias, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: @@ -95,10 +105,10 @@ private: std::unique_ptr<NEFillBorderKernel> _input_border_handler; std::unique_ptr<CpuActivation> _activationlayer_function; Tensor _accumulator; - bool _has_bias{ false }; - bool _is_activationlayer_enabled{ false }; - unsigned int _dim_split{ 0 }; - bool _is_padding_required{ false }; + bool _has_bias{false}; + bool _is_activationlayer_enabled{false}; + unsigned int _dim_split{0}; + bool _is_padding_required{false}; }; } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/operators/CpuDirectConv3d.cpp b/src/cpu/operators/CpuDirectConv3d.cpp index aa74e420a6..626f1c6775 100644 --- a/src/cpu/operators/CpuDirectConv3d.cpp +++ b/src/cpu/operators/CpuDirectConv3d.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" namespace arm_compute @@ -36,11 +37,17 @@ namespace cpu CpuDirectConv3d::~CpuDirectConv3d() = default; CpuDirectConv3d::CpuDirectConv3d(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _conv_kernel(), _activationlayer_function(), _accumulator(), _is_activationlayer_enabled(false), _dim_split(Window::DimZ) + : _memory_group(std::move(memory_manager)), + _conv_kernel(), + _activationlayer_function(), + _accumulator(), + _is_activationlayer_enabled(false), + _dim_split(Window::DimZ) { } -void CpuDirectConv3d::configure(ITensorInfo *src0, ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo conv_info) +void CpuDirectConv3d::configure( + ITensorInfo *src0, ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo conv_info) { ARM_COMPUTE_LOG_PARAMS(src0, src1, src2, dst, conv_info); 
ARM_COMPUTE_ERROR_ON(src0->data_layout() != DataLayout::NDHWC); @@ -48,7 +55,7 @@ void CpuDirectConv3d::configure(ITensorInfo *src0, ITensorInfo *src1, const ITen _conv_kernel = std::make_unique<kernels::CpuDirectConv3dKernel>(); // Free accumulator - if(_accumulator.buffer() != nullptr) + if (_accumulator.buffer() != nullptr) { _accumulator.allocator()->free(); } @@ -59,21 +66,25 @@ void CpuDirectConv3d::configure(ITensorInfo *src0, ITensorInfo *src1, const ITen //Configure Activation Layer _is_activationlayer_enabled = conv_info.act_info.enabled(); - if(_is_activationlayer_enabled) + if (_is_activationlayer_enabled) { _activationlayer_function = std::make_unique<CpuActivation>(); _activationlayer_function->configure(dst, dst, conv_info.act_info); } } -Status CpuDirectConv3d::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo conv_info) +Status CpuDirectConv3d::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const Conv3dInfo conv_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); // Validate Convolution kernel ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuDirectConv3dKernel::validate(src0, src1, src2, dst, conv_info)); - if(conv_info.act_info.enabled()) + if (conv_info.act_info.enabled()) { ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, conv_info.act_info)); } @@ -89,7 +100,7 @@ void CpuDirectConv3d::run(ITensorPack &tensors) NEScheduler::get().schedule_op(_conv_kernel.get(), _dim_split, _conv_kernel->window(), tensors); - if(_is_activationlayer_enabled) + if (_is_activationlayer_enabled) { ITensorPack pack; pack.add_tensor(TensorType::ACL_SRC, dst); @@ -98,4 +109,4 @@ void CpuDirectConv3d::run(ITensorPack &tensors) } } } // namespace cpu -} // namespace arm_compute
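The direct-convolution operators touched above are stateless: configure() consumes only ITensorInfo descriptors, and the actual buffers are bound through an ITensorPack at run time, as the run() bodies in this patch show. The following caller-side sketch illustrates that flow for CpuDirectConv2d; the weights slot (ACL_SRC_1), the tensor setup, and the helper name are assumptions for illustration only, not part of this patch.

    #include "arm_compute/core/ITensor.h"
    #include "arm_compute/core/ITensorPack.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "src/cpu/operators/CpuDirectConv2d.h"

    using namespace arm_compute;

    // Sketch: run a stride-1, pad-1 direct convolution on already-allocated tensors.
    void run_direct_conv(ITensor &src, ITensor &weights, ITensor &bias, ITensor &dst)
    {
        const PadStrideInfo conv_info(1, 1, 1, 1);

        cpu::CpuDirectConv2d conv(nullptr); // no shared memory manager in this sketch
        conv.configure(src.info(), weights.info(), bias.info(), dst.info(), conv_info);

        ITensorPack pack;
        pack.add_const_tensor(TensorType::ACL_SRC_0, &src);     // input
        pack.add_const_tensor(TensorType::ACL_SRC_1, &weights); // weights (slot assumed)
        pack.add_const_tensor(TensorType::ACL_SRC_2, &bias);    // bias
        pack.add_tensor(TensorType::ACL_DST, &dst);             // output
        conv.run(pack);
    }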
\ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/operators/CpuDirectConv3d.h b/src/cpu/operators/CpuDirectConv3d.h index cde01f07c2..3ad1e09a14 100644 --- a/src/cpu/operators/CpuDirectConv3d.h +++ b/src/cpu/operators/CpuDirectConv3d.h @@ -24,14 +24,15 @@ #ifndef ARM_COMPUTE_CPU_DIRECTCONV3D_H #define ARM_COMPUTE_CPU_DIRECTCONV3D_H +#include "arm_compute/core/experimental/Types.h" #include "arm_compute/core/ITensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/experimental/Types.h" #include "arm_compute/runtime/FunctionDescriptors.h" #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" #include "arm_compute/runtime/Tensor.h" + #include "src/core/NEON/kernels/NEFillBorderKernel.h" #include "src/cpu/ICpuKernel.h" #include "src/cpu/ICpuOperator.h" @@ -76,14 +77,19 @@ public: * The 1st dimensions must be equal to the 1st dimension of the @p kernels tensor. * @param[in] conv_info Contains padding, stride, acitvation information. */ - void configure(ITensorInfo *src0, ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo conv_info); + void configure( + ITensorInfo *src0, ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo conv_info); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuDirectConv3d::configure() * * @return a status */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo conv_info); + static Status validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const Conv3dInfo conv_info); // Inherited methods overridden: void run(ITensorPack &tensors) override; @@ -93,8 +99,8 @@ private: std::unique_ptr<kernels::CpuDirectConv3dKernel> _conv_kernel; std::unique_ptr<CpuActivation> _activationlayer_function; Tensor _accumulator; - bool _is_activationlayer_enabled{ false }; - unsigned int _dim_split{ 0 }; + bool _is_activationlayer_enabled{false}; + unsigned int _dim_split{0}; }; } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/operators/CpuElementwise.cpp b/src/cpu/operators/CpuElementwise.cpp index b88ae3e514..c2ae8773c6 100644 --- a/src/cpu/operators/CpuElementwise.cpp +++ b/src/cpu/operators/CpuElementwise.cpp @@ -22,6 +22,7 @@ * SOFTWARE. */ #include "src/cpu/operators/CpuElementwise.h" + #include "src/common/utils/Log.h" #include "src/core/helpers/WindowHelpers.h" #include "src/cpu/kernels/CpuElementwiseKernel.h" @@ -33,7 +34,7 @@ namespace cpu void CpuElementwiseBase::run(ITensorPack &tensors) { // If the kernel has been configured, use the window from the kernel. 
- if(_kernel->is_window_configured()) + if (_kernel->is_window_configured()) { ICpuOperator::run(tensors); return; @@ -101,12 +102,16 @@ void CpuElementwiseComparisonStatic<COP>::configure(const ITensorInfo *src0, con } template <ComparisonOperation COP> -Status CpuElementwiseComparisonStatic<COP>::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst) +Status +CpuElementwiseComparisonStatic<COP>::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst) { return kernels::CpuComparisonKernel::validate(COP, src0, src1, dst); } -void CpuElementwiseComparison::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ComparisonOperation op) +void CpuElementwiseComparison::configure(const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst, + ComparisonOperation op) { ARM_COMPUTE_LOG_PARAMS(src0, src1, dst); auto k = std::make_unique<kernels::CpuComparisonKernel>(); @@ -114,7 +119,10 @@ void CpuElementwiseComparison::configure(const ITensorInfo *src0, const ITensorI _kernel = std::move(k); } -Status CpuElementwiseComparison::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ComparisonOperation op) +Status CpuElementwiseComparison::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + ComparisonOperation op) { return kernels::CpuComparisonKernel::validate(op, src0, src1, dst); } @@ -127,4 +135,4 @@ template class CpuElementwiseComparisonStatic<ComparisonOperation::GreaterEqual> template class CpuElementwiseComparisonStatic<ComparisonOperation::Less>; template class CpuElementwiseComparisonStatic<ComparisonOperation::LessEqual>; } // namespace cpu -} // namespace arm_compute
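As elsewhere in this patch, a Status converts to bool, so callers can probe validate() before committing to configure(). A minimal sketch for the comparison operator whose signatures appear above; the shapes and data types are chosen arbitrarily for illustration.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "src/cpu/operators/CpuElementwise.h"

    using namespace arm_compute;

    // Sketch: configure a Greater comparison only if the descriptors validate.
    void build_greater_op(cpu::CpuElementwiseComparison &cmp)
    {
        TensorInfo a(TensorShape(16U, 8U), 1, DataType::F32);
        TensorInfo b(TensorShape(16U, 8U), 1, DataType::F32);
        TensorInfo out(TensorShape(16U, 8U), 1, DataType::U8); // comparisons produce U8 masks

        if (bool(cpu::CpuElementwiseComparison::validate(&a, &b, &out, ComparisonOperation::Greater)))
        {
            cmp.configure(&a, &b, &out, ComparisonOperation::Greater);
        }
    }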
\ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/operators/CpuElementwise.h b/src/cpu/operators/CpuElementwise.h index b6c61cf245..5db53c8026 100644 --- a/src/cpu/operators/CpuElementwise.h +++ b/src/cpu/operators/CpuElementwise.h @@ -139,7 +139,8 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ComparisonOperation op); + static Status + validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ComparisonOperation op); }; /** Basic function to run @ref cpu::kernels::CpuComparisonKernel @@ -182,4 +183,4 @@ using NELessEqual = CpuElementwiseComparisonStatic<ComparisonOperation::LessEqua } // namespace cpu } // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_H */
\ No newline at end of file +#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_H */ diff --git a/src/cpu/operators/CpuElementwiseUnary.cpp b/src/cpu/operators/CpuElementwiseUnary.cpp index 7fd14dba7d..04ab7bf8f5 100644 --- a/src/cpu/operators/CpuElementwiseUnary.cpp +++ b/src/cpu/operators/CpuElementwiseUnary.cpp @@ -22,6 +22,7 @@ * SOFTWARE. */ #include "src/cpu/operators/CpuElementwiseUnary.h" + #include "src/common/utils/Log.h" #include "src/core/helpers/WindowHelpers.h" #include "src/cpu/kernels/CpuElementwiseUnaryKernel.h" @@ -47,7 +48,7 @@ Status CpuElementwiseUnary::validate(ElementWiseUnary op, const ITensorInfo &src void CpuElementwiseUnary::run(ITensorPack &tensors) { - if(_kernel->is_window_configured()) + if (_kernel->is_window_configured()) { ICpuOperator::run(tensors); return; @@ -57,4 +58,4 @@ void CpuElementwiseUnary::run(ITensorPack &tensors) ICpuOperator::run(tensors, compute_output_shape_and_window(src_info->tensor_shape()).second); } } // namespace cpu -} // namespace arm_compute
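CpuElementwiseUnary::run() above falls back to deriving the execution window from the source tensor's runtime shape when no static window was configured. A hedged usage sketch follows, assuming the configure() overload mirrors the validate() signature shown above; the helper name and slot choices are illustrative only.

    #include "arm_compute/core/ITensor.h"
    #include "arm_compute/core/ITensorPack.h"
    #include "arm_compute/core/Types.h"
    #include "src/cpu/operators/CpuElementwiseUnary.h"

    using namespace arm_compute;

    // Sketch: elementwise negation of an already-allocated tensor.
    void negate(ITensor &src, ITensor &dst)
    {
        cpu::CpuElementwiseUnary op;
        op.configure(ElementWiseUnary::NEG, *src.info(), *dst.info()); // signature assumed from validate()

        ITensorPack pack;
        pack.add_const_tensor(TensorType::ACL_SRC, &src);
        pack.add_tensor(TensorType::ACL_DST, &dst);
        op.run(pack);
    }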
\ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/operators/CpuElementwiseUnary.h b/src/cpu/operators/CpuElementwiseUnary.h index 5e8e98d047..1e51bfaa1c 100644 --- a/src/cpu/operators/CpuElementwiseUnary.h +++ b/src/cpu/operators/CpuElementwiseUnary.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_H #include "arm_compute/core/Types.h" + #include "src/cpu/ICpuOperator.h" namespace arm_compute @@ -56,4 +57,4 @@ public: } // namespace cpu } // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_H */
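The CpuFullyConnected changes further below build a GEMMLowp output stage from the quantization info of src, weights and dst. The underlying arithmetic is the standard requantization scheme: the effective float scale src_scale * weights_scale / dst_scale is decomposed into a 31-bit fixed-point multiplier plus a power-of-two exponent, which is roughly what quantization::calculate_quantized_multiplier() fills in (exact sign conventions aside). A self-contained sketch of that decomposition, not ACL's implementation:

    #include <cmath>
    #include <cstdint>

    // Decompose real_multiplier into q * 2^(exponent - 31), with q in [2^30, 2^31).
    int32_t quantize_multiplier(float real_multiplier, int &exponent)
    {
        const float mantissa = std::frexp(real_multiplier, &exponent); // mantissa in [0.5, 1)
        auto q = static_cast<int64_t>(std::llround(static_cast<double>(mantissa) * (1ll << 31)));
        if (q == (1ll << 31)) // mantissa rounded up to 1.0: renormalise
        {
            q >>= 1;
            ++exponent;
        }
        return static_cast<int32_t>(q);
    }

    // Worked example: src_scale = 0.05f, weights_scale = 0.02f, dst_scale = 0.1f
    //   real_multiplier = 0.05f * 0.02f / 0.1f = 0.01f = 0.64 * 2^-6
    //   -> q ~ 0.64 * 2^31, exponent = -6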
\ No newline at end of file +#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_H */ diff --git a/src/cpu/operators/CpuFill.cpp b/src/cpu/operators/CpuFill.cpp index 3d8f62fe07..1890d0b916 100644 --- a/src/cpu/operators/CpuFill.cpp +++ b/src/cpu/operators/CpuFill.cpp @@ -23,9 +23,8 @@ */ #include "src/cpu/operators/CpuFill.h" -#include "src/cpu/kernels/CpuFillKernel.h" - #include "src/common/utils/Log.h" +#include "src/cpu/kernels/CpuFillKernel.h" namespace arm_compute { diff --git a/src/cpu/operators/CpuFill.h b/src/cpu/operators/CpuFill.h index 41d9a9fa8a..cb83745d29 100644 --- a/src/cpu/operators/CpuFill.h +++ b/src/cpu/operators/CpuFill.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_FILL_H #include "arm_compute/core/PixelValue.h" + #include "src/cpu/ICpuOperator.h" namespace arm_compute diff --git a/src/cpu/operators/CpuFlatten.cpp b/src/cpu/operators/CpuFlatten.cpp index 7bab9e481c..2609d44590 100644 --- a/src/cpu/operators/CpuFlatten.cpp +++ b/src/cpu/operators/CpuFlatten.cpp @@ -23,16 +23,14 @@ */ #include "src/cpu/operators/CpuFlatten.h" -#include "src/cpu/operators/CpuReshape.h" - #include "src/common/utils/Log.h" +#include "src/cpu/operators/CpuReshape.h" namespace arm_compute { namespace cpu { -CpuFlatten::CpuFlatten() - : _reshape(nullptr) +CpuFlatten::CpuFlatten() : _reshape(nullptr) { } diff --git a/src/cpu/operators/CpuFloor.cpp b/src/cpu/operators/CpuFloor.cpp index 868add7d29..a107393b01 100644 --- a/src/cpu/operators/CpuFloor.cpp +++ b/src/cpu/operators/CpuFloor.cpp @@ -23,9 +23,8 @@ */ #include "src/cpu/operators/CpuFloor.h" -#include "src/cpu/kernels/CpuFloorKernel.h" - #include "src/common/utils/Log.h" +#include "src/cpu/kernels/CpuFloorKernel.h" namespace arm_compute { diff --git a/src/cpu/operators/CpuFullyConnected.cpp b/src/cpu/operators/CpuFullyConnected.cpp index 395d8d2aa5..85a0b0311b 100644 --- a/src/cpu/operators/CpuFullyConnected.cpp +++ b/src/cpu/operators/CpuFullyConnected.cpp @@ -25,10 +25,11 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensorPack.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/MemoryHelpers.h" @@ -49,8 +50,11 @@ using namespace arm_compute::misc::shape_calculator; namespace { -Status get_gemmlowp_output_stage_info(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const ActivationLayerInfo &act, - GEMMLowpOutputStageInfo &gemmlowp_output_stage_info) +Status get_gemmlowp_output_stage_info(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const ActivationLayerInfo &act, + GEMMLowpOutputStageInfo &gemmlowp_output_stage_info) { const auto data_type = src->data_type(); const QuantizationInfo oq_info = dst->quantization_info(); @@ -62,10 +66,11 @@ Status get_gemmlowp_output_stage_info(const ITensorInfo *src, const ITensorInfo int32_t output_multiplier; int32_t output_shift; - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); - int32_t type_min = 0; - int32_t type_max = 0; + int32_t type_min = 0; + int32_t type_max = 0; std::tie(type_min, type_max) = 
quantization::get_quantized_asymmetric_output_min_max(oq_info, act, data_type); gemmlowp_output_stage_info.gemmlowp_multiplier = output_multiplier; @@ -78,14 +83,22 @@ Status get_gemmlowp_output_stage_info(const ITensorInfo *src, const ITensorInfo return Status{}; } -Status validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ActivationLayerInfo &act, bool enable_fast_math, WeightFormat weight_format) +Status validate_mm(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const ActivationLayerInfo &act, + bool enable_fast_math, + WeightFormat weight_format) { - if(is_data_type_quantized_asymmetric(src->data_type())) + if (is_data_type_quantized_asymmetric(src->data_type())) { // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() // Extract and negate src and weights offset - const QuantizationInfo src_quantization_info(src->quantization_info().uniform().scale, -src->quantization_info().uniform().offset); - const QuantizationInfo weights_quantization_info(weights->quantization_info().uniform().scale, -weights->quantization_info().uniform().offset); + const QuantizationInfo src_quantization_info(src->quantization_info().uniform().scale, + -src->quantization_info().uniform().offset); + const QuantizationInfo weights_quantization_info(weights->quantization_info().uniform().scale, + -weights->quantization_info().uniform().offset); GEMMLowpOutputStageInfo gemmlowp_output_stage_info; ARM_COMPUTE_RETURN_ON_ERROR(get_gemmlowp_output_stage_info(src, weights, dst, act, gemmlowp_output_stage_info)); @@ -97,11 +110,8 @@ Status validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITe // Validate gemmlowp function TensorInfo src_info = src->clone()->set_quantization_info(src_quantization_info); TensorInfo weights_info = weights->clone()->set_quantization_info(weights_quantization_info); - ARM_COMPUTE_RETURN_ON_ERROR(CpuGemmLowpMatrixMultiplyCore::validate(&src_info, - &weights_info, - biases, - dst, - gemm_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + CpuGemmLowpMatrixMultiplyCore::validate(&src_info, &weights_info, biases, dst, gemm_info)); } else { @@ -142,21 +152,28 @@ CpuFullyConnected::CpuFullyConnected() CpuFullyConnected::~CpuFullyConnected() = default; -void CpuFullyConnected::configure_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act) +void CpuFullyConnected::configure_mm(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ActivationLayerInfo &act) { - if(_is_quantized_asymmetric) + if (_is_quantized_asymmetric) { // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() // Extract and negate src and weights offset - const QuantizationInfo src_quantization_info(src->quantization_info().uniform().scale, -src->quantization_info().uniform().offset); - const QuantizationInfo weights_quantization_info(weights->quantization_info().uniform().scale, -weights->quantization_info().uniform().offset); + const QuantizationInfo src_quantization_info(src->quantization_info().uniform().scale, + -src->quantization_info().uniform().offset); + const QuantizationInfo weights_quantization_info(weights->quantization_info().uniform().scale, + -weights->quantization_info().uniform().offset); TensorInfo src_info = 
src->clone()->set_quantization_info(src_quantization_info); TensorInfo weights_info = weights->clone()->set_quantization_info(weights_quantization_info); // Configure gemmlowp function and output stage for asymmetric quantized types GEMMLowpOutputStageInfo gemmlowp_output_stage_info; - const Status status = get_gemmlowp_output_stage_info(&src_info, &weights_info, dst, act, gemmlowp_output_stage_info); + const Status status = + get_gemmlowp_output_stage_info(&src_info, &weights_info, dst, act, gemmlowp_output_stage_info); ARM_COMPUTE_ERROR_ON(status.error_code() != ErrorCode::OK); GEMMInfo gemm_info; @@ -179,7 +196,11 @@ void CpuFullyConnected::configure_mm(const ITensorInfo *src, const ITensorInfo * } } -void CpuFullyConnected::configure_conv_fc(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act) +void CpuFullyConnected::configure_conv_fc(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ActivationLayerInfo &act) { ARM_COMPUTE_ERROR_ON((weights->dimension(1) != (src->dimension(0) * src->dimension(1) * src->dimension(2)))); @@ -195,7 +216,11 @@ void CpuFullyConnected::configure_conv_fc(const ITensorInfo *src, const ITensorI configure_mm(&_flattened_src, weights, biases, dst, act); } -void CpuFullyConnected::configure_fc_fc(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act) +void CpuFullyConnected::configure_fc_fc(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ActivationLayerInfo &act) { ARM_COMPUTE_ERROR_ON(src->dimension(0) != weights->dimension(1)); @@ -203,17 +228,17 @@ void CpuFullyConnected::configure_fc_fc(const ITensorInfo *src, const ITensorInf configure_mm(src, weights, biases, dst, act); } -void CpuFullyConnected::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, - FullyConnectedLayerInfo fc_info, const WeightsInfo &weights_info) +void CpuFullyConnected::configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + FullyConnectedLayerInfo fc_info, + const WeightsInfo &weights_info) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_ERROR_THROW_ON(CpuFullyConnected::validate(src, - weights, - biases != nullptr ? biases : nullptr, - dst, - fc_info, - weights_info)); + ARM_COMPUTE_ERROR_THROW_ON( + CpuFullyConnected::validate(src, weights, biases != nullptr ? 
biases : nullptr, dst, fc_info, weights_info)); ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, fc_info); _needs_weights_conversion = false; @@ -238,9 +263,11 @@ void CpuFullyConnected::configure(const ITensorInfo *src, const ITensorInfo *wei // Check if we have a fully connected layer with batches const bool is_batched_fc_layer = dst->dimension(1) > 1; - if(is_batched_fc_layer) + if (is_batched_fc_layer) { - _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(src->tensor_shape().cbegin() + 3, src->tensor_shape().cend(), dst->tensor_shape().cbegin() + 1)); + _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && + (std::equal(src->tensor_shape().cbegin() + 3, src->tensor_shape().cend(), + dst->tensor_shape().cbegin() + 1)); } else { @@ -248,7 +275,7 @@ void CpuFullyConnected::configure(const ITensorInfo *src, const ITensorInfo *wei } // Reshape weights if needed - if(_needs_weights_reshape) + if (_needs_weights_reshape) { // Reshape the weights _transpose_weights = std::make_unique<kernels::CpuTransposeKernel>(); @@ -260,13 +287,11 @@ void CpuFullyConnected::configure(const ITensorInfo *src, const ITensorInfo *wei } // Convert weights if needed - if(_is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout)) + if (_is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout)) { // Convert weights _convert_weights = std::make_unique<CpuConvertFullyConnectedWeights>(); - _convert_weights->configure(weights_to_use, - &_converted_weights, - src->tensor_shape(), + _convert_weights->configure(weights_to_use, &_converted_weights, src->tensor_shape(), fc_info.weights_trained_layout); _converted_weights.set_are_values_constant(weights_to_use->are_values_constant()); @@ -275,7 +300,7 @@ void CpuFullyConnected::configure(const ITensorInfo *src, const ITensorInfo *wei _trans_weights_idx = AuxTensorIdx::ConvertedWeights; } - if(_is_fc_after_conv) + if (_is_fc_after_conv) { // Fully Connected layer after a Convolution Layer without batches configure_conv_fc(src, weights_to_use, biases, dst, fc_info.activation_info); @@ -287,54 +312,57 @@ void CpuFullyConnected::configure(const ITensorInfo *src, const ITensorInfo *wei } // Retain the tensorinfo with the weights to use - if(_needs_weights_reshape || _needs_weights_conversion) + if (_needs_weights_reshape || _needs_weights_conversion) { _trans_weights = *weights_to_use; } // Set auxiliary memory requirements auto gemm_mem_req = (_is_quantized_asymmetric) ? _mm_gemmlowp->workspace() : _mm_gemm->workspace(); - for(unsigned int i = 0; i < gemm_mem_req.size(); ++i) + for (unsigned int i = 0; i < gemm_mem_req.size(); ++i) { _aux_mem[i] = gemm_mem_req[i]; } - if(_aux_mem[Pretranspose].size > 0) + if (_aux_mem[Pretranspose].size > 0) { // Release permuted weights at the end of prepare as they are further transposed by the assembly dispatch // Do not release them if biases are dynamic and data type is quantized, since the weights tensor will be used for biases offset calculation // Keep all the auxiliary tensors in case of dynamic weights as they are recalculated every time. _aux_mem[TransposedWeights] = MemoryInfo( offset_int_vec(TransposedWeights), - _dynamic_weights ? MemoryLifetime::Temporary : - (_is_quantized_asymmetric && biases && !(biases->are_values_constant())) ? MemoryLifetime::Persistent : - MemoryLifetime::Prepare, + _dynamic_weights ? MemoryLifetime::Temporary + : (_is_quantized_asymmetric && biases && !(biases->are_values_constant())) ? 
MemoryLifetime::Persistent + : MemoryLifetime::Prepare, _reshaped_weights.total_size()); - _aux_mem[ConvertedWeights] = MemoryInfo( - offset_int_vec(ConvertedWeights), - _dynamic_weights ? MemoryLifetime::Temporary : MemoryLifetime::Prepare, - _converted_weights.total_size()); + _aux_mem[ConvertedWeights] = MemoryInfo(offset_int_vec(ConvertedWeights), + _dynamic_weights ? MemoryLifetime::Temporary : MemoryLifetime::Prepare, + _converted_weights.total_size()); } else { - _aux_mem[TransposedWeights] = MemoryInfo( - offset_int_vec(TransposedWeights), - _dynamic_weights ? MemoryLifetime::Temporary : - _needs_weights_conversion ? MemoryLifetime::Prepare : - MemoryLifetime::Persistent, - _reshaped_weights.total_size()); + _aux_mem[TransposedWeights] = MemoryInfo(offset_int_vec(TransposedWeights), + _dynamic_weights ? MemoryLifetime::Temporary + : _needs_weights_conversion ? MemoryLifetime::Prepare + : MemoryLifetime::Persistent, + _reshaped_weights.total_size()); _aux_mem[ConvertedWeights] = MemoryInfo( - offset_int_vec(ConvertedWeights), - _dynamic_weights ? MemoryLifetime::Temporary : MemoryLifetime::Persistent, + offset_int_vec(ConvertedWeights), _dynamic_weights ? MemoryLifetime::Temporary : MemoryLifetime::Persistent, _converted_weights.total_size()); } - _aux_mem[FlattenedSrc] = MemoryInfo(offset_int_vec(FlattenedSrc), MemoryLifetime::Temporary, _flattened_src.total_size()); + _aux_mem[FlattenedSrc] = + MemoryInfo(offset_int_vec(FlattenedSrc), MemoryLifetime::Temporary, _flattened_src.total_size()); } -Status CpuFullyConnected::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *src, const ITensorInfo *weights, - const ITensorInfo *biases, const ITensorInfo *dst, FullyConnectedLayerInfo fc_info, WeightsInfo weights_info) +Status CpuFullyConnected::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, + const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + FullyConnectedLayerInfo fc_info, + WeightsInfo weights_info) { GEMMInfo gemm_info; gemm_info.set_activation_info(fc_info.activation_info); @@ -345,12 +373,17 @@ Status CpuFullyConnected::has_opt_impl(arm_compute::WeightFormat &expected_weigh return CpuGemm::has_opt_impl(expected_weight_format, src, weights, biases, dst, gemm_info); } -Status CpuFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - FullyConnectedLayerInfo fc_info, const WeightsInfo &weights_info) +Status CpuFullyConnected::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + FullyConnectedLayerInfo fc_info, + const WeightsInfo &weights_info) { ARM_COMPUTE_UNUSED(fc_info.retain_internal_weights); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); if (is_fixed_format_fast_math(weights_info.weight_format())) { @@ -364,15 +397,22 @@ Status CpuFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *we } ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2); - ARM_COMPUTE_RETURN_ERROR_ON(fc_info.activation_info.enabled() && is_data_type_quantized(src->data_type()) && fc_info.activation_info.activation() != 
ActivationLayerInfo::ActivationFunction::RELU - && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU); + ARM_COMPUTE_RETURN_ERROR_ON( + fc_info.activation_info.enabled() && is_data_type_quantized(src->data_type()) && + fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU && + fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU && + fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU); bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true; bool is_fc_after_conv = true; - const ITensorInfo &flatten_src = TensorInfo(src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(src))); - const ITensorInfo &reshaped_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights))); - const ITensorInfo &converted_weights = weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) : TensorInfo(*reshaped_weights.clone()); + const ITensorInfo &flatten_src = + TensorInfo(src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(src))); + const ITensorInfo &reshaped_weights = TensorInfo( + weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights))); + const ITensorInfo &converted_weights = weights_reshaped + ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) + : TensorInfo(*reshaped_weights.clone()); // With the Fully Connected layer we can have 4 different cases: // 1) Convolution layer -> Fully Connected layer without batches @@ -386,10 +426,10 @@ Status CpuFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *we // Check if we have a fully connected layer with batches const bool is_batched_fc_layer = dst->dimension(1) > 1; - if(biases != nullptr) + if (biases != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); - if(is_data_type_quantized(src->data_type())) + if (is_data_type_quantized(src->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); } @@ -399,36 +439,37 @@ Status CpuFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *we } } - if(is_batched_fc_layer) + if (is_batched_fc_layer) { - is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(src->tensor_shape().cbegin() + 3, src->tensor_shape().cend(), dst->tensor_shape().cbegin() + 1)); + is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && + (std::equal(src->tensor_shape().cbegin() + 3, src->tensor_shape().cend(), + dst->tensor_shape().cbegin() + 1)); } else { is_fc_after_conv = src->num_dimensions() > 1; } - if(!weights_reshaped) + if (!weights_reshaped) { // Validate reshape weights kernel ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuTransposeKernel::validate(weights, &reshaped_weights)); weights_to_use = &reshaped_weights; } - if(is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout)) + if (is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout)) { // Validate convert weights kernel - ARM_COMPUTE_RETURN_ON_ERROR(CpuConvertFullyConnectedWeights::validate(weights_to_use, - &converted_weights, - src->tensor_shape(), - fc_info.weights_trained_layout)); + 
ARM_COMPUTE_RETURN_ON_ERROR(CpuConvertFullyConnectedWeights::validate( + weights_to_use, &converted_weights, src->tensor_shape(), fc_info.weights_trained_layout)); weights_to_use = &converted_weights; } - if(is_fc_after_conv) + if (is_fc_after_conv) { // Fully Connected layer after a Convolution Layer without batches - ARM_COMPUTE_RETURN_ERROR_ON((weights_to_use->dimension(1) != (src->dimension(0) * src->dimension(1) * src->dimension(2)))); + ARM_COMPUTE_RETURN_ERROR_ON( + (weights_to_use->dimension(1) != (src->dimension(0) * src->dimension(1) * src->dimension(2)))); // Validate flatten kernel ARM_COMPUTE_RETURN_ON_ERROR(CpuFlatten::validate(src, &flatten_src)); @@ -440,7 +481,8 @@ Status CpuFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *we ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != weights_to_use->dimension(1)); } // Validate matrix multiply kernel - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(src_to_use, weights_to_use, biases, dst, fc_info.activation_info, fc_info.enable_fast_math, weights_info.weight_format())); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(src_to_use, weights_to_use, biases, dst, fc_info.activation_info, + fc_info.enable_fast_math, weights_info.weight_format())); return Status{}; } @@ -460,21 +502,21 @@ void CpuFullyConnected::run(ITensorPack &tensors) CpuAuxTensorHandler transformed_wei(offset_int_vec(_trans_weights_idx), _trans_weights, tensors, false); // Linearize src if it comes from a convolutional layer - if(_is_fc_after_conv) + if (_is_fc_after_conv) { - ITensorPack flatten_pack{ { ACL_SRC, src }, { ACL_DST, flattened_src.get() } }; + ITensorPack flatten_pack{{ACL_SRC, src}, {ACL_DST, flattened_src.get()}}; _flatten->run(flatten_pack); } ITensorPack gemm_pack = tensors; gemm_pack.add_const_tensor(ACL_SRC_0, (_is_fc_after_conv) ? 
flattened_src.get() : src); - if(_needs_weights_reshape || _needs_weights_conversion) + if (_needs_weights_reshape || _needs_weights_conversion) { gemm_pack.add_const_tensor(ACL_SRC_1, transformed_wei.get()); } // Run matrix multiply - if(_is_quantized_asymmetric) + if (_is_quantized_asymmetric) { _mm_gemmlowp->run(gemm_pack); } @@ -486,7 +528,7 @@ void CpuFullyConnected::run(ITensorPack &tensors) void CpuFullyConnected::prepare(ITensorPack &tensors) { - if(!_is_prepared || _dynamic_weights) + if (!_is_prepared || _dynamic_weights) { #ifdef ARM_COMPUTE_ASSERTS_ENABLED ++_asrt_prepare_count; @@ -502,20 +544,21 @@ void CpuFullyConnected::prepare(ITensorPack &tensors) const ITensor *cur_weights = weights; // Reshape of the weights (happens only once) - if(_needs_weights_reshape) + if (_needs_weights_reshape) { // Run reshape weights kernel and mark weights as unused - ITensorPack transpose_pack{ { ACL_SRC, weights }, { ACL_DST, reshaped_weights.get() } }; - NEScheduler::get().schedule_op(_transpose_weights.get(), Window::DimY, _transpose_weights->window(), transpose_pack); + ITensorPack transpose_pack{{ACL_SRC, weights}, {ACL_DST, reshaped_weights.get()}}; + NEScheduler::get().schedule_op(_transpose_weights.get(), Window::DimY, _transpose_weights->window(), + transpose_pack); cur_weights->mark_as_unused(); cur_weights = reshaped_weights.get(); } // Convert weights if needed (happens only once) - if(_needs_weights_conversion) + if (_needs_weights_conversion) { - ITensorPack convert_pack{ { ACL_SRC, cur_weights }, { ACL_DST, converted_weights.get() } }; + ITensorPack convert_pack{{ACL_SRC, cur_weights}, {ACL_DST, converted_weights.get()}}; _convert_weights->run(convert_pack); cur_weights->mark_as_unused(); @@ -526,7 +569,7 @@ void CpuFullyConnected::prepare(ITensorPack &tensors) gemm_pack.add_const_tensor(ACL_SRC_1, cur_weights); // Prepare GEMM prepare and release unused weights - if(!_is_quantized_asymmetric) + if (!_is_quantized_asymmetric) { _mm_gemm->prepare(gemm_pack); } diff --git a/src/cpu/operators/CpuFullyConnected.h b/src/cpu/operators/CpuFullyConnected.h index 1e8c6478d0..7073fb9f7c 100644 --- a/src/cpu/operators/CpuFullyConnected.h +++ b/src/cpu/operators/CpuFullyConnected.h @@ -24,11 +24,11 @@ #ifndef ARM_COMPUTE_CPU_FULLY_CONNECTED_H #define ARM_COMPUTE_CPU_FULLY_CONNECTED_H -#include "src/cpu/ICpuOperator.h" - #include "arm_compute/core/TensorInfo.h" #include "arm_compute/function_info/FullyConnectedLayerInfo.h" +#include "src/cpu/ICpuOperator.h" + #include <memory> namespace arm_compute @@ -86,16 +86,24 @@ public: * @param[in] fc_info (Optional) Fully connected layer additional info * @param[in] weights_info (Optional) Stores neccessary compute information when weights are already reshaped */ - void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, - FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(), const WeightsInfo &weights_info = WeightsInfo()); + void configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(), + const WeightsInfo &weights_info = WeightsInfo()); /** Static function to check if given info will lead to a valid configuration of @ref CpuFullyConnected * * Similar to @ref CpuFullyConnected::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - FullyConnectedLayerInfo 
fc_info = FullyConnectedLayerInfo(), const WeightsInfo &weights_info = WeightsInfo()); + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(), + const WeightsInfo &weights_info = WeightsInfo()); /** Static function that queries whether there exists fixed-format kernel and if it exists it will return in the first argument in what format * weights are expected to be reshaped as defined by WeightFormat class. Apart from the first argument the rest of the arguments are the same @@ -103,19 +111,35 @@ public: * * @return a status */ - static Status has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *src, const ITensorInfo *weights, - const ITensorInfo *biases, const ITensorInfo *dst, - FullyConnectedLayerInfo fc_info, WeightsInfo weights_info); + static Status has_opt_impl(arm_compute::WeightFormat &expected_weight_format, + const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + FullyConnectedLayerInfo fc_info, + WeightsInfo weights_info); //Inherited methods override - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &tensors) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &tensors) override; experimental::MemoryRequirements workspace() const override; private: - void configure_fc_fc(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act); - void configure_conv_fc(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act); - void configure_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act); + void configure_fc_fc(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ActivationLayerInfo &act); + void configure_conv_fc(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ActivationLayerInfo &act); + void configure_mm(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ActivationLayerInfo &act); enum AuxTensorIdx { diff --git a/src/cpu/operators/CpuGemm.cpp b/src/cpu/operators/CpuGemm.cpp index 34b845928d..8da166dbef 100644 --- a/src/cpu/operators/CpuGemm.cpp +++ b/src/cpu/operators/CpuGemm.cpp @@ -24,9 +24,10 @@ #include "src/cpu/operators/CpuGemm.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" @@ -57,17 +58,25 @@ cpu::AsmGemmInfo init_assembly_metadata(const GEMMInfo &info) } } // namespace -void CpuGemm::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, float alpha, float beta, const GEMMInfo &gemm_info) +void CpuGemm::configure(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + ITensorInfo *d, + float alpha, + float beta, + const GEMMInfo &gemm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d); ARM_COMPUTE_ERROR_THROW_ON(CpuGemm::validate(a, b, c, d, alpha, beta, 
gemm_info)); ARM_COMPUTE_LOG_PARAMS(a, b, c, d, alpha, beta, gemm_info); - const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info); - const bool is_c_bias = beta == 1 && c != nullptr; - bool run_optimised = bool(cpu::CpuGemmAssemblyDispatch::validate(a, b, (is_c_bias) ? c : nullptr, d, asm_info)) && - (c == nullptr || beta == 0.f || beta == 1.f) && // Optimized GeMM doesn't support beta coefficient. - !(!b->are_values_constant() && b->tensor_shape().z() > 1); // Disable batch matmul as optimized GeMM handles batching differently. + const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info); + const bool is_c_bias = beta == 1 && c != nullptr; + bool run_optimised = + bool(cpu::CpuGemmAssemblyDispatch::validate(a, b, (is_c_bias) ? c : nullptr, d, asm_info)) && + (c == nullptr || beta == 0.f || beta == 1.f) && // Optimized GeMM doesn't support beta coefficient. + !(!b->are_values_constant() && + b->tensor_shape().z() > 1); // Disable batch matmul as optimized GeMM handles batching differently. // Check if we need to reshape the matrix B only on the first run _is_prepared = false; @@ -76,9 +85,12 @@ void CpuGemm::configure(const ITensorInfo *a, const ITensorInfo *b, const ITenso _run_alpha_scale = alpha != 1.f; _run_bias_addition = is_c_bias; _run_addition = beta != 0 && beta != 1 && c != nullptr; - _run_activation = gemm_info.activation_info().enabled() && (!run_optimised || (run_optimised && !cpu::CpuGemmAssemblyDispatch::is_activation_supported(gemm_info.activation_info()))); + _run_activation = + gemm_info.activation_info().enabled() && + (!run_optimised || + (run_optimised && !cpu::CpuGemmAssemblyDispatch::is_activation_supported(gemm_info.activation_info()))); - if(run_optimised) + if (run_optimised) { const ITensorInfo *c_to_use = is_c_bias ? 
c : nullptr; _asm_glue = std::make_unique<cpu::CpuGemmAssemblyDispatch>(); @@ -90,10 +102,11 @@ void CpuGemm::configure(const ITensorInfo *a, const ITensorInfo *b, const ITenso _aux_mem[Pretraspose] = asm_mem_req[Pretraspose]; // Scale product by alpha - if(_run_alpha_scale) + if (_run_alpha_scale) { _alpha_scale_func = std::make_unique<cpu::CpuActivation>(); - _alpha_scale_func->configure(d, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, alpha, 0.f)); + _alpha_scale_func->configure( + d, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, alpha, 0.f)); } } else @@ -104,7 +117,7 @@ void CpuGemm::configure(const ITensorInfo *a, const ITensorInfo *b, const ITenso _mm_kernel = std::make_unique<cpu::kernels::CpuGemmMatrixMultiplyKernel>(); // Select between GEMV and GEMM - if(_run_vector_matrix_multiplication) + if (_run_vector_matrix_multiplication) { // Configure the matrix multiply kernel _mm_kernel->configure(a, b, gemm_output_to_use, alpha, false); @@ -118,41 +131,50 @@ void CpuGemm::configure(const ITensorInfo *a, const ITensorInfo *b, const ITenso // Configure interleave kernel _interleave_kernel = std::make_unique<cpu::kernels::CpuGemmInterleave4x4Kernel>(); _interleave_kernel->configure(a, &_tmp_a); - _aux_mem[InterleavedLHS] = MemoryInfo(offset_int_vec(InterleavedLHS), MemoryLifetime::Temporary, _tmp_a.total_size()); + _aux_mem[InterleavedLHS] = + MemoryInfo(offset_int_vec(InterleavedLHS), MemoryLifetime::Temporary, _tmp_a.total_size()); // Configure transpose kernel _transpose_kernel = std::make_unique<cpu::kernels::CpuGemmTranspose1xWKernel>(); _transpose_kernel->configure(b, &_tmp_b); - _aux_mem[TransposedRHS] = MemoryInfo(offset_int_vec(TransposedRHS), MemoryLifetime::Persistent, _tmp_b.total_size()); + _aux_mem[TransposedRHS] = + MemoryInfo(offset_int_vec(TransposedRHS), MemoryLifetime::Persistent, _tmp_b.total_size()); // Configure matrix multiplication kernel _mm_kernel->configure(&_tmp_a, &_tmp_b, gemm_output_to_use, alpha, true, GEMMReshapeInfo(m, n, k)); } - if(_run_bias_addition) + if (_run_bias_addition) { _add_bias = std::make_unique<cpu::CpuAdd>(); _add_bias->configure(gemm_output_to_use, c, d, ConvertPolicy::SATURATE); - _aux_mem[TempResult] = MemoryInfo(offset_int_vec(TempResult), MemoryLifetime::Temporary, _tmp_d.total_size()); + _aux_mem[TempResult] = + MemoryInfo(offset_int_vec(TempResult), MemoryLifetime::Temporary, _tmp_d.total_size()); } } // Configure matrix addition kernel - if(_run_addition) + if (_run_addition) { _ma_kernel = std::make_unique<cpu::kernels::CpuGemmMatrixAdditionKernel>(); _ma_kernel->configure(c, d, beta); } // Configure activation - if(_run_activation) + if (_run_activation) { _activation_func = std::make_unique<cpu::CpuActivation>(); _activation_func->configure(d, nullptr, gemm_info.activation_info()); } } -Status CpuGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, float alpha, float beta, const GEMMInfo &gemm_info) +Status CpuGemm::validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *d, + float alpha, + float beta, + const GEMMInfo &gemm_info) { ARM_COMPUTE_UNUSED(alpha); const bool is_c_bias = beta == 1 && c != nullptr; @@ -162,7 +184,7 @@ Status CpuGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITens ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(a); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::BFLOAT16, DataType::F16, DataType::F32); - 
if(is_fixed_format_fast_math(gemm_info.weight_format())) + if (is_fixed_format_fast_math(gemm_info.weight_format())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(a, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(b, DataType::BFLOAT16); @@ -174,46 +196,54 @@ Status CpuGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITens const int block_by = arm_compute::block_by(gemm_info.weight_format()); // test if im2col has changed the dimensions that are needed for padding - if(a->dimension(0) != b->dimension(1) && block_by > 1) + if (a->dimension(0) != b->dimension(1) && block_by > 1) { // have to verify bias const size_t dim0_sz = a->dimension(0); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((dim0_sz % block_by) != 0, ("The matrix A number of columns must be a multiple of block_by=" + std::to_string(block_by)).c_str()); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + (dim0_sz % block_by) != 0, + ("The matrix A number of columns must be a multiple of block_by=" + std::to_string(block_by)).c_str()); // a->dimension(0) = kernel_area * input_channel + kernel_area * input_pad_right // b->dimension(1) = kernel_area * input_channel // a->dimension(0) = b->dimension(1) + kernel_area * input_pad_right const size_t input_pad_right = (dim0_sz - b->dimension(1)) % block_by; const size_t kernel_area = (dim0_sz - b->dimension(1)) / input_pad_right; - ARM_COMPUTE_RETURN_ERROR_ON_MSG((dim0_sz - kernel_area * input_pad_right) != b->dimension(1), "The product AB is defined only if A number of columns and B number of rows are related"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + (dim0_sz - kernel_area * input_pad_right) != b->dimension(1), + "The product AB is defined only if A number of columns and B number of rows are related"); } else { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(0) != b->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + a->dimension(0) != b->dimension(1), + "The product AB is defined only if the number of columns in A is equal to the number of rows in B"); } ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported"); - if(a->data_type() != DataType::BFLOAT16) + if (a->data_type() != DataType::BFLOAT16) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, d); } - if(run_addition) + if (run_addition) { ARM_COMPUTE_RETURN_ERROR_ON(gemm_info.depth_output_gemm3d() != 0); ARM_COMPUTE_RETURN_ERROR_ON(gemm_info.reinterpret_input_as_3d()); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(c, d); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != c->dimension(1), "The C matrix must have the same number of rows as the matrix A"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(b->dimension(0) != c->dimension(0), "The C matrix must have the same number of columns as the matrix B"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != c->dimension(1), + "The C matrix must have the same number of rows as the matrix A"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(b->dimension(0) != c->dimension(0), + "The C matrix must have the same number of columns as the matrix B"); } - if(d->total_size() != 0) + if (d->total_size() != 0) { // For fixed format we are expecting some kind of blocked format for B/RHS so the dimension won't necessarily match the result matrix any more. 
ARM_COMPUTE_RETURN_ERROR_ON(!gemm_info.fixed_format() && b->dimension(0) != d->dimension(0)); - if(gemm_info.depth_output_gemm3d() != 0) + if (gemm_info.depth_output_gemm3d() != 0) { - if(gemm_info.reinterpret_input_as_3d()) + if (gemm_info.reinterpret_input_as_3d()) { ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != d->dimension(1)); ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != d->dimension(2)); @@ -230,15 +260,19 @@ Status CpuGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITens } // Check if we need to run the optimized assembly kernel - cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info); - const bool run_optimised = bool(cpu::CpuGemmAssemblyDispatch::validate(a, b, is_c_bias ? c : nullptr, d, asm_info)) && - (c == nullptr || beta == 0.f || beta == 1.f) && // Optimized GeMM doesn't support beta coefficient. - !(!b->are_values_constant() && b->tensor_shape().z() > 1); // Disable batch matmul as optimized GeMM handles batching differently. - - if(!run_optimised) + cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info); + const bool run_optimised = + bool(cpu::CpuGemmAssemblyDispatch::validate(a, b, is_c_bias ? c : nullptr, d, asm_info)) && + (c == nullptr || beta == 0.f || beta == 1.f) && // Optimized GeMM doesn't support beta coefficient. + !(!b->are_values_constant() && + b->tensor_shape().z() > 1); // Disable batch matmul as optimized GeMM handles batching differently. + + if (!run_optimised) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.reinterpret_input_as_3d(), "CpuGemm cannot reinterpret the input tensor as 3D"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.depth_output_gemm3d() != 0, "CpuGemm cannot reinterpret the output tensor as 3D"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.reinterpret_input_as_3d(), + "CpuGemm cannot reinterpret the input tensor as 3D"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.depth_output_gemm3d() != 0, + "CpuGemm cannot reinterpret the output tensor as 3D"); // Check if the first input tensor is a vector. 
const bool run_vector_matrix_multiplication = a->dimension(1) < 2; @@ -254,7 +288,8 @@ Status CpuGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITens int mult_transpose1xW_width = 1; int mult_interleave4x4_height = 1; - const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, gemm_info.depth_output_gemm3d()); + const GEMMReshapeInfo reshape_info = GEMMReshapeInfo( + m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, gemm_info.depth_output_gemm3d()); const ITensorInfo *matrix_a_info = a; const ITensorInfo *matrix_b_info = b; @@ -263,39 +298,44 @@ Status CpuGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITens TensorInfo tmp_b_info{}; TensorInfo tmp_output_info = *d->clone(); - if(run_interleave_transpose) + if (run_interleave_transpose) { matrix_a_info = &tmp_a_info; matrix_b_info = &tmp_b_info; // Validate interleave kernel - auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_interleaved_shape(*a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d()))); + auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_interleaved_shape( + *a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d()))); ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmInterleave4x4Kernel::validate(a, &tmp_a_info)); // Validate transpose kernel - auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(*b, mult_transpose1xW_width))); + auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape( + *b, mult_transpose1xW_width))); ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmTranspose1xWKernel::validate(b, &tmp_b_info)); } // Validate matrix multiply - auto_init_if_empty(tmp_output_info, matrix_a_info->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, run_interleave_transpose, reshape_info))); - ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &tmp_output_info, alpha, run_interleave_transpose, reshape_info)); + auto_init_if_empty(tmp_output_info, + matrix_a_info->clone()->set_tensor_shape(compute_mm_shape( + *matrix_a_info, *matrix_b_info, run_interleave_transpose, reshape_info))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmMatrixMultiplyKernel::validate( + matrix_a_info, matrix_b_info, &tmp_output_info, alpha, run_interleave_transpose, reshape_info)); - if(is_c_bias) + if (is_c_bias) { ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuAdd::validate(&tmp_output_info, c, d, ConvertPolicy::SATURATE)); } } // Validate matrix addition kernel - if(run_addition) + if (run_addition) { ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmMatrixAdditionKernel::validate(c, d, beta)); } // Validate activation const ActivationLayerInfo &activation = gemm_info.activation_info(); - if(activation.enabled()) + if (activation.enabled()) { ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuActivation::validate(d, nullptr, activation)); } @@ -312,15 +352,15 @@ void CpuGemm::run(ITensorPack &tensors) auto c = tensors.get_const_tensor(ACL_SRC_2); auto d = tensors.get_tensor(ACL_DST); - if(_asm_glue && _asm_glue->is_configured()) + if (_asm_glue && _asm_glue->is_configured()) { // Pass c to asm dispatch only if it's the bias tensor ITensorPack asm_pack = tensors; asm_pack.add_const_tensor(ACL_SRC_2, _run_bias_addition ? 
c : nullptr); _asm_glue->run(asm_pack); - if(_run_alpha_scale) + if (_run_alpha_scale) { - ITensorPack pack{ { ACL_SRC, d }, { ACL_DST, d } }; + ITensorPack pack{{ACL_SRC, d}, {ACL_DST, d}}; _alpha_scale_func->run(pack); } } @@ -330,18 +370,20 @@ void CpuGemm::run(ITensorPack &tensors) CpuAuxTensorHandler transposed_b(offset_int_vec(TransposedRHS), _tmp_b, tensors, true); CpuAuxTensorHandler temp_d(offset_int_vec(TempResult), _tmp_d, tensors, true); - ITensorPack mm_pack{ { ACL_SRC_0, a }, { ACL_SRC_1, b }, { ACL_DST, (_run_bias_addition) ? temp_d.get() : d } }; - if(!_run_vector_matrix_multiplication) + ITensorPack mm_pack{{ACL_SRC_0, a}, {ACL_SRC_1, b}, {ACL_DST, (_run_bias_addition) ? temp_d.get() : d}}; + if (!_run_vector_matrix_multiplication) { // Run interleave kernel - ITensorPack interleave_pack{ { ACL_SRC, a }, { ACL_DST, interleaved_a.get() } }; - NEScheduler::get().schedule_op(_interleave_kernel.get(), Window::DimY, _interleave_kernel->window(), interleave_pack); + ITensorPack interleave_pack{{ACL_SRC, a}, {ACL_DST, interleaved_a.get()}}; + NEScheduler::get().schedule_op(_interleave_kernel.get(), Window::DimY, _interleave_kernel->window(), + interleave_pack); - if(!_reshape_b_only_on_first_run) + if (!_reshape_b_only_on_first_run) { // Run transpose kernel - ITensorPack transpose_pack{ { ACL_SRC, b }, { ACL_DST, transposed_b.get() } }; - NEScheduler::get().schedule_op(_transpose_kernel.get(), Window::DimY, _transpose_kernel->window(), transpose_pack); + ITensorPack transpose_pack{{ACL_SRC, b}, {ACL_DST, transposed_b.get()}}; + NEScheduler::get().schedule_op(_transpose_kernel.get(), Window::DimY, _transpose_kernel->window(), + transpose_pack); } // Use reshaped matrices @@ -349,48 +391,52 @@ void CpuGemm::run(ITensorPack &tensors) mm_pack.add_const_tensor(ACL_SRC_1, transposed_b.get()); } - NEScheduler::get().schedule_op(_mm_kernel.get(), _run_vector_matrix_multiplication ? Window::DimX : Window::DimY, _mm_kernel->window(), mm_pack); + NEScheduler::get().schedule_op(_mm_kernel.get(), + _run_vector_matrix_multiplication ? 
Window::DimX : Window::DimY, + _mm_kernel->window(), mm_pack); // Run bias addition kernel - if(_run_bias_addition) + if (_run_bias_addition) { - ITensorPack pack{ { ACL_SRC_0, temp_d.get() }, { ACL_SRC_1, c }, { ACL_DST, d } }; + ITensorPack pack{{ACL_SRC_0, temp_d.get()}, {ACL_SRC_1, c}, {ACL_DST, d}}; _add_bias->run(pack); } } // Run matrix addition kernel - if(_run_addition) + if (_run_addition) { - ITensorPack c_add_pack{ { ACL_SRC, c }, { ACL_DST, d } }; + ITensorPack c_add_pack{{ACL_SRC, c}, {ACL_DST, d}}; NEScheduler::get().schedule_op(_ma_kernel.get(), Window::DimY, _ma_kernel->window(), c_add_pack); } // Run activation function - if(_run_activation) + if (_run_activation) { - ITensorPack pack{ { ACL_SRC, d }, { ACL_DST, d } }; + ITensorPack pack{{ACL_SRC, d}, {ACL_DST, d}}; _activation_func->run(pack); } } void CpuGemm::prepare(ITensorPack &tensors) { - if(!_is_prepared) + if (!_is_prepared) { - if(_asm_glue && _asm_glue->is_configured()) + if (_asm_glue && _asm_glue->is_configured()) { _asm_glue->prepare(tensors); } - else if(_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication) + else if (_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication) { - const ITensor *b = tensors.get_const_tensor(ACL_SRC_1); - ITensor *b_aux = utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(TransposedRHS))); + const ITensor *b = tensors.get_const_tensor(ACL_SRC_1); + ITensor *b_aux = + utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(TransposedRHS))); ARM_COMPUTE_ERROR_ON_NULLPTR(b, b_aux); CpuAuxTensorHandler transposed_b(_tmp_b, *b_aux); - ITensorPack transpose_pack{ { ACL_SRC, b }, { ACL_DST, transposed_b.get() } }; - NEScheduler::get().schedule_op(_transpose_kernel.get(), Window::DimY, _transpose_kernel->window(), transpose_pack); + ITensorPack transpose_pack{{ACL_SRC, b}, {ACL_DST, transposed_b.get()}}; + NEScheduler::get().schedule_op(_transpose_kernel.get(), Window::DimY, _transpose_kernel->window(), + transpose_pack); } _is_prepared = true; } @@ -401,8 +447,12 @@ experimental::MemoryRequirements CpuGemm::workspace() const return _aux_mem; } -Status CpuGemm::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, - const GEMMInfo &gemm_info) +Status CpuGemm::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *d, + const GEMMInfo &gemm_info) { const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info); diff --git a/src/cpu/operators/CpuGemm.h b/src/cpu/operators/CpuGemm.h index 9b08e5d0f6..6b30d134fa 100644 --- a/src/cpu/operators/CpuGemm.h +++ b/src/cpu/operators/CpuGemm.h @@ -24,12 +24,12 @@ #ifndef ARM_COMPUTE_CPU_GEMM_H #define ARM_COMPUTE_CPU_GEMM_H -#include "src/cpu/ICpuOperator.h" - #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/function_info/GEMMInfo.h" + +#include "src/cpu/ICpuOperator.h" #include "src/cpu/kernels/CpuGemmInterleave4x4Kernel.h" #include "src/cpu/kernels/CpuGemmMatrixAdditionKernel.h" #include "src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h" @@ -93,16 +93,26 @@ public: * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and * if the reshape of matrix B should happen only for the first run */ - void configure(const ITensorInfo *a, const ITensorInfo *b, 
const ITensorInfo *c, ITensorInfo *d, - float alpha, float beta, const GEMMInfo &gemm_info = GEMMInfo()); + void configure(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + ITensorInfo *d, + float alpha, + float beta, + const GEMMInfo &gemm_info = GEMMInfo()); /** Static function to check if given info will lead to a valid configuration of @ref CpuGemm. * * Similar to @ref CpuGemm::configure() * * @return a status */ - static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, - float alpha, float beta, const GEMMInfo &gemm_info = GEMMInfo()); + static Status validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *d, + float alpha, + float beta, + const GEMMInfo &gemm_info = GEMMInfo()); /** Indicates whether or not there is an optimal assembly implementation that can be used to process the given parameters. * @@ -111,12 +121,16 @@ public: * the value of arm_compute::WeightFormat need to be passed via the * parameter gemm_info. */ - static Status has_opt_impl(arm_compute::WeightFormat &weight_format, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, - const GEMMInfo &gemm_info = GEMMInfo()); + static Status has_opt_impl(arm_compute::WeightFormat &weight_format, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *d, + const GEMMInfo &gemm_info = GEMMInfo()); // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &constants) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &constants) override; experimental::MemoryRequirements workspace() const override; /** Indicates if the convolution executes in variable weights mode. 
@@ -138,28 +152,28 @@ private: Count }; - std::unique_ptr<kernels::CpuGemmInterleave4x4Kernel> _interleave_kernel{ nullptr }; - std::unique_ptr<kernels::CpuGemmTranspose1xWKernel> _transpose_kernel{ nullptr }; - std::unique_ptr<kernels::CpuGemmMatrixMultiplyKernel> _mm_kernel{ nullptr }; - std::unique_ptr<CpuGemmAssemblyDispatch> _asm_glue{ nullptr }; - std::unique_ptr<kernels::CpuGemmMatrixAdditionKernel> _ma_kernel{ nullptr }; - std::unique_ptr<CpuActivation> _alpha_scale_func{ nullptr }; - std::unique_ptr<CpuAdd> _add_bias{ nullptr }; - std::unique_ptr<CpuActivation> _activation_func{ nullptr }; + std::unique_ptr<kernels::CpuGemmInterleave4x4Kernel> _interleave_kernel{nullptr}; + std::unique_ptr<kernels::CpuGemmTranspose1xWKernel> _transpose_kernel{nullptr}; + std::unique_ptr<kernels::CpuGemmMatrixMultiplyKernel> _mm_kernel{nullptr}; + std::unique_ptr<CpuGemmAssemblyDispatch> _asm_glue{nullptr}; + std::unique_ptr<kernels::CpuGemmMatrixAdditionKernel> _ma_kernel{nullptr}; + std::unique_ptr<CpuActivation> _alpha_scale_func{nullptr}; + std::unique_ptr<CpuAdd> _add_bias{nullptr}; + std::unique_ptr<CpuActivation> _activation_func{nullptr}; TensorInfo _tmp_a{}; TensorInfo _tmp_b{}; TensorInfo _tmp_d{}; - bool _run_vector_matrix_multiplication{ false }; - bool _run_alpha_scale{ false }; - bool _run_addition{ false }; - bool _run_bias_addition{ false }; - bool _run_activation{ false }; - bool _reshape_b_only_on_first_run{ false }; - bool _is_prepared{ false }; + bool _run_vector_matrix_multiplication{false}; + bool _run_alpha_scale{false}; + bool _run_addition{false}; + bool _run_bias_addition{false}; + bool _run_activation{false}; + bool _reshape_b_only_on_first_run{false}; + bool _is_prepared{false}; - experimental::MemoryRequirements _aux_mem{ Count }; + experimental::MemoryRequirements _aux_mem{Count}; }; } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/operators/CpuGemmConv2d.cpp b/src/cpu/operators/CpuGemmConv2d.cpp index 39b410d609..7c59d88c61 100644 --- a/src/cpu/operators/CpuGemmConv2d.cpp +++ b/src/cpu/operators/CpuGemmConv2d.cpp @@ -26,9 +26,9 @@ #include "arm_compute/core/Size2D.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" #include "src/common/utils/Log.h" @@ -52,8 +52,11 @@ namespace arm_compute { namespace cpu { -CpuGemmConv2d::SkipInfo CpuGemmConv2d::skip_im_col_info(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info, - const Size2D &dilation, const ActivationLayerInfo &act_info) +CpuGemmConv2d::SkipInfo CpuGemmConv2d::skip_im_col_info(const ITensorInfo *src, + const ITensorInfo *weights, + const PadStrideInfo &conv_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info) { const DataLayout data_layout = src->data_layout(); const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); @@ -62,63 +65,86 @@ CpuGemmConv2d::SkipInfo CpuGemmConv2d::skip_im_col_info(const ITensorInfo *src, const unsigned int kernel_height = weights->dimension(idx_height); unsigned int conv_w = 0; unsigned int conv_h = 0; - std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), - src->dimension(idx_height), - kernel_width, - kernel_height, - conv_info, - dilation); - const bool skip_im2col = (data_layout == 
DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1); - - if(skip_im2col) + std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), src->dimension(idx_height), kernel_width, + kernel_height, conv_info, dilation); + const bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && + conv_info.stride().first == 1 && conv_info.stride().second == 1); + + if (skip_im2col) { - const bool skip_col2im = (data_layout == DataLayout::NHWC && (bool(CpuGemmConv2d::validate_gemm3d(src, weights, act_info, conv_h, /*skip_im2col*/ true)))); - if(skip_col2im) + const bool skip_col2im = + (data_layout == DataLayout::NHWC && + (bool(CpuGemmConv2d::validate_gemm3d(src, weights, act_info, conv_h, /*skip_im2col*/ true)))); + if (skip_col2im) { - return { true, true }; + return {true, true}; } } else { - const bool skip_col2im = (data_layout == DataLayout::NHWC && (bool(CpuGemmConv2d::validate_gemm3d(src, weights, act_info, conv_h, /*skip_im2col*/ false)))); - if(skip_col2im) + const bool skip_col2im = + (data_layout == DataLayout::NHWC && + (bool(CpuGemmConv2d::validate_gemm3d(src, weights, act_info, conv_h, /*skip_im2col*/ false)))); + if (skip_col2im) { - return { false, true }; + return {false, true}; } } // Default case when we cannot reinterpret the input and output as 3D. - return { false, false }; + return {false, false}; } CpuGemmConv2d::CpuGemmConv2d() - : _weights_reshape_kernel(nullptr), _im2col_kernel(), _mm_gemm(), _mm_gemmlowp(), _col2im_kernel(), _reshape(), _im2col_output(), _weights_reshaped(), _gemm_output(), _gemm_output_3d(), - _data_layout(DataLayout::NCHW), _skip_im2col(false), _skip_col2im(false), _is_quantized(false), _is_prepared(false), _aux_mem(AuxTensorIdx::Count) + : _weights_reshape_kernel(nullptr), + _im2col_kernel(), + _mm_gemm(), + _mm_gemmlowp(), + _col2im_kernel(), + _reshape(), + _im2col_output(), + _weights_reshaped(), + _gemm_output(), + _gemm_output_3d(), + _data_layout(DataLayout::NCHW), + _skip_im2col(false), + _skip_col2im(false), + _is_quantized(false), + _is_prepared(false), + _aux_mem(AuxTensorIdx::Count) { } CpuGemmConv2d::~CpuGemmConv2d() = default; -void CpuGemmConv2d::configure_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act_info, - bool enable_fast_math, int gemm_3d_depth, bool fixed_format, arm_compute::WeightFormat weight_format) +void CpuGemmConv2d::configure_mm(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + int gemm_3d_depth, + bool fixed_format, + arm_compute::WeightFormat weight_format) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights); - ARM_COMPUTE_ERROR_THROW_ON(validate_mm(src, weights, biases, dst, act_info, enable_fast_math, gemm_3d_depth, _skip_im2col, fixed_format, weight_format)); + ARM_COMPUTE_ERROR_THROW_ON(validate_mm(src, weights, biases, dst, act_info, enable_fast_math, gemm_3d_depth, + _skip_im2col, fixed_format, weight_format)); // Create GEMMInfo structure - const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */, - gemm_3d_depth, _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */, - false, GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info, fixed_format, weight_format); + const GEMMInfo &gemm_info = + GEMMInfo(false, false, true /* Reshape 
weights only for the first run */, gemm_3d_depth, + _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */, false, GEMMLowpOutputStageInfo(), + false, enable_fast_math, false, act_info, fixed_format, weight_format); // Supported activations in GEMM - const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU, - ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, - ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU - }; + const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { + ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, + ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU}; - if(_is_quantized) + if (_is_quantized) { - TensorInfo tmp_src{ *src }; - TensorInfo tmp_weights{ *weights }; + TensorInfo tmp_src{*src}; + TensorInfo tmp_weights{*weights}; // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() // Extract and negate input and weights offset const QuantizationInfo iqinfo = src->quantization_info(); @@ -129,7 +155,7 @@ void CpuGemmConv2d::configure_mm(const ITensorInfo *src, const ITensorInfo *weig const DataType data_type = src->data_type(); tmp_src.set_quantization_info(QuantizationInfo(uiqinfo.scale, -uiqinfo.offset)); - if(!is_data_type_quantized_per_channel(tmp_weights.data_type())) + if (!is_data_type_quantized_per_channel(tmp_weights.data_type())) { const UniformQuantizationInfo uwqinfo = wqinfo.uniform(); tmp_weights.set_quantization_info(QuantizationInfo(uwqinfo.scale, -uwqinfo.offset)); @@ -142,7 +168,7 @@ void CpuGemmConv2d::configure_mm(const ITensorInfo *src, const ITensorInfo *weig int32_t min_activation = type_min.get<int32_t>(); int32_t max_activation = type_max.get<int32_t>(); - if(supported_acts.count(act_info.activation()) != 0) + if (supported_acts.count(act_info.activation()) != 0) { std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act_info, data_type, uoqinfo); } @@ -156,11 +182,12 @@ void CpuGemmConv2d::configure_mm(const ITensorInfo *src, const ITensorInfo *weig quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, output_info); _mm_gemmlowp = std::make_unique<CpuGemmLowpMatrixMultiplyCore>(); - _mm_gemmlowp->configure(&tmp_src, &tmp_weights, biases, dst, GEMMInfo(false, false, true, gemm_3d_depth, _skip_im2col, false, output_info, false, enable_fast_math, false, act_info, fixed_format, - weight_format)); + _mm_gemmlowp->configure(&tmp_src, &tmp_weights, biases, dst, + GEMMInfo(false, false, true, gemm_3d_depth, _skip_im2col, false, output_info, false, + enable_fast_math, false, act_info, fixed_format, weight_format)); auto mm_mem_req = _mm_gemmlowp->workspace(); - for(unsigned int cont = 0; cont < mm_mem_req.size(); ++cont) + for (unsigned int cont = 0; cont < mm_mem_req.size(); ++cont) { _aux_mem[cont] = mm_mem_req[cont]; } @@ -171,26 +198,35 @@ void CpuGemmConv2d::configure_mm(const ITensorInfo *src, const ITensorInfo *weig _mm_gemm = std::make_unique<CpuGemm>(); _mm_gemm->configure(src, weights, biases, dst, 1.0f, 1.0f, gemm_info); auto mm_mem_req = _mm_gemm->workspace(); - for(unsigned int cont = 0; cont < mm_mem_req.size(); ++cont) + for (unsigned int cont = 0; cont < mm_mem_req.size(); ++cont) { _aux_mem[cont] = mm_mem_req[cont]; } } } -Status CpuGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - const ActivationLayerInfo 
&act_info, bool enable_fast_math, int gemm_3d_depth, bool skip_im2col, bool fixed_format, arm_compute::WeightFormat weight_format) +Status CpuGemmConv2d::validate_mm(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + int gemm_3d_depth, + bool skip_im2col, + bool fixed_format, + arm_compute::WeightFormat weight_format) { const DataType data_type = src->data_type(); const bool is_quantized = is_data_type_quantized_asymmetric(data_type); const bool is_activation_enabled = act_info.enabled(); // Create GEMMInfo structure - const GEMMInfo gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */, - gemm_3d_depth, skip_im2col /* Reinterpret the input as 3D if im2col is skipped */, - false, GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info, fixed_format, weight_format); + const GEMMInfo gemm_info = + GEMMInfo(false, false, true /* Reshape weights only for the first run */, gemm_3d_depth, + skip_im2col /* Reinterpret the input as 3D if im2col is skipped */, false, GEMMLowpOutputStageInfo(), + false, enable_fast_math, false, act_info, fixed_format, weight_format); - if(is_quantized) + if (is_quantized) { // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() // Extract and negate input and weights offset @@ -206,11 +242,10 @@ Status CpuGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *wei int32_t min_activation = type_min.get<int32_t>(); int32_t max_activation = type_max.get<int32_t>(); - const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU, - ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, - ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU - }; - if(is_activation_enabled && supported_acts.count(act_info.activation()) != 0) + const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { + ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, + ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU}; + if (is_activation_enabled && supported_acts.count(act_info.activation()) != 0) { std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act_info, data_type, uoqinfo); } @@ -229,8 +264,9 @@ Status CpuGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *wei input_qa->set_quantization_info(QuantizationInfo(iqinfo.uniform().scale, -iqinfo.uniform().offset)); weights_qa->set_quantization_info(QuantizationInfo(wqinfo.uniform().scale, -wqinfo.uniform().offset)); - return CpuGemmLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), biases, dst, GEMMInfo(false, false, true, gemm_3d_depth, skip_im2col, false, output_info, false, enable_fast_math, - false, act_info)); + return CpuGemmLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), biases, dst, + GEMMInfo(false, false, true, gemm_3d_depth, skip_im2col, false, + output_info, false, enable_fast_math, false, act_info)); } else { @@ -239,36 +275,44 @@ Status CpuGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *wei } } -Status CpuGemmConv2d::validate_gemm3d(const ITensorInfo *input_info, const ITensorInfo *weights_info, const ActivationLayerInfo &act_info, int gemm_3d_depth, bool skip_im2col) +Status CpuGemmConv2d::validate_gemm3d(const ITensorInfo *input_info, + const ITensorInfo *weights_info, + const 
ActivationLayerInfo &act_info, + int gemm_3d_depth, + bool skip_im2col) { const DataType data_type = input_info->data_type(); const unsigned int mult_y = skip_im2col ? 1U : gemm_3d_depth; const unsigned int mult_z = skip_im2col ? gemm_3d_depth : 1U; // Set dummy tensor shapes for the validation - const TensorInfo dummy_input_info(TensorShape(4U, 4U * mult_y, 1U * mult_z), 1, data_type, input_info->quantization_info()); + const TensorInfo dummy_input_info(TensorShape(4U, 4U * mult_y, 1U * mult_z), 1, data_type, + input_info->quantization_info()); const TensorInfo dummy_weights_info(TensorShape(4U, 4U), 1, data_type, weights_info->quantization_info()); - const TensorInfo dummy_output_info(TensorShape(4U, 4U, gemm_3d_depth), 1, data_type, input_info->quantization_info()); + const TensorInfo dummy_output_info(TensorShape(4U, 4U, gemm_3d_depth), 1, data_type, + input_info->quantization_info()); - return validate_mm(&dummy_input_info, &dummy_weights_info, nullptr, &dummy_output_info, act_info, false, gemm_3d_depth, skip_im2col); + return validate_mm(&dummy_input_info, &dummy_weights_info, nullptr, &dummy_output_info, act_info, false, + gemm_3d_depth, skip_im2col); } -void CpuGemmConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info, const WeightsInfo &weights_info, - const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups) +void CpuGemmConv2d::configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); ARM_COMPUTE_UNUSED(num_groups, weights_info); - ARM_COMPUTE_ERROR_THROW_ON(CpuGemmConv2d::validate(src, - weights, - biases, - dst, - conv_info, - weights_info, - dilation, - act_info, - enable_fast_math, - num_groups)); - ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups); + ARM_COMPUTE_ERROR_THROW_ON(CpuGemmConv2d::validate(src, weights, biases, dst, conv_info, weights_info, dilation, + act_info, enable_fast_math, num_groups)); + ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv_info, weights_info, dilation, act_info, enable_fast_math, + num_groups); const DataType data_type = src->data_type(); const DataLayout data_layout = src->data_layout(); @@ -283,7 +327,8 @@ void CpuGemmConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights _is_prepared = weights_info.retain_internal_weights(); _is_quantized = is_data_type_quantized_asymmetric(src->data_type()); _data_layout = data_layout; - _skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1); + _skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && + conv_info.stride().first == 1 && conv_info.stride().second == 1); const ITensorInfo *gemm_input_to_use = src; ITensorInfo *gemm_output_to_use = dst; @@ -291,20 +336,17 @@ void CpuGemmConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights // Get convolved dimensions unsigned int conv_w = 0; unsigned int conv_h = 0; - std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), - src->dimension(idx_height), - kernel_width, - 
kernel_height, - conv_info, - dilation); + std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), src->dimension(idx_height), kernel_width, + kernel_height, conv_info, dilation); ARM_COMPUTE_ERROR_ON_MSG((dst->dimension(idx_width) != conv_w) || (dst->dimension(idx_height) != conv_h), "Output shape does not match the expected one"); // Check if GEMM3D is supported - const CpuGemmConv2d::SkipInfo skip_info = CpuGemmConv2d::skip_im_col_info(src, weights, conv_info, dilation, act_info); - _skip_im2col = skip_info.skip_im2col; - _skip_col2im = skip_info.skip_col2im; + const CpuGemmConv2d::SkipInfo skip_info = + CpuGemmConv2d::skip_im_col_info(src, weights, conv_info, dilation, act_info); + _skip_im2col = skip_info.skip_im2col; + _skip_col2im = skip_info.skip_col2im; // Get parameters from conv_info unsigned int stride_x = 0; @@ -320,17 +362,19 @@ void CpuGemmConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights _weights_reshaped.set_quantization_info(weights->quantization_info()); // Create tensor to store im2col reshaped inputs - if(!_skip_im2col) + if (!_skip_im2col) { const int block_by = arm_compute::block_by(weights_info.weight_format()); unsigned int input_pad_right = 0; - if(block_by > 1) + if (block_by > 1) { - input_pad_right = (src->dimension(idx_channel) % block_by) == 0 ? 0 : block_by - (src->dimension(idx_channel) % block_by); + input_pad_right = + (src->dimension(idx_channel) % block_by) == 0 ? 0 : block_by - (src->dimension(idx_channel) % block_by); } // Configure _im2col_kernel = std::make_unique<kernels::CpuIm2ColKernel>(); - _im2col_kernel->configure(src, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, false, dilation, num_groups, input_pad_right); + _im2col_kernel->configure(src, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, false, dilation, + num_groups, input_pad_right); // Update GEMM input gemm_input_to_use = &_im2col_output; @@ -338,7 +382,7 @@ void CpuGemmConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights // Create temporary GEMM output tensor in case we cannot skip col2im const DataType output_data_type = data_type == DataType::BFLOAT16 ? DataType::F32 : data_type; - if(!_skip_col2im) + if (!_skip_col2im) { TensorShape shape_gemm; @@ -368,9 +412,10 @@ void CpuGemmConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights // In case we need to skip col2im, GEMM3D (gemm_3d_depth != 0) must be called in order to avoid reshaping the output matrix const unsigned int gemm_3d_depth = _skip_col2im ? conv_h : 0; const bool fixed_format = weights_info.weight_format() != arm_compute::WeightFormat::UNSPECIFIED; - configure_mm(gemm_input_to_use, &_weights_reshaped, biases, gemm_output_to_use, act_info, enable_fast_math, gemm_3d_depth, fixed_format, weights_info.weight_format()); + configure_mm(gemm_input_to_use, &_weights_reshaped, biases, gemm_output_to_use, act_info, enable_fast_math, + gemm_3d_depth, fixed_format, weights_info.weight_format()); - if(!_skip_col2im && _data_layout == DataLayout::NCHW) + if (!_skip_col2im && _data_layout == DataLayout::NCHW) { // Configure col2im _col2im_kernel = std::make_unique<kernels::CpuCol2ImKernel>(); @@ -390,14 +435,24 @@ void CpuGemmConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights gemm_trans_wei = _mm_gemmlowp != nullptr ? 
_aux_mem[5].size > 0 : gemm_trans_wei; // Transpose RHS // Check lifetime - _aux_mem[Im2ColOutput] = MemoryInfo(offset_int_vec(Im2ColOutput), MemoryLifetime::Temporary, _im2col_output.total_size()); - _aux_mem[WeightsReshaped] = MemoryInfo(offset_int_vec(WeightsReshaped), gemm_trans_wei ? MemoryLifetime::Prepare : MemoryLifetime::Persistent, _weights_reshaped.total_size()); - _aux_mem[GemmOutput] = MemoryInfo(offset_int_vec(GemmOutput), MemoryLifetime::Temporary, _gemm_output.total_size()); + _aux_mem[Im2ColOutput] = + MemoryInfo(offset_int_vec(Im2ColOutput), MemoryLifetime::Temporary, _im2col_output.total_size()); + _aux_mem[WeightsReshaped] = MemoryInfo(offset_int_vec(WeightsReshaped), + gemm_trans_wei ? MemoryLifetime::Prepare : MemoryLifetime::Persistent, + _weights_reshaped.total_size()); + _aux_mem[GemmOutput] = MemoryInfo(offset_int_vec(GemmOutput), MemoryLifetime::Temporary, _gemm_output.total_size()); } -Status CpuGemmConv2d::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, const bool enable_fast_math) +Status CpuGemmConv2d::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, + const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + const bool enable_fast_math) { const DataLayout data_layout = src->data_layout(); const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); @@ -406,36 +461,44 @@ Status CpuGemmConv2d::has_opt_impl(arm_compute::WeightFormat &expected_weight_fo const unsigned int kernel_height = weights->dimension(idx_height); unsigned int conv_w = 0; unsigned int conv_h = 0; - std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), - src->dimension(idx_height), - kernel_width, - kernel_height, - conv_info, - dilation); + std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), src->dimension(idx_height), kernel_width, + kernel_height, conv_info, dilation); - const CpuGemmConv2d::SkipInfo skip_info = CpuGemmConv2d::skip_im_col_info(src, weights, conv_info, - dilation, act_info); + const CpuGemmConv2d::SkipInfo skip_info = + CpuGemmConv2d::skip_im_col_info(src, weights, conv_info, dilation, act_info); const bool skip_im2col = skip_info.skip_im2col; const bool skip_col2im = skip_info.skip_col2im; const unsigned int gemm_3d_depth = skip_col2im ? 
conv_h : 0; const bool fixed_format = weights_info.weight_format() != arm_compute::WeightFormat::UNSPECIFIED; - const GEMMInfo gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */, - gemm_3d_depth, skip_im2col /* Reinterpret the input as 3D if im2col is skipped */, - false, GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info, fixed_format, weights_info.weight_format()); + const GEMMInfo gemm_info = + GEMMInfo(false, false, true /* Reshape weights only for the first run */, gemm_3d_depth, + skip_im2col /* Reinterpret the input as 3D if im2col is skipped */, false, GEMMLowpOutputStageInfo(), + false, enable_fast_math, false, act_info, fixed_format, weights_info.weight_format()); return CpuGemm::has_opt_impl(expected_weight_format, src, weights, biases, dst, gemm_info); } -Status CpuGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups) +Status CpuGemmConv2d::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!"); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::BFLOAT16, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::BFLOAT16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::BFLOAT16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::QSYMM8_PER_CHANNEL, DataType::BFLOAT16, + DataType::F16, DataType::F32); - if(!is_fixed_format(weights_info.weight_format())) + if (!is_fixed_format(weights_info.weight_format())) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, weights); } @@ -468,29 +531,25 @@ Status CpuGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weight unsigned int conv_w = 0; unsigned int conv_h = 0; - std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), - src->dimension(idx_height), - kernel_width, - kernel_height, - conv_info, - dilation); + std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), src->dimension(idx_height), kernel_width, + kernel_height, conv_info, dilation); // Check if GEMM3D is supported - const CpuGemmConv2d::SkipInfo skip_info = CpuGemmConv2d::skip_im_col_info(src, weights, conv_info, - dilation, act_info); - const bool skip_im2col = skip_info.skip_im2col, skip_col2im = skip_info.skip_col2im; + const CpuGemmConv2d::SkipInfo skip_info = + CpuGemmConv2d::skip_im_col_info(src, weights, conv_info, dilation, act_info); + const bool skip_im2col = skip_info.skip_im2col, skip_col2im = skip_info.skip_col2im; ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_channel) != src->dimension(idx_channel)); 
ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4); // Validate biases - if(biases != nullptr) + if (biases != nullptr) { - if(is_quantized) + if (is_quantized) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); } - else if(is_bf16) + else if (is_bf16) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32); } @@ -503,20 +562,23 @@ Status CpuGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weight } unsigned int mat_weights_cols = weights->dimension(idx_kernels); - unsigned int mat_weights_rows = weights->dimension(idx_width) * weights->dimension(idx_height) * weights->dimension(idx_channel); + unsigned int mat_weights_rows = + weights->dimension(idx_width) * weights->dimension(idx_height) * weights->dimension(idx_channel); weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, append_bias), 1, weights->data_type()); weights_reshaped_info.set_quantization_info(weights->quantization_info()); weights_to_use = &weights_reshaped_info; - if(!skip_im2col) + if (!skip_im2col) { const int block_by = arm_compute::block_by(weights_info.weight_format()); int input_pad_right = 0; - if(block_by > 1) + if (block_by > 1) { - input_pad_right = (src->dimension(idx_channel) % block_by) == 0 ? 0 : block_by - (src->dimension(idx_channel) % block_by); - mat_weights_rows = weights->dimension(idx_width) * weights->dimension(idx_height) * (weights->dimension(idx_channel) + input_pad_right); + input_pad_right = + (src->dimension(idx_channel) % block_by) == 0 ? 0 : block_by - (src->dimension(idx_channel) % block_by); + mat_weights_rows = weights->dimension(idx_width) * weights->dimension(idx_height) * + (weights->dimension(idx_channel) + input_pad_right); } // Create tensor info for im2col reshaped inputs @@ -528,13 +590,15 @@ Status CpuGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weight im2col_reshaped_info = TensorInfo(shape_im2col, 1, data_type); im2col_reshaped_info.set_quantization_info(src->quantization_info()); - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuIm2ColKernel::validate(src, &im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation, num_groups, input_pad_right)); + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::CpuIm2ColKernel::validate(src, &im2col_reshaped_info, Size2D(kernel_width, kernel_height), + conv_info, append_bias, dilation, num_groups, input_pad_right)); gemm_input_to_use = &im2col_reshaped_info; } // Create temporary GEMM output tensor in case we cannot skip col2im const DataType output_data_type = data_type == DataType::BFLOAT16 ? DataType::F32 : data_type; - if(!skip_col2im) + if (!skip_col2im) { TensorShape shape_gemm = gemm_input_to_use->tensor_shape(); shape_gemm.set(0, mat_weights_cols); @@ -549,13 +613,15 @@ Status CpuGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weight gemm_output_to_use = &info_gemm; const bool fixed_format = weights_info.weight_format() != arm_compute::WeightFormat::UNSPECIFIED; - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, act_info, enable_fast_math, skip_col2im ? conv_h : 0, skip_im2col, fixed_format, + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, act_info, + enable_fast_math, skip_col2im ? 
conv_h : 0, skip_im2col, fixed_format, weights_info.weight_format())); // Validate Col2Im/ReshapeLayer - if(!skip_col2im && (data_layout == DataLayout::NCHW)) + if (!skip_col2im && (data_layout == DataLayout::NCHW)) { - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuCol2ImKernel::validate(gemm_output_to_use, dst, Size2D(conv_w, conv_h))); + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::CpuCol2ImKernel::validate(gemm_output_to_use, dst, Size2D(conv_w, conv_h))); } return Status{}; @@ -574,15 +640,11 @@ void CpuGemmConv2d::run(ITensorPack &tensors) CpuAuxTensorHandler reshaped_wei(offset_int_vec(WeightsReshaped), _weights_reshaped, tensors, false); bool out_has_padding = _skip_col2im && (dst->info()->padding().bottom != 0 || dst->info()->padding().top != 0); - if(!_skip_im2col) + if (!_skip_im2col) { // Run input reshaping unsigned int y_dim = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); - ITensorPack pack = - { - { TensorType::ACL_SRC, src }, - { TensorType::ACL_DST, im2col_output.get() } - }; + ITensorPack pack = {{TensorType::ACL_SRC, src}, {TensorType::ACL_DST, im2col_output.get()}}; NEScheduler::get().schedule_op(_im2col_kernel.get(), y_dim, _im2col_kernel->window(), pack); gemm_input_to_use = im2col_output.get(); } @@ -595,11 +657,11 @@ void CpuGemmConv2d::run(ITensorPack &tensors) gemm3d.allocator()->import_memory(out_to_use->buffer()); auto gemm_output_to_use = gemm_output.get(); - if(_skip_im2col) + if (_skip_im2col) { gemm_output_to_use = &gemm3d; } - if(_skip_col2im && !out_has_padding) + if (_skip_col2im && !out_has_padding) { gemm_output_to_use = dst; } @@ -607,12 +669,12 @@ void CpuGemmConv2d::run(ITensorPack &tensors) // Runs CpuGemm or CpuGemmLowpMatrixMultiplyCore functions ITensorPack pack_mm = tensors; pack_mm.add_const_tensor(TensorType::ACL_SRC_0, gemm_input_to_use); - if(!this->isVarWeightsKernel()) + if (!this->isVarWeightsKernel()) { pack_mm.add_const_tensor(TensorType::ACL_SRC_1, reshaped_wei.get()); } pack_mm.add_tensor(TensorType::ACL_DST, gemm_output_to_use); - if(_is_quantized) + if (_is_quantized) { // Run gemmlowp _mm_gemmlowp->run(pack_mm); @@ -624,45 +686,33 @@ void CpuGemmConv2d::run(ITensorPack &tensors) } // Reshape output matrix - if(!_skip_col2im) + if (!_skip_col2im) { - if(_data_layout == DataLayout::NCHW) + if (_data_layout == DataLayout::NCHW) { - ITensorPack pack = - { - { TensorType::ACL_SRC, gemm_output.get() }, - { TensorType::ACL_DST, dst } - }; + ITensorPack pack = {{TensorType::ACL_SRC, gemm_output.get()}, {TensorType::ACL_DST, dst}}; NEScheduler::get().schedule_op(_col2im_kernel.get(), Window::DimY, _col2im_kernel->window(), pack); } else { - ITensorPack pack = - { - { TensorType::ACL_SRC, gemm_output_to_use }, - { TensorType::ACL_DST, dst } - }; + ITensorPack pack = {{TensorType::ACL_SRC, gemm_output_to_use}, {TensorType::ACL_DST, dst}}; _reshape->run(pack); } } - else if(out_has_padding) + else if (out_has_padding) { - ITensorPack pack = - { - { TensorType::ACL_SRC, gemm_output_to_use }, - { TensorType::ACL_DST, dst } - }; + ITensorPack pack = {{TensorType::ACL_SRC, gemm_output_to_use}, {TensorType::ACL_DST, dst}}; _reshape->run(pack); } } void CpuGemmConv2d::prepare(ITensorPack &tensors) { - if(!_is_prepared) + if (!_is_prepared) { // Variable weights executions that use fixed-format kernels // need no reshaping of the weights. - if(this->isVarWeightsKernel()) + if (this->isVarWeightsKernel()) { _is_quantized ? 
_mm_gemmlowp->prepare(tensors) : _mm_gemm->prepare(tensors); _is_prepared = true; @@ -672,11 +722,7 @@ void CpuGemmConv2d::prepare(ITensorPack &tensors) // Run weights reshaping and mark original weights tensor as unused CpuAuxTensorHandler weights_reshaped(offset_int_vec(WeightsReshaped), _weights_reshaped, tensors); auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); - ITensorPack pack = - { - { TensorType::ACL_SRC, weights }, - { TensorType::ACL_DST, weights_reshaped.get() } - }; + ITensorPack pack = {{TensorType::ACL_SRC, weights}, {TensorType::ACL_DST, weights_reshaped.get()}}; NEScheduler::get().schedule_op(_weights_reshape_kernel.get(), 3, _weights_reshape_kernel->window(), pack); weights->mark_as_unused(); ITensorPack gemm_pack = tensors; diff --git a/src/cpu/operators/CpuGemmConv2d.h b/src/cpu/operators/CpuGemmConv2d.h index 61fe63a79f..118d366517 100644 --- a/src/cpu/operators/CpuGemmConv2d.h +++ b/src/cpu/operators/CpuGemmConv2d.h @@ -27,6 +27,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/cpu/ICpuOperator.h" #include <memory> @@ -106,17 +107,32 @@ public: * available which may introduce a drop of accuracy as well. Default is false * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported */ - void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo(), - const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false, unsigned int num_groups = 1); + void configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info = WeightsInfo(), + const Size2D &dilation = Size2D(1U, 1U), + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false, + unsigned int num_groups = 1); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuGemmConvolution::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), - bool enable_fast_math = false, unsigned int num_groups = 1); + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info = WeightsInfo(), + const Size2D &dilation = Size2D(1U, 1U), + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false, + unsigned int num_groups = 1); /** Indicates whether or not there is an optimal assembly implementation that can be used to process the given parameters. * @@ -124,10 +140,16 @@ public: * * @return a status. 
*/ - static Status has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, - const PadStrideInfo &conv_info, - const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), - const bool enable_fast_math = false); + static Status has_opt_impl(arm_compute::WeightFormat &expected_weight_format, + const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info = WeightsInfo(), + const Size2D &dilation = Size2D(1U, 1U), + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + const bool enable_fast_math = false); // Inherited methods overridden: void run(ITensorPack &tensors) override; @@ -150,8 +172,15 @@ private: * @param[in] fixed_format (Optional) Select GEMM execution with variable weights. * @param[in] weight_format (Optional) The layout to be used for the weights tensor when running GEMM with variable weights. */ - void configure_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo(), - bool enable_fast_math = false, int gemm_3d_depth = 1, bool fixed_format = false, arm_compute::WeightFormat weight_format = arm_compute::WeightFormat::UNSPECIFIED); + void configure_mm(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *output, + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false, + int gemm_3d_depth = 1, + bool fixed_format = false, + arm_compute::WeightFormat weight_format = arm_compute::WeightFormat::UNSPECIFIED); /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMConvolutionLayer matrix multiply routines * * @param[in] src Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32. @@ -170,8 +199,16 @@ private: * * @return a status */ - static Status validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo(), - bool enable_fast_math = false, int gemm_3d_depth = 1, bool skip_im2col = false, bool fixed_format = false, arm_compute::WeightFormat weight_format = arm_compute::WeightFormat::UNSPECIFIED); + static Status validate_mm(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false, + int gemm_3d_depth = 1, + bool skip_im2col = false, + bool fixed_format = false, + arm_compute::WeightFormat weight_format = arm_compute::WeightFormat::UNSPECIFIED); /** Static function to check if GEMM3D is supported in @ref NEGEMM or in @ref CpuGemmMLowpMatrixMultiplyCore * * @param[in] src Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32. 
@@ -182,7 +219,11 @@ private: * * @return a status */ - static Status validate_gemm3d(const ITensorInfo *src, const ITensorInfo *weights, const ActivationLayerInfo &act_info, int gemm_3d_depth, bool skip_im2col); + static Status validate_gemm3d(const ITensorInfo *src, + const ITensorInfo *weights, + const ActivationLayerInfo &act_info, + int gemm_3d_depth, + bool skip_im2col); struct SkipInfo { @@ -200,8 +241,11 @@ private: * * @return a SkipInfo instance. */ - static SkipInfo skip_im_col_info(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info, - const Size2D &dilation, const ActivationLayerInfo &act_info); + static SkipInfo skip_im_col_info(const ITensorInfo *src, + const ITensorInfo *weights, + const PadStrideInfo &conv_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info); /** Indicates if the convolution executes in variable weights mode. * @@ -236,7 +280,7 @@ private: bool _is_quantized; bool _is_prepared; - experimental::MemoryRequirements _aux_mem{ Count }; + experimental::MemoryRequirements _aux_mem{Count}; }; } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/operators/CpuGemmDirectConv2d.cpp b/src/cpu/operators/CpuGemmDirectConv2d.cpp index 5ce285cb6f..8fa81b1907 100644 --- a/src/cpu/operators/CpuGemmDirectConv2d.cpp +++ b/src/cpu/operators/CpuGemmDirectConv2d.cpp @@ -26,10 +26,10 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/FunctionDescriptors.h" + #include "src/common/utils/Log.h" #include "src/core/helpers/MemoryHelpers.h" #include "src/cpu/utils/CpuAuxTensorHandler.h" - #include "support/Cast.h" #include <set> @@ -43,7 +43,10 @@ using namespace arm_compute::utils::cast; namespace { -GEMMLowpOutputStageInfo calculate_output_stage_metadata(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const ActivationLayerInfo &act) +GEMMLowpOutputStageInfo calculate_output_stage_metadata(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const ActivationLayerInfo &act) { // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() // Extract and negate input and weights offset @@ -53,16 +56,15 @@ GEMMLowpOutputStageInfo calculate_output_stage_metadata(const ITensorInfo *src, const UniformQuantizationInfo uoqinfo = oqinfo.uniform(); const DataType data_type = src->data_type(); // Merge activation with output stage - const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU, - ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, - ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU - }; - PixelValue type_min{}; - PixelValue type_max{}; + const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { + ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, + ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU}; + PixelValue type_min{}; + PixelValue type_max{}; std::tie(type_min, type_max) = get_min_max(data_type); int32_t min_activation = type_min.get<int32_t>(); int32_t max_activation = type_max.get<int32_t>(); - if(supported_acts.count(act.activation()) != 0) + if (supported_acts.count(act.activation()) != 0) { std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act, data_type, uoqinfo); } @@ -107,31 +109,32 @@ CpuGemmDirectConv2d::CpuGemmDirectConv2d() 
CpuGemmDirectConv2d::~CpuGemmDirectConv2d() = default; -void CpuGemmDirectConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const Conv2dInfo &info) +void CpuGemmDirectConv2d::configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const Conv2dInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_ERROR_THROW_ON(CpuGemmDirectConv2d::validate(src, - weights, - biases != nullptr ? biases : nullptr, - dst, - info)); + ARM_COMPUTE_ERROR_THROW_ON( + CpuGemmDirectConv2d::validate(src, weights, biases != nullptr ? biases : nullptr, dst, info)); ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, info); _run_activation = info.act_info.enabled() && !_gemm_asm_func->is_activation_supported(info.act_info); _is_prepared = false; - _weights_permute_func->configure(weights, &_perm_weights, PermutationVector{ 3, 0, 1, 2 }); + _weights_permute_func->configure(weights, &_perm_weights, PermutationVector{3, 0, 1, 2}); // Configure assembly dispatch cpu::AsmGemmInfo asm_info = init_assembly_metadata(info, false); - if(is_data_type_quantized(src->data_type())) + if (is_data_type_quantized(src->data_type())) { asm_info.output_stage = calculate_output_stage_metadata(src, weights, dst, info.act_info); } _gemm_asm_func->configure(src, &_perm_weights, biases, dst, asm_info); // Configure activation - if(_run_activation) + if (_run_activation) { _activation_func->configure(dst, nullptr, info.act_info); } @@ -141,24 +144,33 @@ void CpuGemmDirectConv2d::configure(const ITensorInfo *src, const ITensorInfo *w _aux_mem[AsmGemmWorkspace] = asm_mem_req[AsmGemmWorkspace]; _aux_mem[Pretranspose] = asm_mem_req[Pretranspose]; - if(_aux_mem[Pretranspose].size > 0) + if (_aux_mem[Pretranspose].size > 0) { // Release permuted weights at the of prepare as they are further transposed by the assembly dispatch - _aux_mem[PermutedWeights] = MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Prepare, weights->total_size()); + _aux_mem[PermutedWeights] = + MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Prepare, weights->total_size()); } else { // We must permute weights if they are WeightFormat::UNSPECIFIED - if(info.weights_info.weight_format() == WeightFormat::UNSPECIFIED) - _aux_mem[PermutedWeights] = MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Persistent, weights->total_size()); + if (info.weights_info.weight_format() == WeightFormat::UNSPECIFIED) + _aux_mem[PermutedWeights] = + MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Persistent, weights->total_size()); } } -Status CpuGemmDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv2dInfo &info) +Status CpuGemmDirectConv2d::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const Conv2dInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::BFLOAT16, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::BFLOAT16, DataType::F16, DataType::F32); - if(!is_fixed_format(info.weights_info.weight_format())) + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, 
DataType::QASYMM8_SIGNED, + DataType::BFLOAT16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::QSYMM8_PER_CHANNEL, DataType::BFLOAT16, + DataType::F16, DataType::F32); + if (!is_fixed_format(info.weights_info.weight_format())) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, weights); } @@ -171,13 +183,13 @@ Status CpuGemmDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo * ARM_COMPUTE_RETURN_ERROR_ON(info.dilation != Size2D(1U, 1U)); ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4); // Validate biases - if(biases != nullptr) + if (biases != nullptr) { - if(is_data_type_quantized_asymmetric(data_type)) + if (is_data_type_quantized_asymmetric(data_type)) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); } - else if(data_type == DataType::BFLOAT16) + else if (data_type == DataType::BFLOAT16) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32); } @@ -198,31 +210,32 @@ void CpuGemmDirectConv2d::run(ITensorPack &tensors) prepare(tensors); _gemm_asm_func->run(tensors); - if(_run_activation) + if (_run_activation) { ITensor *io = tensors.get_tensor(ACL_DST); - ITensorPack pack{ { ACL_SRC, io }, { ACL_DST, io } }; + ITensorPack pack{{ACL_SRC, io}, {ACL_DST, io}}; _activation_func->run(pack); } } void CpuGemmDirectConv2d::prepare(ITensorPack &tensors) { - if(!_is_prepared) + if (!_is_prepared) { // If we are using fixed-format kernel the weights are already reshaped - if(_gemm_asm_func && _gemm_asm_func->isVarWeightsKernel()) + if (_gemm_asm_func && _gemm_asm_func->isVarWeightsKernel()) { _gemm_asm_func->prepare(tensors); _is_prepared = true; return; } - const ITensor *weights = tensors.get_const_tensor(ACL_SRC_1); - ITensor *weights_aux = utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(PermutedWeights))); + const ITensor *weights = tensors.get_const_tensor(ACL_SRC_1); + ITensor *weights_aux = + utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(PermutedWeights))); ARM_COMPUTE_ERROR_ON_NULLPTR(weights, weights_aux); CpuAuxTensorHandler permuted_weights(_perm_weights, *weights_aux); - ITensorPack permute_tensors{ { ACL_SRC, weights }, { ACL_DST, permuted_weights.get() } }; + ITensorPack permute_tensors{{ACL_SRC, weights}, {ACL_DST, permuted_weights.get()}}; _weights_permute_func->run(permute_tensors); tensors.add_const_tensor(ACL_SRC_1, permuted_weights.get()); diff --git a/src/cpu/operators/CpuGemmDirectConv2d.h b/src/cpu/operators/CpuGemmDirectConv2d.h index e55a461f36..1cc3caadae 100644 --- a/src/cpu/operators/CpuGemmDirectConv2d.h +++ b/src/cpu/operators/CpuGemmDirectConv2d.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_GEMM_DIRECT_CONV_2D_H #include "arm_compute/core/TensorInfo.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuOperator.h" #include "src/cpu/operators/CpuActivation.h" @@ -69,18 +70,26 @@ public: * Data types supported: Same as @p input. * @param[in] info Contains padding and stride information described in @ref PadStrideInfo. 
*/ - void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const Conv2dInfo &info); + void configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const Conv2dInfo &info); /** Static function to check if given info will lead to a valid configuration of @ref CpuGemmDirectConv2d * * Similar to CpuGemmDirectConv2d::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv2dInfo &info); + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const Conv2dInfo &info); // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &constants) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &constants) override; experimental::MemoryRequirements workspace() const override; private: diff --git a/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp b/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp index 8ca128fb07..2ee879b67b 100644 --- a/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp +++ b/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp @@ -28,14 +28,14 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" #include "arm_compute/runtime/TensorAllocator.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/MemoryHelpers.h" #include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/MemoryHelpers.h" #include "src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h" #include "src/cpu/kernels/CpuGemmInterleave4x4Kernel.h" #include "src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h" @@ -59,12 +59,12 @@ namespace cpu::AsmGemmInfo init_assembly_metadata(const GEMMInfo &info) { cpu::AsmGemmInfo asm_info; - asm_info.method = cpu::AsmConvMethod::Im2Col; - asm_info.reinterpret_input_as_3d = info.reinterpret_input_as_3d(); - asm_info.depth_output_gemm3d = info.depth_output_gemm3d(); - asm_info.activation_info = info.activation_info(); - asm_info.output_stage = info.gemmlowp_output_stage(); - asm_info.fast_mode = info.fast_math(); + asm_info.method = cpu::AsmConvMethod::Im2Col; + asm_info.reinterpret_input_as_3d = info.reinterpret_input_as_3d(); + asm_info.depth_output_gemm3d = info.depth_output_gemm3d(); + asm_info.activation_info = info.activation_info(); + asm_info.output_stage = info.gemmlowp_output_stage(); + asm_info.fast_mode = info.fast_math(); return asm_info; } @@ -105,7 +105,8 @@ CpuGemmLowpMatrixMultiplyCore::CpuGemmLowpMatrixMultiplyCore() } CpuGemmLowpMatrixMultiplyCore::~CpuGemmLowpMatrixMultiplyCore() = default; -void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *dst, const GEMMInfo &gemm_info) +void CpuGemmLowpMatrixMultiplyCore::configure( + const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *dst, const GEMMInfo &gemm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, dst); ARM_COMPUTE_ERROR_THROW_ON(CpuGemmLowpMatrixMultiplyCore::validate(a, b, c, dst, gemm_info)); @@ -122,28 +123,31 @@ 
void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITenso _reshape_b_only_on_first_run = b->are_values_constant(); _is_prepared = false; _fused_assembly_path = false; - _flip_signedness = is_data_type_quantized_per_channel(b->data_type()) && (a->data_type() == DataType::QASYMM8) && _reshape_b_only_on_first_run; - _gemm_info = gemm_info; + _flip_signedness = is_data_type_quantized_per_channel(b->data_type()) && (a->data_type() == DataType::QASYMM8) && + _reshape_b_only_on_first_run; + _gemm_info = gemm_info; _asm_glue = std::make_unique<cpu::CpuGemmAssemblyDispatch>(); const ITensorInfo *a_to_use = a; // Convert to QASYMM8 -> QASYMM8_SIGNED and back - if(_flip_signedness) + if (_flip_signedness) { const int32_t offset_correction = 128; const DataType dt = DataType::QASYMM8_SIGNED; const UniformQuantizationInfo iqinfo = a_to_use->quantization_info().uniform(); - _signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction)); + _signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info( + QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction)); _convert_to_signed_asymm = std::make_unique<kernels::CpuConvertQuantizedSignednessKernel>(); _convert_to_signed_asymm->configure(a_to_use, &_signed_a); a_to_use = &_signed_a; _a_offset = _signed_a.quantization_info().uniform().offset; const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform(); - _signed_output = dst->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction)); + _signed_output = dst->clone()->set_data_type(dt).set_quantization_info( + QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction)); // Output stage correction GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage(); @@ -157,7 +161,7 @@ void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITenso } // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage - if(info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE) + if (info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE) { _fuse_output_stage = true; _mm_result_s32 = TensorInfo(dst->tensor_shape(), 1, DataType::S32); @@ -166,16 +170,18 @@ void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITenso // Initialize assembly kernel meta-data const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info); #ifdef __aarch64__ - if(!(!b->are_values_constant() && b->tensor_shape().z() > 1)) // Disable batch matmul as optimized GeMM handles batching differently. + if (!(!b->are_values_constant() && + b->tensor_shape().z() > 1)) // Disable batch matmul as optimized GeMM handles batching differently. { - switch(a->data_type()) + switch (a->data_type()) { case DataType::QASYMM8: case DataType::QASYMM8_SIGNED: case DataType::U8: case DataType::S8: { - if(is_data_type_quantized_asymmetric(a_to_use->data_type()) && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + if (is_data_type_quantized_asymmetric(a_to_use->data_type()) && + info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { auto c_info_to_use = c == nullptr ? 
nullptr : c; _asm_glue->configure(a_to_use, b, c_info_to_use, dst, asm_info); @@ -197,13 +203,14 @@ void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITenso } } #endif /* __aarch64__ */ - if(!(_assembly_path || _run_vector_matrix_multiplication)) + if (!(_assembly_path || _run_vector_matrix_multiplication)) { matrix_a = &_tmp_a; matrix_b = &_tmp_b; // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ] - _tmp_a = TensorInfo(compute_interleaved_shape(*a_to_use), 1, a_to_use->data_type(), a_to_use->quantization_info()); + _tmp_a = + TensorInfo(compute_interleaved_shape(*a_to_use), 1, a_to_use->data_type(), a_to_use->quantization_info()); // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ] _tmp_b = TensorInfo(compute_transpose1xW_shape(*b), 1, b->data_type(), b->quantization_info()); @@ -216,13 +223,13 @@ void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITenso _mtx_b_reshape_kernel->configure(b, &_tmp_b); } - if(!_fused_assembly_path) + if (!_fused_assembly_path) { // Build reduction info const GEMMLowpReductionKernelInfo reduction_info(a_to_use->dimension(0), false, 0, false); // Initialize matrix B reduction kernel only if _a_offset is not equal to 0 - if(_a_offset != 0) + if (_a_offset != 0) { _vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32); @@ -232,7 +239,7 @@ void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITenso } // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0 - if(_b_offset != 0) + if (_b_offset != 0) { _vector_sum_row = TensorInfo(compute_reductionB_shape(*a_to_use), 1, DataType::S32); @@ -241,24 +248,23 @@ void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITenso _mtx_a_reduction_kernel->configure(a_to_use, &_vector_sum_row, reduction_info); } - if(_fuse_output_stage) + if (_fuse_output_stage) { // Configure matrix multiply kernel - if(!_assembly_path) + if (!_assembly_path) { _mm_kernel = std::make_unique<kernels::CpuGemmLowpMatrixMultiplyKernel>(); _mm_kernel->configure(matrix_a, matrix_b, &_mm_result_s32); } - _offset_contribution_output_stage_kernel = std::make_unique<kernels::CpuGemmLowpOffsetContributionOutputStageKernel>(); - _offset_contribution_output_stage_kernel->configure(&_mm_result_s32, - _a_offset == 0 ? nullptr : &_vector_sum_col, - _b_offset == 0 ? nullptr : &_vector_sum_row, c, - _flip_signedness ? &_signed_output : dst, - a->dimension(0), - _a_offset, _b_offset, info.gemmlowp_output_stage()); + _offset_contribution_output_stage_kernel = + std::make_unique<kernels::CpuGemmLowpOffsetContributionOutputStageKernel>(); + _offset_contribution_output_stage_kernel->configure( + &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, + _b_offset == 0 ? nullptr : &_vector_sum_row, c, _flip_signedness ? 
&_signed_output : dst, + a->dimension(0), _a_offset, _b_offset, info.gemmlowp_output_stage()); - if(_flip_signedness) + if (_flip_signedness) { _convert_from_signed_asymm = std::make_unique<kernels::CpuConvertQuantizedSignednessKernel>(); _convert_from_signed_asymm->configure(&_signed_output, dst); @@ -267,27 +273,29 @@ void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITenso else { // Configure matrix multiply kernel - if(!_assembly_path) + if (!_assembly_path) { _mm_kernel = std::make_unique<kernels::CpuGemmLowpMatrixMultiplyKernel>(); _mm_kernel->configure(matrix_a, matrix_b, dst); } // Configure offset contribution kernel _offset_contribution_kernel = std::make_unique<kernels::CpuGemmLowpOffsetContributionKernel>(); - _offset_contribution_kernel->configure(dst, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a_to_use->dimension(0), + _offset_contribution_kernel->configure(dst, _a_offset == 0 ? nullptr : &_vector_sum_col, + _b_offset == 0 ? nullptr : &_vector_sum_row, a_to_use->dimension(0), _a_offset, _b_offset); } } // Configure activation const ActivationLayerInfo &activation = gemm_info.activation_info(); - _run_activation = activation.enabled() && (!_assembly_path || !cpu::CpuGemmAssemblyDispatch::is_activation_supported(activation)); - if(_run_activation) + _run_activation = + activation.enabled() && (!_assembly_path || !cpu::CpuGemmAssemblyDispatch::is_activation_supported(activation)); + if (_run_activation) { _activation_func = std::make_unique<CpuActivation>(); _activation_func->configure(dst, nullptr, activation); } - if(_assembly_path) + if (_assembly_path) { auto asm_mem_req = _asm_glue->workspace(); _aux_mem[AsmGemmWorkspace] = asm_mem_req[AsmGemmWorkspace]; @@ -295,27 +303,41 @@ void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITenso } // Request memory for LHS and RHS reshape matrix - _aux_mem[VectorSumCol] = MemoryInfo(offset_int_vec(VectorSumCol), !_fused_assembly_path && _a_offset != 0 - && _reshape_b_only_on_first_run ? - MemoryLifetime::Persistent : - MemoryLifetime::Temporary, - _vector_sum_col.total_size()); - _aux_mem[VectorSumRow] = MemoryInfo(offset_int_vec(VectorSumRow), MemoryLifetime::Temporary, _vector_sum_row.total_size()); - _aux_mem[TmpA] = MemoryInfo(offset_int_vec(TmpA), MemoryLifetime::Temporary, _tmp_a.total_size()); - _aux_mem[TmpB] = MemoryInfo(offset_int_vec(TmpB), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size()); - _aux_mem[MMResultS32] = MemoryInfo(offset_int_vec(MMResultS32), MemoryLifetime::Temporary, _mm_result_s32.total_size()); - _aux_mem[SignedA] = MemoryInfo(offset_int_vec(SignedA), MemoryLifetime::Temporary, _signed_a.total_size()); - _aux_mem[SignedOutput] = MemoryInfo(offset_int_vec(SignedOutput), MemoryLifetime::Temporary, _signed_output.total_size()); + _aux_mem[VectorSumCol] = + MemoryInfo(offset_int_vec(VectorSumCol), + !_fused_assembly_path && _a_offset != 0 && _reshape_b_only_on_first_run ? MemoryLifetime::Persistent + : MemoryLifetime::Temporary, + _vector_sum_col.total_size()); + _aux_mem[VectorSumRow] = + MemoryInfo(offset_int_vec(VectorSumRow), MemoryLifetime::Temporary, _vector_sum_row.total_size()); + _aux_mem[TmpA] = MemoryInfo(offset_int_vec(TmpA), MemoryLifetime::Temporary, _tmp_a.total_size()); + _aux_mem[TmpB] = MemoryInfo(offset_int_vec(TmpB), + _reshape_b_only_on_first_run ? 
MemoryLifetime::Persistent : MemoryLifetime::Temporary, + _tmp_b.total_size()); + _aux_mem[MMResultS32] = + MemoryInfo(offset_int_vec(MMResultS32), MemoryLifetime::Temporary, _mm_result_s32.total_size()); + _aux_mem[SignedA] = MemoryInfo(offset_int_vec(SignedA), MemoryLifetime::Temporary, _signed_a.total_size()); + _aux_mem[SignedOutput] = + MemoryInfo(offset_int_vec(SignedOutput), MemoryLifetime::Temporary, _signed_output.total_size()); } -Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info) +Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + const GEMMInfo &gemm_info) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(c != nullptr && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE, "Bias addition not supported in NEGEMMLowpMatrixMultiplyCore for output S32"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1), - "The product AB is defined only if the number of columns in A is equal to the number of rows in B"); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::QASYMM8, + DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(c != nullptr && + gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE, + "Bias addition not supported in NEGEMMLowpMatrixMultiplyCore for output S32"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + (a)->dimension(0) != (b)->dimension(1), + "The product AB is defined only if the number of columns in A is equal to the number of rows in B"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported"); @@ -333,28 +355,32 @@ Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITens int32_t b_offset = b->quantization_info().uniform().offset; bool fuse_output_stage = info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE; - if(fuse_output_stage) + if (fuse_output_stage) { - auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32)); + auto_init_if_empty(mm_result_s32_info, + a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32)); } // Convert QASYMM8->QASYMM8_SIGNED TensorInfo signed_a{}; TensorInfo signed_output{}; - bool flip_signedness = is_data_type_quantized_per_channel(b->data_type()) && (a->data_type() == DataType::QASYMM8) && info.reshape_b_only_on_first_run(); - if(flip_signedness) + bool flip_signedness = is_data_type_quantized_per_channel(b->data_type()) && + (a->data_type() == DataType::QASYMM8) && info.reshape_b_only_on_first_run(); + if (flip_signedness) { const int32_t offset_correction = 128; const DataType dt = DataType::QASYMM8_SIGNED; const 
UniformQuantizationInfo iqinfo = a_to_use->quantization_info().uniform(); - signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction)); + signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info( + QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction)); ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuConvertQuantizedSignednessKernel::validate(a_to_use, &signed_a)); a_to_use = &signed_a; a_offset = signed_a.quantization_info().uniform().offset; const UniformQuantizationInfo oqinfo = output->quantization_info().uniform(); - signed_output = output->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction)); + signed_output = output->clone()->set_data_type(dt).set_quantization_info( + QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction)); // Output stage correction GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage(); @@ -374,25 +400,28 @@ Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITens bool run_optimised = false; bool run_optimised_requantized = false; - if(!(!b->are_values_constant() && b->tensor_shape().z() > 1)) // Disable batch matmul as optimized GeMM handles batching differently. + if (!(!b->are_values_constant() && + b->tensor_shape().z() > 1)) // Disable batch matmul as optimized GeMM handles batching differently. { - if(is_data_type_quantized_asymmetric(a_to_use->data_type()) && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + if (is_data_type_quantized_asymmetric(a_to_use->data_type()) && + info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { run_optimised = bool(CpuGemmAssemblyDispatch::validate(a_to_use, b, c, output, asm_info)); run_optimised_requantized = run_optimised; } else { - run_optimised = bool(CpuGemmAssemblyDispatch::validate(a_to_use, b, nullptr, fuse_output_stage ? &mm_result_s32_info : output, asm_info)); + run_optimised = bool(CpuGemmAssemblyDispatch::validate( + a_to_use, b, nullptr, fuse_output_stage ? 
&mm_result_s32_info : output, asm_info)); } } - if(run_optimised) + if (run_optimised) { ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0)); - if(info.depth_output_gemm3d() != 0) + if (info.depth_output_gemm3d() != 0) { - if(info.reinterpret_input_as_3d()) + if (info.reinterpret_input_as_3d()) { ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1)); ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2)); @@ -409,11 +438,13 @@ Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITens } else { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "NEGEMM cannot reinterpret the input tensor as 3D"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "NEGEMM cannot reinterpret the output tensor as 3D"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), + "NEGEMM cannot reinterpret the input tensor as 3D"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, + "NEGEMM cannot reinterpret the output tensor as 3D"); const bool run_vector_matrix_multiplication = a->dimension(1) < 2; - if(!run_vector_matrix_multiplication) + if (!run_vector_matrix_multiplication) { matrix_a_info = &tmp_a_info; matrix_b_info = &tmp_b_info; @@ -437,7 +468,7 @@ Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITens } } - if(!run_optimised_requantized) + if (!run_optimised_requantized) { TensorInfo info_vector_sum_col{}; TensorInfo info_vector_sum_row{}; @@ -445,62 +476,70 @@ Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITens const GEMMLowpReductionKernelInfo reduction_info(a_to_use->dimension(0), false, 0, false); // Validate matrix B reduction kernel only if _a_offset is not equal to 0 - if(a_offset != 0) + if (a_offset != 0) { info_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32); // Configure Matrix B reduction kernel - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixBReductionKernel::validate(b, &info_vector_sum_col, reduction_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::CpuGemmLowpMatrixBReductionKernel::validate(b, &info_vector_sum_col, reduction_info)); } // Validate Matrix A reduction kernel only if _b_offset is not equal to 0 - if(b_offset != 0) + if (b_offset != 0) { info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32); // Configure matrix A reduction kernel - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixAReductionKernel::validate(a_to_use, &info_vector_sum_row, reduction_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::CpuGemmLowpMatrixAReductionKernel::validate(a_to_use, &info_vector_sum_row, reduction_info)); } - if(fuse_output_stage) + if (fuse_output_stage) { - if(!run_optimised) + if (!run_optimised) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D"); - - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info)); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + info.reinterpret_input_as_3d(), + "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + info.depth_output_gemm3d() != 0, + "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D"); + + 
ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixMultiplyKernel::validate( + matrix_a_info, matrix_b_info, &mm_result_s32_info)); } // Validate offset contribution kernel - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info, - a_offset == 0 ? nullptr : &info_vector_sum_col, - b_offset == 0 ? nullptr : &info_vector_sum_row, - c, - flip_signedness ? &signed_output : output, - a_offset, b_offset, - info.gemmlowp_output_stage())); + ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpOffsetContributionOutputStageKernel::validate( + &mm_result_s32_info, a_offset == 0 ? nullptr : &info_vector_sum_col, + b_offset == 0 ? nullptr : &info_vector_sum_row, c, flip_signedness ? &signed_output : output, a_offset, + b_offset, info.gemmlowp_output_stage())); } else { - if(!run_optimised) + if (!run_optimised) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D"); - - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output)); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + info.reinterpret_input_as_3d(), + "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + info.depth_output_gemm3d() != 0, + "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D"); + + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::CpuGemmLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output)); } // Validate offset contribution kernel - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpOffsetContributionKernel::validate(output, - a_offset == 0 ? nullptr : &info_vector_sum_col, - b_offset == 0 ? nullptr : &info_vector_sum_row, - a_offset, b_offset)); + ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpOffsetContributionKernel::validate( + output, a_offset == 0 ? nullptr : &info_vector_sum_col, b_offset == 0 ? nullptr : &info_vector_sum_row, + a_offset, b_offset)); } } // Validate activation const ActivationLayerInfo &activation = gemm_info.activation_info(); - if(activation.enabled()) + if (activation.enabled()) { ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(output, nullptr, activation)); } @@ -529,24 +568,22 @@ void CpuGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors) CpuAuxTensorHandler signed_output(offset_int_vec(SignedOutput), _signed_output, tensors, false); // Convert QASYMM8->QASYMM8_SIGNED - if(_flip_signedness) + if (_flip_signedness) { - ITensorPack pack = - { - { TensorType::ACL_SRC, a }, - { TensorType::ACL_DST, signed_a.get() } - }; - NEScheduler::get().schedule_op(_convert_to_signed_asymm.get(), Window::DimY, _convert_to_signed_asymm->window(), pack); + ITensorPack pack = {{TensorType::ACL_SRC, a}, {TensorType::ACL_DST, signed_a.get()}}; + NEScheduler::get().schedule_op(_convert_to_signed_asymm.get(), Window::DimY, _convert_to_signed_asymm->window(), + pack); a_to_use = signed_a.get(); matrix_a = signed_a.get(); } // Run GEMM - if(_asm_glue->is_configured()) + if (_asm_glue->is_configured()) { ITensorPack asm_glue_tensors = tensors; auto output_to_use = (_fuse_output_stage ? 
mm_result_s32.get() : dst); - if(is_data_type_quantized_asymmetric(a_to_use->info()->data_type()) && _gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + if (is_data_type_quantized_asymmetric(a_to_use->info()->data_type()) && + _gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_0, a_to_use); asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_1, b); @@ -563,35 +600,25 @@ void CpuGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors) } else { - if(!_run_vector_matrix_multiplication) + if (!_run_vector_matrix_multiplication) { matrix_a = tmp_a.get(); matrix_b = tmp_b.get(); // Run interleave kernel - ITensorPack pack_a = - { - { TensorType::ACL_SRC, a_to_use }, - { TensorType::ACL_DST, tmp_a.get() } - }; - NEScheduler::get().schedule_op(_mtx_a_reshape_kernel.get(), Window::DimY, _mtx_a_reshape_kernel->window(), pack_a); + ITensorPack pack_a = {{TensorType::ACL_SRC, a_to_use}, {TensorType::ACL_DST, tmp_a.get()}}; + NEScheduler::get().schedule_op(_mtx_a_reshape_kernel.get(), Window::DimY, _mtx_a_reshape_kernel->window(), + pack_a); - if(!_reshape_b_only_on_first_run) + if (!_reshape_b_only_on_first_run) { - ITensorPack pack_b = - { - { TensorType::ACL_SRC, b }, - { TensorType::ACL_DST, tmp_b.get() } - }; + ITensorPack pack_b = {{TensorType::ACL_SRC, b}, {TensorType::ACL_DST, tmp_b.get()}}; // Run transpose kernel - NEScheduler::get().schedule_op(_mtx_b_reshape_kernel.get(), Window::DimY, _mtx_b_reshape_kernel->window(), pack_b); + NEScheduler::get().schedule_op(_mtx_b_reshape_kernel.get(), Window::DimY, + _mtx_b_reshape_kernel->window(), pack_b); } } - ITensorPack pack_mm = - { - { TensorType::ACL_SRC_0, matrix_a }, - { TensorType::ACL_SRC_1, matrix_b } - }; - if(_fuse_output_stage) + ITensorPack pack_mm = {{TensorType::ACL_SRC_0, matrix_a}, {TensorType::ACL_SRC_1, matrix_b}}; + if (_fuse_output_stage) { pack_mm.add_tensor(TensorType::ACL_DST, mm_result_s32.get()); } @@ -602,31 +629,25 @@ void CpuGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors) NEScheduler::get().schedule_op(_mm_kernel.get(), Window::DimY, _mm_kernel->window(), pack_mm); } - if(!_fused_assembly_path) + if (!_fused_assembly_path) { // Run matrix A reduction kernel only if _b_offset is not equal to 0 - if(_b_offset != 0) + if (_b_offset != 0) { - ITensorPack pack = - { - { TensorType::ACL_SRC, a_to_use }, - { TensorType::ACL_DST, vector_sum_row.get() } - }; - NEScheduler::get().schedule_op(_mtx_a_reduction_kernel.get(), Window::DimX, _mtx_a_reduction_kernel->window(), pack); + ITensorPack pack = {{TensorType::ACL_SRC, a_to_use}, {TensorType::ACL_DST, vector_sum_row.get()}}; + NEScheduler::get().schedule_op(_mtx_a_reduction_kernel.get(), Window::DimX, + _mtx_a_reduction_kernel->window(), pack); } // Run matrix B reduction kernel only if _a_offset is not equal to 0 - if(_a_offset != 0 && !_reshape_b_only_on_first_run) + if (_a_offset != 0 && !_reshape_b_only_on_first_run) { - ITensorPack pack = - { - { TensorType::ACL_SRC, b }, - { TensorType::ACL_DST, vector_sum_col.get() } - }; - NEScheduler::get().schedule_op(_mtx_b_reduction_kernel.get(), Window::DimX, _mtx_b_reduction_kernel->window(), pack); + ITensorPack pack = {{TensorType::ACL_SRC, b}, {TensorType::ACL_DST, vector_sum_col.get()}}; + NEScheduler::get().schedule_op(_mtx_b_reduction_kernel.get(), Window::DimX, + _mtx_b_reduction_kernel->window(), pack); } - if(_fuse_output_stage) + if (_fuse_output_stage) { ITensorPack pack; 
pack.add_tensor(TensorType::ACL_SRC_0, mm_result_s32.get()); @@ -636,7 +657,8 @@ void CpuGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors) pack.add_tensor(TensorType::ACL_DST, _flip_signedness ? signed_output.get() : dst); // Run offset contribution kernel - NEScheduler::get().schedule_op(_offset_contribution_output_stage_kernel.get(), Window::DimY, _offset_contribution_output_stage_kernel->window(), pack); + NEScheduler::get().schedule_op(_offset_contribution_output_stage_kernel.get(), Window::DimY, + _offset_contribution_output_stage_kernel->window(), pack); } else { @@ -646,68 +668,57 @@ void CpuGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors) pack.add_tensor(TensorType::ACL_DST, dst); // Run offset contribution kernel - NEScheduler::get().schedule_op(_offset_contribution_kernel.get(), Window::DimY, _offset_contribution_kernel->window(), pack); + NEScheduler::get().schedule_op(_offset_contribution_kernel.get(), Window::DimY, + _offset_contribution_kernel->window(), pack); } } // Convert QASYMM8_SIGNED->QASYMM8 - if(!_fused_assembly_path && _fuse_output_stage && _flip_signedness) + if (!_fused_assembly_path && _fuse_output_stage && _flip_signedness) { - ITensorPack pack = - { - { TensorType::ACL_SRC, signed_output.get() }, - { TensorType::ACL_DST, dst } - }; - NEScheduler::get().schedule_op(_convert_from_signed_asymm.get(), Window::DimY, _convert_from_signed_asymm->window(), pack); + ITensorPack pack = {{TensorType::ACL_SRC, signed_output.get()}, {TensorType::ACL_DST, dst}}; + NEScheduler::get().schedule_op(_convert_from_signed_asymm.get(), Window::DimY, + _convert_from_signed_asymm->window(), pack); } // Run fused activation unless already run in the fused assembly - if(_run_activation) + if (_run_activation) { - ITensorPack pack = - { - { TensorType::ACL_SRC, dst }, - { TensorType::ACL_DST, dst } - }; + ITensorPack pack = {{TensorType::ACL_SRC, dst}, {TensorType::ACL_DST, dst}}; _activation_func->run(pack); } } void CpuGemmLowpMatrixMultiplyCore::prepare(ITensorPack &tensors) { - if(!_is_prepared) + if (!_is_prepared) { auto original_b = tensors.get_const_tensor(TensorType::ACL_SRC_1); // Run assembly reshape - if(_asm_glue->is_configured()) + if (_asm_glue->is_configured()) { _asm_glue->prepare(tensors); } // Run non-assembly reshape - else if(_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication && !_asm_glue->is_configured()) + else if (_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication && !_asm_glue->is_configured()) { // Run reshape kernel and mark original weights tensor as unused - ITensor *tmp_b_p = utils::cast::polymorphic_downcast<ITensor *>(tensors.get_tensor(offset_int_vec(TmpB))); + ITensor *tmp_b_p = utils::cast::polymorphic_downcast<ITensor *>(tensors.get_tensor(offset_int_vec(TmpB))); CpuAuxTensorHandler tmp_b(_tmp_b, *tmp_b_p); - ITensorPack pack = - { - { TensorType::ACL_SRC, original_b }, - { TensorType::ACL_DST, tmp_b.get() } - }; - NEScheduler::get().schedule_op(_mtx_b_reshape_kernel.get(), Window::DimY, _mtx_b_reshape_kernel->window(), pack); + ITensorPack pack = {{TensorType::ACL_SRC, original_b}, {TensorType::ACL_DST, tmp_b.get()}}; + NEScheduler::get().schedule_op(_mtx_b_reshape_kernel.get(), Window::DimY, _mtx_b_reshape_kernel->window(), + pack); } // Run matrix B reduction kernel only if _a_offset is not equal to 0 - if(!_fused_assembly_path && _a_offset != 0 && _reshape_b_only_on_first_run) + if (!_fused_assembly_path && _a_offset != 0 && _reshape_b_only_on_first_run) { - ITensor *vector_sum_col_p = 
utils::cast::polymorphic_downcast<ITensor *>(tensors.get_tensor(offset_int_vec(VectorSumCol))); + ITensor *vector_sum_col_p = + utils::cast::polymorphic_downcast<ITensor *>(tensors.get_tensor(offset_int_vec(VectorSumCol))); CpuAuxTensorHandler vector_sum_col(_vector_sum_col, *vector_sum_col_p); - ITensorPack pack = - { - { TensorType::ACL_SRC, original_b }, - { TensorType::ACL_DST, vector_sum_col.get() } - }; - NEScheduler::get().schedule_op(_mtx_b_reduction_kernel.get(), Window::DimX, _mtx_b_reduction_kernel->window(), pack); + ITensorPack pack = {{TensorType::ACL_SRC, original_b}, {TensorType::ACL_DST, vector_sum_col.get()}}; + NEScheduler::get().schedule_op(_mtx_b_reduction_kernel.get(), Window::DimX, + _mtx_b_reduction_kernel->window(), pack); } _is_prepared = true; } diff --git a/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h b/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h index a1b34291d0..a7798938e7 100644 --- a/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h +++ b/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h @@ -26,6 +26,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/function_info/GEMMInfo.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuOperator.h" @@ -108,18 +109,26 @@ public: * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and * if the reshape of matrix B should be executed only for the first run */ - void configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *dst, const GEMMInfo &gemm_info = GEMMInfo()); + void configure(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + ITensorInfo *dst, + const GEMMInfo &gemm_info = GEMMInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuGemmLowpMatrixMultiplyCore::configure() * * @return a status */ - static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *dst, const GEMMInfo &gemm_info = GEMMInfo()); + static Status validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *dst, + const GEMMInfo &gemm_info = GEMMInfo()); // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &tensors) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &tensors) override; experimental::MemoryRequirements workspace() const override; private: diff --git a/src/cpu/operators/CpuGemmLowpOutputStage.cpp b/src/cpu/operators/CpuGemmLowpOutputStage.cpp index 58f98acff0..4215eed199 100644 --- a/src/cpu/operators/CpuGemmLowpOutputStage.cpp +++ b/src/cpu/operators/CpuGemmLowpOutputStage.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h" #include "src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h" @@ -36,36 +37,42 @@ namespace arm_compute { namespace cpu { -void CpuGemmLowpOutputStage::configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo &info) +void CpuGemmLowpOutputStage::configure(ITensorInfo *src, + ITensorInfo *bias, + ITensorInfo *dst, + const GEMMLowpOutputStageInfo &info) { // Perform validate step ARM_COMPUTE_ERROR_THROW_ON(CpuGemmLowpOutputStage::validate(src, bias, dst, info)); ARM_COMPUTE_LOG_PARAMS(src, bias, dst, info); - 
switch(info.type) + switch (info.type) { case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT: { - switch(info.output_data_type) + switch (info.output_data_type) { case DataType::QASYMM8: { auto k = std::make_unique<kernels::CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel>(); - k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset, info.gemmlowp_min_bound, info.gemmlowp_max_bound); + k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset, + info.gemmlowp_min_bound, info.gemmlowp_max_bound); _kernel = std::move(k); break; } case DataType::QASYMM8_SIGNED: { auto k = std::make_unique<kernels::CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel>(); - k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset, info.gemmlowp_min_bound, info.gemmlowp_max_bound); + k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset, + info.gemmlowp_min_bound, info.gemmlowp_max_bound); _kernel = std::move(k); break; } case DataType::QSYMM16: { auto k = std::make_unique<kernels::CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel>(); - k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_min_bound, info.gemmlowp_max_bound); + k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_min_bound, + info.gemmlowp_max_bound); _kernel = std::move(k); break; } @@ -79,7 +86,7 @@ void CpuGemmLowpOutputStage::configure(ITensorInfo *src, ITensorInfo *bias, ITen } case GEMMLowpOutputStageType::QUANTIZE_DOWN: { - switch(info.output_data_type) + switch (info.output_data_type) { case DataType::QASYMM8: case DataType::QASYMM8_SIGNED: @@ -102,32 +109,41 @@ void CpuGemmLowpOutputStage::configure(ITensorInfo *src, ITensorInfo *bias, ITen } } -Status CpuGemmLowpOutputStage::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo &info) +Status CpuGemmLowpOutputStage::validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::UNKNOWN, "CpuGemmLowpOutputStage cannot be used with UNKNOWN output data type."); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16); - ARM_COMPUTE_RETURN_ERROR_ON((info.type != GEMMLowpOutputStageType::QUANTIZE_DOWN) && (info.type != GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::UNKNOWN, + "CpuGemmLowpOutputStage cannot be used with UNKNOWN output data type."); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::QSYMM16); + ARM_COMPUTE_RETURN_ERROR_ON((info.type != GEMMLowpOutputStageType::QUANTIZE_DOWN) && + (info.type != GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)); - switch(info.type) + switch (info.type) { case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT: { - switch(dst->data_type()) + switch (dst->data_type()) { case DataType::QASYMM8: - return kernels::CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(src, bias, dst, info.gemmlowp_min_bound, info.gemmlowp_max_bound); + return kernels::CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate( + src, bias, dst, 
info.gemmlowp_min_bound, info.gemmlowp_max_bound); case DataType::QASYMM8_SIGNED: - return kernels::CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate(src, bias, dst, info.gemmlowp_min_bound, info.gemmlowp_max_bound); + return kernels::CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate( + src, bias, dst, info.gemmlowp_min_bound, info.gemmlowp_max_bound); case DataType::QSYMM16: - return kernels::CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate(src, bias, dst, info.gemmlowp_min_bound, info.gemmlowp_max_bound); + return kernels::CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate( + src, bias, dst, info.gemmlowp_min_bound, info.gemmlowp_max_bound); default: return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported output data type."); } } case GEMMLowpOutputStageType::QUANTIZE_DOWN: { - switch(dst->data_type()) + switch (dst->data_type()) { case DataType::QASYMM8: case DataType::QASYMM8_SIGNED: @@ -146,4 +162,4 @@ void CpuGemmLowpOutputStage::run(ITensorPack &tensors) NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors); } } // namespace cpu -} // namespace arm_compute
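
Editor's note (illustrative, not part of the patch): the CpuGemmLowpOutputStage hunks above dispatch to fixed-point quantize-down kernels parameterized by gemmlowp_multiplier, gemmlowp_shift, gemmlowp_offset and the min/max bounds. The scalar sketch below models the QUANTIZE_DOWN_FIXEDPOINT arithmetic these kernels vectorize; the rounding and shift conventions are a simplification of the real kernels, and quantize_down_fixedpoint() is a hypothetical name for illustration only.

#include <algorithm>
#include <cstdint>

// Scalar model of the fixed-point quantize-down step: the int32 GEMM
// accumulator is scaled by a Q0.31 multiplier, shifted, offset by the output
// zero point and clamped to the 8-bit output range.
static uint8_t quantize_down_fixedpoint(int32_t acc,
                                        int32_t gemmlowp_multiplier,
                                        int32_t gemmlowp_shift,
                                        int32_t gemmlowp_offset,
                                        int32_t min_bound,
                                        int32_t max_bound)
{
    // Fixed-point multiply: (acc * multiplier) / 2^31, rounded to nearest.
    const int64_t prod   = static_cast<int64_t>(acc) * gemmlowp_multiplier;
    int32_t       scaled = static_cast<int32_t>((prod + (int64_t{1} << 30)) >> 31);

    // Apply the output-stage shift (positive = divide with rounding, negative = multiply).
    if (gemmlowp_shift > 0)
    {
        scaled = (scaled + (1 << (gemmlowp_shift - 1))) >> gemmlowp_shift;
    }
    else
    {
        scaled <<= -gemmlowp_shift;
    }

    // Add the output zero point and clamp to the (possibly activation-tightened) bounds.
    scaled += gemmlowp_offset;
    scaled  = std::min(std::max(scaled, min_bound), max_bound);
    return static_cast<uint8_t>(scaled);
}
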
\ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/operators/CpuGemmLowpOutputStage.h b/src/cpu/operators/CpuGemmLowpOutputStage.h index 39394f6b5f..e5e2f41fa9 100644 --- a/src/cpu/operators/CpuGemmLowpOutputStage.h +++ b/src/cpu/operators/CpuGemmLowpOutputStage.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_GEMMLOWP_OUTPUT_STAGE_H #include "arm_compute/core/Types.h" + #include "src/cpu/ICpuOperator.h" /** This file contains all available output stages for GEMMLowp. @@ -76,7 +77,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo &info); + static Status validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo &info); // Inherited methods overridden: void run(ITensorPack &tensors) override; diff --git a/src/cpu/operators/CpuMatMul.cpp b/src/cpu/operators/CpuMatMul.cpp index 8811a7ea6b..89087129c3 100644 --- a/src/cpu/operators/CpuMatMul.cpp +++ b/src/cpu/operators/CpuMatMul.cpp @@ -23,14 +23,16 @@ */ #include "src/cpu/operators/CpuMatMul.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" + #include "arm_compute/core/experimental/Types.h" +#include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/function_info/MatMulInfo.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" #include "arm_compute/runtime/NEON/functions/NEMatMul.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" @@ -46,8 +48,11 @@ namespace cpu { namespace { -Status get_gemmlowp_output_stage_info(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const ActivationLayerInfo &act, - GEMMLowpOutputStageInfo &gemmlowp_output_stage_info) +Status get_gemmlowp_output_stage_info(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const ActivationLayerInfo &act, + GEMMLowpOutputStageInfo &gemmlowp_output_stage_info) { const auto data_type = src->data_type(); const QuantizationInfo oq_info = dst->quantization_info(); @@ -59,10 +64,11 @@ Status get_gemmlowp_output_stage_info(const ITensorInfo *src, const ITensorInfo int32_t output_multiplier; int32_t output_shift; - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); - int32_t type_min = 0; - int32_t type_max = 0; + int32_t type_min = 0; + int32_t type_max = 0; std::tie(type_min, type_max) = quantization::get_quantized_asymmetric_output_min_max(oq_info, act, data_type); gemmlowp_output_stage_info.gemmlowp_multiplier = output_multiplier; @@ -77,14 +83,27 @@ Status get_gemmlowp_output_stage_info(const ITensorInfo *src, const ITensorInfo } // namespace CpuMatMul::CpuMatMul() - : _transpose_kernel_lhs(), _transpose_kernel_rhs(), _asm_glue(), _lhs_transposed(), _rhs_transposed(), _original_lhs_shape(), _original_rhs_shape(), _original_dst_shape() + : _transpose_kernel_lhs(), + _transpose_kernel_rhs(), + _asm_glue(), + _lhs_transposed(), + _rhs_transposed(), + _original_lhs_shape(), + _original_rhs_shape(), + _original_dst_shape() { } -Status 
CpuMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info) +Status CpuMatMul::validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *dst, + const MatMulInfo &info, + const CpuMatMulSettings &settings, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16, DataType::QASYMM8, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16, DataType::QASYMM8, + DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs->are_values_constant(), "LHS Tensor must be dynamic."); ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs->are_values_constant(), "RHS Tensor must be dynamic."); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(lhs); @@ -103,34 +122,39 @@ Status CpuMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const gemm_info.fast_mode = settings.fast_math(); // Validate and then permute a/b - if(adj_lhs) + if (adj_lhs) { - auto_init_if_empty(lhs_transposed, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*lhs))); + auto_init_if_empty(lhs_transposed, + lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*lhs))); ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuTransposeKernel::validate(lhs_to_use, &lhs_transposed)); // Assign lhs_to_use pointer to use transposed TensorInfo lhs_to_use = &lhs_transposed; } - if(adj_rhs) + if (adj_rhs) { - auto_init_if_empty(rhs_transposed, rhs->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*rhs))); + auto_init_if_empty(rhs_transposed, + rhs->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*rhs))); ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuTransposeKernel::validate(rhs_to_use, &rhs_transposed)); // Assign rhs_to_use pointer to use transposed TensorInfo rhs_to_use = &rhs_transposed; } ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_to_use->dimension(0) != rhs_to_use->dimension(1), - "The product AB is defined only if the number of columns in A is equal to the number of rows in B (after transpose)"); + "The product AB is defined only if the number of columns in A is equal to the " + "number of rows in B (after transpose)"); // Iterate over dimensions to be collapsed in operator - check dimensions are equivalent between tensors - for(unsigned int i = 2; i < Coordinates::num_max_dimensions; i++) + for (unsigned int i = 2; i < Coordinates::num_max_dimensions; i++) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_to_use->dimension(i) != rhs_to_use->dimension(i), "Broadcasting in Batch dimension is unsupported by this operator."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_to_use->dimension(i) != rhs_to_use->dimension(i), + "Broadcasting in Batch dimension is unsupported by this operator."); } // Quantized-specific configuration - if(is_data_type_quantized(lhs->data_type())) + if (is_data_type_quantized(lhs->data_type())) { - ARM_COMPUTE_RETURN_ON_ERROR(get_gemmlowp_output_stage_info(lhs_to_use, rhs_to_use, dst, gemm_info.activation_info, gemm_info.output_stage)); + ARM_COMPUTE_RETURN_ON_ERROR(get_gemmlowp_output_stage_info(lhs_to_use, rhs_to_use, dst, + gemm_info.activation_info, gemm_info.output_stage)); } cpu::CpuGemmAssemblyDispatch::validate(lhs_to_use, rhs_to_use, nullptr, dst, gemm_info); @@ -138,7 +162,12 @@ Status 
CpuMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const return Status{}; } -void CpuMatMul::configure(ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info) +void CpuMatMul::configure(ITensorInfo *lhs, + ITensorInfo *rhs, + ITensorInfo *dst, + const MatMulInfo &info, + const CpuMatMulSettings &settings, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst); ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst, info, settings); @@ -163,21 +192,23 @@ void CpuMatMul::configure(ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, _original_rhs_shape = rhs_to_use.tensor_shape(); // Reshape lhs for use with assembly kernels. - lhs_to_use.set_tensor_shape(TensorShape(_original_lhs_shape.x(), _original_lhs_shape.y(), 1, _original_lhs_shape.collapsed_from(2).z())); - dst_to_use.set_tensor_shape(TensorShape(_original_dst_shape.x(), _original_dst_shape.y(), 1, _original_dst_shape.collapsed_from(2).z())); + lhs_to_use.set_tensor_shape( + TensorShape(_original_lhs_shape.x(), _original_lhs_shape.y(), 1, _original_lhs_shape.collapsed_from(2).z())); + dst_to_use.set_tensor_shape( + TensorShape(_original_dst_shape.x(), _original_dst_shape.y(), 1, _original_dst_shape.collapsed_from(2).z())); rhs_to_use.set_tensor_shape(_original_rhs_shape.collapsed_from(2)); // 2. Configuration for transpose of lhs/rhs // ------------------------------------------------------ // Initialise transposed TensorInfo class for aux tensors (intermediary tensors) - if(_adj_lhs) + if (_adj_lhs) { // Setup transpose LHS _transpose_kernel_lhs = std::make_unique<cpu::kernels::CpuTransposeKernel>(); _transpose_kernel_lhs->configure(&lhs_to_use, &_lhs_transposed); } - if(_adj_rhs) + if (_adj_rhs) { // Setup transpose RHS _transpose_kernel_rhs = std::make_unique<cpu::kernels::CpuTransposeKernel>(); @@ -196,20 +227,22 @@ void CpuMatMul::configure(ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, rhs_to_use = (_adj_rhs) ? 
_rhs_transposed : rhs_to_use; // Quantized-specific configuration - if(is_data_type_quantized(lhs->data_type())) + if (is_data_type_quantized(lhs->data_type())) { - get_gemmlowp_output_stage_info(&lhs_to_use, &rhs_to_use, &dst_to_use, _gemm_info.activation_info, _gemm_info.output_stage); + get_gemmlowp_output_stage_info(&lhs_to_use, &rhs_to_use, &dst_to_use, _gemm_info.activation_info, + _gemm_info.output_stage); } // Configure Asm Kernel _asm_glue = std::make_unique<cpu::CpuGemmAssemblyDispatch>(); - _asm_glue->configure(&lhs_to_use, &rhs_to_use, nullptr, &dst_to_use, _gemm_info); // c is nullptr as bias not supported in MatMul + _asm_glue->configure(&lhs_to_use, &rhs_to_use, nullptr, &dst_to_use, + _gemm_info); // c is nullptr as bias not supported in MatMul // Specify memory requirements for intermediate tensors auto asm_mem_req = _asm_glue->workspace(); // Specify memory required by gemm kernel int idx = 0; - for(const auto &aux : asm_mem_req) + for (const auto &aux : asm_mem_req) { _aux_mem[idx] = aux; idx++; @@ -228,8 +261,12 @@ void CpuMatMul::run(ITensorPack &tensors) // Reshape LHS and DST to ensure compatibility with GEMM asm kernel (Batch dimensions is 4th for lhs and dst within asm) // Collapse RHS (necessary to support dimensions larger than 3 in gemm assembly) - lhs->info()->set_tensor_shape(TensorShape(_original_lhs_shape.x(), _original_lhs_shape.y(), 1, _original_lhs_shape.collapsed_from(2).z())); // Collapsed 3+ dimensions into z - dst->info()->set_tensor_shape(TensorShape(_original_dst_shape.x(), _original_dst_shape.y(), 1, _original_dst_shape.collapsed_from(2).z())); // Collapsed 3+ dimensions into z + lhs->info()->set_tensor_shape( + TensorShape(_original_lhs_shape.x(), _original_lhs_shape.y(), 1, + _original_lhs_shape.collapsed_from(2).z())); // Collapsed 3+ dimensions into z + dst->info()->set_tensor_shape( + TensorShape(_original_dst_shape.x(), _original_dst_shape.y(), 1, + _original_dst_shape.collapsed_from(2).z())); // Collapsed 3+ dimensions into z rhs->info()->set_tensor_shape(_original_rhs_shape.collapsed_from(2)); // Initialise object to handle stored transposed tensors in auxillary memory @@ -240,17 +277,19 @@ void CpuMatMul::run(ITensorPack &tensors) ITensorPack asm_tensors(tensors); // Run transpose lhs if necessary - if(_adj_lhs) + if (_adj_lhs) { - ITensorPack lhs_transpose_pack = { { TensorType::ACL_SRC, lhs }, { TensorType::ACL_DST, lhs_transposed.get() } }; - NEScheduler::get().schedule_op(_transpose_kernel_lhs.get(), Window::DimY, _transpose_kernel_lhs->window(), lhs_transpose_pack); + ITensorPack lhs_transpose_pack = {{TensorType::ACL_SRC, lhs}, {TensorType::ACL_DST, lhs_transposed.get()}}; + NEScheduler::get().schedule_op(_transpose_kernel_lhs.get(), Window::DimY, _transpose_kernel_lhs->window(), + lhs_transpose_pack); asm_tensors.add_const_tensor(TensorType::ACL_SRC_0, lhs_transposed.get()); } // Run transpose rhs if necessary - if(_adj_rhs) + if (_adj_rhs) { - ITensorPack rhs_transpose_pack = { { TensorType::ACL_SRC, rhs }, { TensorType::ACL_DST, rhs_transposed.get() } }; - NEScheduler::get().schedule_op(_transpose_kernel_rhs.get(), Window::DimY, _transpose_kernel_rhs->window(), rhs_transpose_pack); + ITensorPack rhs_transpose_pack = {{TensorType::ACL_SRC, rhs}, {TensorType::ACL_DST, rhs_transposed.get()}}; + NEScheduler::get().schedule_op(_transpose_kernel_rhs.get(), Window::DimY, _transpose_kernel_rhs->window(), + rhs_transpose_pack); asm_tensors.add_const_tensor(TensorType::ACL_SRC_1, rhs_transposed.get()); } // Run asm kernel diff --git 
a/src/cpu/operators/CpuMatMul.h b/src/cpu/operators/CpuMatMul.h index 475c019fd0..24db3da346 100644 --- a/src/cpu/operators/CpuMatMul.h +++ b/src/cpu/operators/CpuMatMul.h @@ -25,6 +25,7 @@ #define ACL_SRC_CPU_OPERATORS_CPUMATMUL #include "arm_compute/core/TensorInfo.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuOperator.h" #include "src/cpu/kernels/CpuTransposeKernel.h" @@ -66,18 +67,27 @@ public: * @param[in] settings The settings for matmul operation (i.e fast math) * @param[in] act_info Class containing information about fused activation function. */ - void configure(ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(ITensorInfo *lhs, + ITensorInfo *rhs, + ITensorInfo *dst, + const MatMulInfo &info, + const CpuMatMulSettings &settings, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuMatMul::configure() * * @return a status */ - static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings, + static Status validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *dst, + const MatMulInfo &info, + const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: - void run(ITensorPack &tensors) override; + void run(ITensorPack &tensors) override; experimental::MemoryRequirements workspace() const override; private: @@ -91,9 +101,9 @@ private: }; // Define unique pointers to kernels/operators used by matmul - std::unique_ptr<kernels::CpuTransposeKernel> _transpose_kernel_lhs{ nullptr }; - std::unique_ptr<kernels::CpuTransposeKernel> _transpose_kernel_rhs{ nullptr }; - std::unique_ptr<CpuGemmAssemblyDispatch> _asm_glue{ nullptr }; + std::unique_ptr<kernels::CpuTransposeKernel> _transpose_kernel_lhs{nullptr}; + std::unique_ptr<kernels::CpuTransposeKernel> _transpose_kernel_rhs{nullptr}; + std::unique_ptr<CpuGemmAssemblyDispatch> _asm_glue{nullptr}; // TensorInfo for tensors stored in auxillary memory TensorInfo _lhs_transposed{}; @@ -105,13 +115,13 @@ private: TensorShape _original_dst_shape{}; // Note : adj_lhs means the same as transposing lhs - bool _adj_lhs{ false }; - bool _adj_rhs{ false }; - bool _fast_math{ false }; + bool _adj_lhs{false}; + bool _adj_rhs{false}; + bool _fast_math{false}; AsmGemmInfo _gemm_info{}; - experimental::MemoryRequirements _aux_mem{ Count }; + experimental::MemoryRequirements _aux_mem{Count}; }; -} -} +} // namespace cpu +} // namespace arm_compute #endif /* ACL_SRC_CPU_OPERATORS_CPUMATMUL */ diff --git a/src/cpu/operators/CpuMaxUnpooling.cpp b/src/cpu/operators/CpuMaxUnpooling.cpp index 24e9fd6d46..697fc40ab3 100644 --- a/src/cpu/operators/CpuMaxUnpooling.cpp +++ b/src/cpu/operators/CpuMaxUnpooling.cpp @@ -22,6 +22,7 @@ * SOFTWARE. 
*/ #include "src/cpu/operators/CpuMaxUnpooling.h" + #include "src/common/utils/Log.h" #include "src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h" @@ -29,7 +30,10 @@ namespace arm_compute { namespace cpu { -void CpuMaxUnpooling::configure(const ITensorInfo *src, const ITensorInfo *indices, ITensorInfo *dst, const PoolingLayerInfo &pool_info) +void CpuMaxUnpooling::configure(const ITensorInfo *src, + const ITensorInfo *indices, + ITensorInfo *dst, + const PoolingLayerInfo &pool_info) { ARM_COMPUTE_LOG_PARAMS(src, indices, dst, pool_info); auto k = std::make_unique<kernels::CpuMaxUnpoolingLayerKernel>(); @@ -37,9 +41,12 @@ void CpuMaxUnpooling::configure(const ITensorInfo *src, const ITensorInfo *indic _kernel = std::move(k); } -Status CpuMaxUnpooling::validate(const ITensorInfo *src, const ITensorInfo *indices, const ITensorInfo *dst, const PoolingLayerInfo &pool_info) +Status CpuMaxUnpooling::validate(const ITensorInfo *src, + const ITensorInfo *indices, + const ITensorInfo *dst, + const PoolingLayerInfo &pool_info) { return kernels::CpuMaxUnpoolingLayerKernel::validate(src, indices, dst, pool_info); } -} // namesapce cpu +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/operators/CpuMaxUnpooling.h b/src/cpu/operators/CpuMaxUnpooling.h index aa1f1072a5..5dc00bce9e 100644 --- a/src/cpu/operators/CpuMaxUnpooling.h +++ b/src/cpu/operators/CpuMaxUnpooling.h @@ -44,14 +44,18 @@ public: * @param[out] dst Destination tensor. Data types supported: Same as @p src * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. */ - void configure(const ITensorInfo *src, const ITensorInfo *indices, ITensorInfo *dst, const PoolingLayerInfo &pool_info); + void + configure(const ITensorInfo *src, const ITensorInfo *indices, ITensorInfo *dst, const PoolingLayerInfo &pool_info); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref CpuMaxUnpooling::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *indices, const ITensorInfo *dst, const PoolingLayerInfo &pool_info); + static Status validate(const ITensorInfo *src, + const ITensorInfo *indices, + const ITensorInfo *dst, + const PoolingLayerInfo &pool_info); }; } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/operators/CpuMul.cpp b/src/cpu/operators/CpuMul.cpp index 4c15015206..ac9847111d 100644 --- a/src/cpu/operators/CpuMul.cpp +++ b/src/cpu/operators/CpuMul.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/cpu/kernels/CpuMulKernel.h" @@ -33,14 +34,24 @@ namespace arm_compute { namespace cpu { -Status CpuMul::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, +Status CpuMul::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); return kernels::CpuMulKernel::validate(src1, src2, dst, scale, overflow_policy, rounding_policy); } -void CpuMul::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, +void CpuMul::configure(ITensorInfo *src1, + ITensorInfo *src2, 
+ ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(act_info); @@ -58,13 +69,19 @@ void CpuMul::run(ITensorPack &tensors) NEScheduler::get().schedule_op(_kernel.get(), split_dimension, _kernel->window(), tensors); } -Status CpuComplexMul::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) +Status CpuComplexMul::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); return kernels::CpuComplexMulKernel::validate(src1, src2, dst); } -void CpuComplexMul::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info) +void CpuComplexMul::configure(ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(act_info); ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, act_info); @@ -80,4 +97,4 @@ void CpuComplexMul::run(ITensorPack &tensors) NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors); } } // namespace cpu -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/operators/CpuMul.h b/src/cpu/operators/CpuMul.h index 3e0edbf050..82b309830b 100644 --- a/src/cpu/operators/CpuMul.h +++ b/src/cpu/operators/CpuMul.h @@ -26,6 +26,7 @@ #include "arm_compute/core/ITensorInfo.h" #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/cpu/ICpuOperator.h" namespace arm_compute @@ -61,7 +62,12 @@ public: * @param[in] rounding_policy Rounding policy. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. */ - void configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, + void configure(ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * @@ -69,7 +75,12 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: @@ -89,14 +100,20 @@ public: * @param[out] dst The dst tensor. Data types supported: same as @p src1. Number of channels: same as @p src1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. 
*/ - void configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref CpuComplexMul::configure() * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run(ITensorPack &tensors) override; diff --git a/src/cpu/operators/CpuPermute.cpp b/src/cpu/operators/CpuPermute.cpp index babaf21b6f..25acc92d00 100644 --- a/src/cpu/operators/CpuPermute.cpp +++ b/src/cpu/operators/CpuPermute.cpp @@ -23,9 +23,8 @@ */ #include "src/cpu/operators/CpuPermute.h" -#include "src/cpu/kernels/CpuPermuteKernel.h" - #include "src/common/utils/Log.h" +#include "src/cpu/kernels/CpuPermuteKernel.h" namespace arm_compute { @@ -43,5 +42,5 @@ Status CpuPermute::validate(const ITensorInfo *src, const ITensorInfo *dst, cons { return kernels::CpuPermuteKernel::validate(src, dst, perm); } -} // namesapce cpu +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/operators/CpuPool2d.cpp b/src/cpu/operators/CpuPool2d.cpp index 722cd36ee5..b72bde6978 100644 --- a/src/cpu/operators/CpuPool2d.cpp +++ b/src/cpu/operators/CpuPool2d.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/cpu/kernels/CpuPool2dKernel.h" #include "src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h" @@ -53,7 +54,8 @@ void CpuPool2d::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayer ARM_COMPUTE_LOG_PARAMS(src, dst, pool_info, indices); // Check if we can run assembly kernels. Currently, indices are not supported by those kernels - const bool run_optimised = bool(kernels::CpuPool2dAssemblyWrapperKernel::validate(src, dst, pool_info)) && (indices == nullptr); + const bool run_optimised = + bool(kernels::CpuPool2dAssemblyWrapperKernel::validate(src, dst, pool_info)) && (indices == nullptr); // Get data layout _data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? 
src->data_layout() : pool_info.data_layout; @@ -61,10 +63,11 @@ void CpuPool2d::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayer // Check if we have Global Pooling Layer const unsigned int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); const unsigned int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); - _is_global_pooling_layer = (src->dimension(idx_width) == pool_info.pool_size.width) && (src->dimension(idx_height) == pool_info.pool_size.height); - _use_kernel_indices = pool_info.use_kernel_indices; + _is_global_pooling_layer = (src->dimension(idx_width) == pool_info.pool_size.width) && + (src->dimension(idx_height) == pool_info.pool_size.height); + _use_kernel_indices = pool_info.use_kernel_indices; - if(run_optimised) + if (run_optimised) { const CPUInfo &ci = NEScheduler::get().cpu_info(); const unsigned int num_threads = NEScheduler::get().num_threads(); @@ -76,7 +79,7 @@ void CpuPool2d::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayer // Get kernel's memory requirements constexpr size_t alignment = 4096; const size_t workspace_size = pooling_wrapper->get_working_size(num_threads); - _aux_mem[0] = MemoryInfo(TensorType::ACL_INT_0, MemoryLifetime::Temporary, workspace_size, alignment); + _aux_mem[0] = MemoryInfo(TensorType::ACL_INT_0, MemoryLifetime::Temporary, workspace_size, alignment); _asm_glue = std::move(pooling_wrapper); } @@ -89,11 +92,15 @@ void CpuPool2d::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayer } } -Status CpuPool2d::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices) +Status CpuPool2d::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const PoolingLayerInfo &pool_info, + const ITensorInfo *indices) { - const bool run_optimised = bool(kernels::CpuPool2dAssemblyWrapperKernel::validate(src, dst, pool_info)) && (indices == nullptr); + const bool run_optimised = + bool(kernels::CpuPool2dAssemblyWrapperKernel::validate(src, dst, pool_info)) && (indices == nullptr); - if(run_optimised) + if (run_optimised) { return Status{}; } @@ -105,20 +112,24 @@ void CpuPool2d::run(ITensorPack &tensors) { ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No tensors provided"); - if(_asm_glue) + if (_asm_glue) { const auto hints = (_is_global_pooling_layer) ? Window::DimX : Window::DimY; NEScheduler::get().schedule_op(_asm_glue.get(), hints, _asm_glue->window(), tensors); } else { - switch(_data_layout) + switch (_data_layout) { case DataLayout::NCHW: - NEScheduler::get().schedule_op(_pooling_layer_kernel.get(), _is_global_pooling_layer ? Window::DimZ : Window::DimY, _pooling_layer_kernel->window(), tensors); + NEScheduler::get().schedule_op(_pooling_layer_kernel.get(), + _is_global_pooling_layer ? Window::DimZ : Window::DimY, + _pooling_layer_kernel->window(), tensors); break; case DataLayout::NHWC: - NEScheduler::get().schedule_op(_pooling_layer_kernel.get(), (_use_kernel_indices ? Window::DimY : Window::DimX), _pooling_layer_kernel->window(), tensors); + NEScheduler::get().schedule_op(_pooling_layer_kernel.get(), + (_use_kernel_indices ? 
Window::DimY : Window::DimX), + _pooling_layer_kernel->window(), tensors); break; default: ARM_COMPUTE_ERROR("Data layout not supported"); diff --git a/src/cpu/operators/CpuPool2d.h b/src/cpu/operators/CpuPool2d.h index 5c571db88a..ea73e3f335 100644 --- a/src/cpu/operators/CpuPool2d.h +++ b/src/cpu/operators/CpuPool2d.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_POOL2D_H #include "arm_compute/core/experimental/Types.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuOperator.h" @@ -58,17 +59,21 @@ public: * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. * @param[out] indices (optional) The indices of the maximal values. Data type supported: U32. */ - void configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr); + void + configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr); /** Static function to check if given info will lead to a valid configuration * * Similar to CpuPool2d::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr); + static Status validate(const ITensorInfo *src, + const ITensorInfo *dst, + const PoolingLayerInfo &pool_info, + const ITensorInfo *indices = nullptr); // Inherited methods overridden: - void run(ITensorPack &tensors) override; + void run(ITensorPack &tensors) override; experimental::MemoryRequirements workspace() const override; private: diff --git a/src/cpu/operators/CpuPool3d.cpp b/src/cpu/operators/CpuPool3d.cpp index 14e4ac6c97..7fa78c1f80 100644 --- a/src/cpu/operators/CpuPool3d.cpp +++ b/src/cpu/operators/CpuPool3d.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/runtime/Scheduler.h" + #include "src/common/utils/Log.h" #include "src/cpu/kernels/CpuPool3dKernel.h" @@ -35,8 +36,7 @@ namespace arm_compute { namespace cpu { -CpuPool3d::CpuPool3d() - : _aux_mem(1) +CpuPool3d::CpuPool3d() : _aux_mem(1) { } @@ -70,4 +70,4 @@ experimental::MemoryRequirements CpuPool3d::workspace() const } } // namespace cpu -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/cpu/operators/CpuPool3d.h b/src/cpu/operators/CpuPool3d.h index 8a73f8a0af..235d798095 100644 --- a/src/cpu/operators/CpuPool3d.h +++ b/src/cpu/operators/CpuPool3d.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_POOL3D_H #include "arm_compute/core/experimental/Types.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuOperator.h" @@ -61,7 +62,7 @@ public: static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const Pooling3dLayerInfo &pool_info); // Inherited methods overridden: - void run(ITensorPack &tensors) override; + void run(ITensorPack &tensors) override; experimental::MemoryRequirements workspace() const override; private: diff --git a/src/cpu/operators/CpuQuantize.cpp b/src/cpu/operators/CpuQuantize.cpp index f9e14d1f88..4315499c39 100644 --- a/src/cpu/operators/CpuQuantize.cpp +++ b/src/cpu/operators/CpuQuantize.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/cpu/kernels/CpuQuantizeKernel.h" diff --git a/src/cpu/operators/CpuReshape.cpp b/src/cpu/operators/CpuReshape.cpp index e6892a2e7e..a423abb49a 100644 --- a/src/cpu/operators/CpuReshape.cpp +++ b/src/cpu/operators/CpuReshape.cpp @@ -23,11 +23,10 @@ */ #include "src/cpu/operators/CpuReshape.h" -#include "src/cpu/kernels/CpuReshapeKernel.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" #include "src/common/utils/Log.h" - -#include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/cpu/kernels/CpuReshapeKernel.h" namespace arm_compute { @@ -49,7 +48,7 @@ Status CpuReshape::validate(const ITensorInfo *src, const ITensorInfo *dst) void CpuReshape::run(ITensorPack &tensors) { ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); - if(!_is_prepared) + if (!_is_prepared) { static_cast<kernels::CpuReshapeKernel *>(_kernel.get())->prepare(tensors); _is_prepared = true; diff --git a/src/cpu/operators/CpuReshape.h b/src/cpu/operators/CpuReshape.h index 9bc43e7db4..33da792319 100644 --- a/src/cpu/operators/CpuReshape.h +++ b/src/cpu/operators/CpuReshape.h @@ -24,9 +24,10 @@ #ifndef ARM_COMPUTE_CPU_RESHAPE_H #define ARM_COMPUTE_CPU_RESHAPE_H -#include "src/cpu/ICpuOperator.h" #include "arm_compute/core/Window.h" +#include "src/cpu/ICpuOperator.h" + namespace arm_compute { namespace cpu @@ -53,7 +54,7 @@ public: void run(ITensorPack &tensors) override; private: - bool _is_prepared{ false } ; + bool _is_prepared{false}; }; } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/operators/CpuScale.cpp b/src/cpu/operators/CpuScale.cpp index 8a712bf088..7df9296931 100644 --- a/src/cpu/operators/CpuScale.cpp +++ b/src/cpu/operators/CpuScale.cpp @@ -24,8 +24,9 @@ #include "src/cpu/operators/CpuScale.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" #include "arm_compute/core/TensorInfo.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/utils/ScaleUtils.h" #include "src/cpu/kernels/CpuScaleKernel.h" @@ -37,11 +38,12 @@ namespace cpu { namespace { -void precompute_dx_dy_offsets(ITensor *dx, ITensor *dy, ITensor *offsets, float wr, float hr, SamplingPolicy sampling_policy, bool align_corners) +void precompute_dx_dy_offsets( + ITensor *dx, ITensor *dy, ITensor *offsets, float wr, float hr, SamplingPolicy sampling_policy, bool align_corners) { ARM_COMPUTE_ERROR_ON(offsets == 
nullptr); float sampling_offset = 0.0f; - if(sampling_policy == SamplingPolicy::CENTER) + if (sampling_policy == SamplingPolicy::CENTER) { sampling_offset = 0.5f; } @@ -50,38 +52,44 @@ void precompute_dx_dy_offsets(ITensor *dx, ITensor *dy, ITensor *offsets, float win.set(Window::DimX, Window::Dimension(0, offsets->info()->dimension(0), 1)); win.set(Window::DimY, Window::Dimension(0, offsets->info()->dimension(1), 1)); - if(dx != nullptr && dy != nullptr) + if (dx != nullptr && dy != nullptr) { // Pre-compute the offset and pixel's distance for BILINEAR interpolation Iterator offsets_it(offsets, win); Iterator dx_it(dx, win); Iterator dy_it(dy, win); - execute_window_loop(win, [&](const Coordinates & id) - { - const float in_x = (id.x() + sampling_offset) * wr - sampling_offset; - const float in_y = (id.y() + sampling_offset) * hr - sampling_offset; - const int in_xi = std::floor(in_x); - const int in_yi = std::floor(in_y); + execute_window_loop( + win, + [&](const Coordinates &id) + { + const float in_x = (id.x() + sampling_offset) * wr - sampling_offset; + const float in_y = (id.y() + sampling_offset) * hr - sampling_offset; + const int in_xi = std::floor(in_x); + const int in_yi = std::floor(in_y); - *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi; - *reinterpret_cast<float *>(dx_it.ptr()) = in_x - in_xi; - *reinterpret_cast<float *>(dy_it.ptr()) = in_y - in_yi; - }, - offsets_it, dx_it, dy_it); + *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi; + *reinterpret_cast<float *>(dx_it.ptr()) = in_x - in_xi; + *reinterpret_cast<float *>(dy_it.ptr()) = in_y - in_yi; + }, + offsets_it, dx_it, dy_it); } else { // Pre-compute the offset for NEAREST interpolation Iterator offsets_it(offsets, win); - execute_window_loop(win, [&](const Coordinates & id) - { - const float float_in_xi = (id.x() + sampling_offset) * wr; - const auto in_xi = static_cast<size_t>(align_corners ? arm_compute::utils::rounding::round_half_away_from_zero(float_in_xi) : std::floor(float_in_xi)); - *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi; - }, - offsets_it); + execute_window_loop( + win, + [&](const Coordinates &id) + { + const float float_in_xi = (id.x() + sampling_offset) * wr; + const auto in_xi = static_cast<size_t>( + align_corners ? arm_compute::utils::rounding::round_half_away_from_zero(float_in_xi) + : std::floor(float_in_xi)); + *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi; + }, + offsets_it); } } } // namespace @@ -96,20 +104,24 @@ void CpuScale::configure(ITensorInfo *src, ITensorInfo *dst, const ScaleKernelIn _is_prepared = false; // Get data layout and width/height indices - _data_layout = _scale_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : _scale_info.data_layout; - const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); + _data_layout = _scale_info.data_layout == DataLayout::UNKNOWN ? 
src->data_layout() : _scale_info.data_layout; + const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); // Compute the ratio between source width/height and destination width/height - const bool is_align_corners_used = _scale_info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(_scale_info.sampling_policy); - const auto wr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_width), dst->dimension(idx_width), is_align_corners_used); - const auto hr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_height), dst->dimension(idx_height), is_align_corners_used); + const bool is_align_corners_used = + _scale_info.align_corners && + arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(_scale_info.sampling_policy); + const auto wr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_width), + dst->dimension(idx_width), is_align_corners_used); + const auto hr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_height), + dst->dimension(idx_height), is_align_corners_used); // Area interpolation behaves as Nearest Neighbour in case of up-sampling - InterpolationPolicy policy_to_use = (_scale_info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f - && hr <= 1.f) ? - InterpolationPolicy::NEAREST_NEIGHBOR : - _scale_info.interpolation_policy; + InterpolationPolicy policy_to_use = + (_scale_info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) + ? InterpolationPolicy::NEAREST_NEIGHBOR + : _scale_info.interpolation_policy; // Get the tensor shape TensorShape shape(dst->dimension(idx_width)); @@ -122,7 +134,7 @@ void CpuScale::configure(ITensorInfo *src, ITensorInfo *dst, const ScaleKernelIn auto dy = std::make_unique<TensorInfo>(tensor_info_dxdy); auto offsets = std::make_unique<TensorInfo>(tensor_info_offsets); auto scale_kernel = std::make_unique<kernels::CpuScaleKernel>(); - switch(policy_to_use) + switch (policy_to_use) { case InterpolationPolicy::NEAREST_NEIGHBOR: { @@ -148,7 +160,8 @@ void CpuScale::configure(ITensorInfo *src, ITensorInfo *dst, const ScaleKernelIn Status CpuScale::validate(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON(info.sampling_policy != SamplingPolicy::CENTER && info.sampling_policy != SamplingPolicy::TOP_LEFT); + ARM_COMPUTE_RETURN_ERROR_ON(info.sampling_policy != SamplingPolicy::CENTER && + info.sampling_policy != SamplingPolicy::TOP_LEFT); ITensorInfo *offsets = nullptr; ITensorInfo *dx = nullptr; @@ -160,19 +173,25 @@ Status CpuScale::validate(const ITensorInfo *src, const ITensorInfo *dst, const const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); // Compute the ratio between source width/height and destination width/height - const bool is_align_corners_used = info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy); - const auto wr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_width), dst->dimension(idx_width), is_align_corners_used); - const auto hr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_height), dst->dimension(idx_height), is_align_corners_used); + const bool is_align_corners_used = + info.align_corners && 
arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy); + const auto wr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_width), + dst->dimension(idx_width), is_align_corners_used); + const auto hr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_height), + dst->dimension(idx_height), is_align_corners_used); // Area interpolation behaves as Nearest Neighbour in case of up-sampling - InterpolationPolicy policy_to_use = (info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) ? InterpolationPolicy::NEAREST_NEIGHBOR : info.interpolation_policy; + InterpolationPolicy policy_to_use = + (info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) + ? InterpolationPolicy::NEAREST_NEIGHBOR + : info.interpolation_policy; // Get the tensor shape of auxilary buffers const TensorShape shape(dst->dimension(idx_width), dst->dimension(idx_height)); TensorInfo tensor_info_offsets(shape, Format::S32); TensorInfo tensor_info_dx(shape, Format::F32); TensorInfo tensor_info_dy(shape, Format::F32); - switch(policy_to_use) + switch (policy_to_use) { case InterpolationPolicy::NEAREST_NEIGHBOR: offsets = &tensor_info_offsets; @@ -186,13 +205,14 @@ Status CpuScale::validate(const ITensorInfo *src, const ITensorInfo *dst, const break; } - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuScaleKernel::validate(src->clone().get(), dx, dy, offsets, dst->clone().get(), info)); + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::CpuScaleKernel::validate(src->clone().get(), dx, dy, offsets, dst->clone().get(), info)); return Status{}; } void CpuScale::prepare(ITensorPack &tensors) { - if(!_is_prepared) + if (!_is_prepared) { _is_prepared = true; const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); @@ -206,22 +226,27 @@ void CpuScale::prepare(ITensorPack &tensors) const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); // Compute the ratio between source width/height and destination width/height - const bool is_align_corners_used = _scale_info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(_scale_info.sampling_policy); - const auto wr = arm_compute::scale_utils::calculate_resize_ratio(src->info()->dimension(idx_width), dst->info()->dimension(idx_width), is_align_corners_used); - const auto hr = arm_compute::scale_utils::calculate_resize_ratio(src->info()->dimension(idx_height), dst->info()->dimension(idx_height), is_align_corners_used); + const bool is_align_corners_used = + _scale_info.align_corners && + arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(_scale_info.sampling_policy); + const auto wr = arm_compute::scale_utils::calculate_resize_ratio( + src->info()->dimension(idx_width), dst->info()->dimension(idx_width), is_align_corners_used); + const auto hr = arm_compute::scale_utils::calculate_resize_ratio( + src->info()->dimension(idx_height), dst->info()->dimension(idx_height), is_align_corners_used); // Area interpolation behaves as Nearest Neighbour in case of up-sampling - InterpolationPolicy policy_to_use = (_scale_info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f - && hr <= 1.f) ? - InterpolationPolicy::NEAREST_NEIGHBOR : - _scale_info.interpolation_policy; + InterpolationPolicy policy_to_use = + (_scale_info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) + ? 
InterpolationPolicy::NEAREST_NEIGHBOR + : _scale_info.interpolation_policy; const SamplingPolicy sampling_policy = _scale_info.sampling_policy; - bool precompute_indices_weights = arm_compute::scale_utils::is_precomputation_required(_data_layout, src->info()->data_type(), policy_to_use, _scale_info.border_mode); + bool precompute_indices_weights = arm_compute::scale_utils::is_precomputation_required( + _data_layout, src->info()->data_type(), policy_to_use, _scale_info.border_mode); - if(precompute_indices_weights) + if (precompute_indices_weights) { - switch(policy_to_use) + switch (policy_to_use) { case InterpolationPolicy::NEAREST_NEIGHBOR: { @@ -245,7 +270,8 @@ void CpuScale::prepare(ITensorPack &tensors) } else { - if(policy_to_use != InterpolationPolicy::NEAREST_NEIGHBOR && policy_to_use != InterpolationPolicy::BILINEAR && policy_to_use != InterpolationPolicy::AREA) + if (policy_to_use != InterpolationPolicy::NEAREST_NEIGHBOR && + policy_to_use != InterpolationPolicy::BILINEAR && policy_to_use != InterpolationPolicy::AREA) { ARM_COMPUTE_ERROR("Unsupported interpolation mode"); } diff --git a/src/cpu/operators/CpuScale.h b/src/cpu/operators/CpuScale.h index ee7c523bad..c12a8e733a 100644 --- a/src/cpu/operators/CpuScale.h +++ b/src/cpu/operators/CpuScale.h @@ -24,9 +24,10 @@ #ifndef ARM_COMPUTE_CPU_SCALE_H #define ARM_COMPUTE_CPU_SCALE_H +#include "arm_compute/core/experimental/Types.h" #include "arm_compute/core/ITensorInfo.h" #include "arm_compute/core/KernelDescriptors.h" -#include "arm_compute/core/experimental/Types.h" + #include "src/cpu/ICpuKernel.h" #include "src/cpu/ICpuOperator.h" @@ -62,9 +63,9 @@ public: void run(ITensorPack &tensors) override; private: - ScaleKernelInfo _scale_info{ InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::UNDEFINED }; - DataLayout _data_layout{ DataLayout::UNKNOWN }; - bool _is_prepared{ false }; + ScaleKernelInfo _scale_info{InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::UNDEFINED}; + DataLayout _data_layout{DataLayout::UNKNOWN}; + bool _is_prepared{false}; }; } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/operators/CpuSoftmax.cpp b/src/cpu/operators/CpuSoftmax.cpp index bf4c2fa3a2..e55d7f903e 100644 --- a/src/cpu/operators/CpuSoftmax.cpp +++ b/src/cpu/operators/CpuSoftmax.cpp @@ -25,9 +25,10 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/helpers/MemoryHelpers.h" #include "src/core/helpers/SoftmaxHelpers.h" @@ -63,13 +64,15 @@ void CpuSoftmaxGeneric<IS_LOG>::configure(const ITensorInfo *src, ITensorInfo *d ARM_COMPUTE_ERROR_THROW_ON(CpuSoftmaxGeneric::validate(src, dst, beta, axis)); ARM_COMPUTE_LOG_PARAMS(src, dst, beta, axis); - const unsigned int actual_axis = static_cast<unsigned int>(wrap_around(axis, static_cast<int32_t>(src->num_dimensions()))); + const unsigned int actual_axis = + static_cast<unsigned int>(wrap_around(axis, static_cast<int32_t>(src->num_dimensions()))); _needs_permute = actual_axis > 0; - if(_needs_permute) + if (_needs_permute) { - _permute_input.configure(src, &_input_permuted, softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis)); + _permute_input.configure(src, &_input_permuted, + softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis)); } // We want to deal with a 2D input. 
Either it is the permuted version of the original input (4D case) @@ -79,10 +82,11 @@ void CpuSoftmaxGeneric<IS_LOG>::configure(const ITensorInfo *src, ITensorInfo *d // Create intermediate tensors shapes TensorShape max_sum_shape = tmp_input->tensor_shape(); max_sum_shape.set(0, 1); - const TensorInfo input_info = tmp_input->clone()->reset_padding().set_is_resizable(true); - DataType tmp_data_type = is_data_type_quantized_asymmetric(tmp_input->data_type()) ? DataType::F32 : tmp_input->data_type(); - TensorInfo tensor_info_tmp(input_info.clone()->set_data_type(tmp_data_type)); - TensorInfo max_info(tmp_input->clone()->set_tensor_shape(max_sum_shape)); + const TensorInfo input_info = tmp_input->clone()->reset_padding().set_is_resizable(true); + DataType tmp_data_type = + is_data_type_quantized_asymmetric(tmp_input->data_type()) ? DataType::F32 : tmp_input->data_type(); + TensorInfo tensor_info_tmp(input_info.clone()->set_data_type(tmp_data_type)); + TensorInfo max_info(tmp_input->clone()->set_tensor_shape(max_sum_shape)); // Init intermediate tensors _max = TensorInfo(max_info); @@ -94,13 +98,14 @@ void CpuSoftmaxGeneric<IS_LOG>::configure(const ITensorInfo *src, ITensorInfo *d _max_kernel = std::move(mk); auto sm = std::make_unique<kernels::CpuLogits1DSoftmaxKernel<IS_LOG>>(); - if(_needs_permute) + if (_needs_permute) { // The normalization kernel stores the result in a permuted output tensor sm->configure(tmp_input, &_max, &_output_permuted, beta, &_tmp); // Re-permute the permuted output into the requested (4D) output - _permute_output.configure(&_output_permuted, dst, softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis)); + _permute_output.configure(&_output_permuted, dst, + softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis)); } else { @@ -109,11 +114,15 @@ void CpuSoftmaxGeneric<IS_LOG>::configure(const ITensorInfo *src, ITensorInfo *d } _softmax_kernel = std::move(sm); - _aux_mem[InternalTensorIdx::MAX] = MemoryInfo(offset_int_vec(InternalTensorIdx::MAX), MemoryLifetime::Temporary, _max.total_size()); - _aux_mem[InternalTensorIdx::TMP] = MemoryInfo(offset_int_vec(InternalTensorIdx::TMP), MemoryLifetime::Temporary, _tmp.total_size()); + _aux_mem[InternalTensorIdx::MAX] = + MemoryInfo(offset_int_vec(InternalTensorIdx::MAX), MemoryLifetime::Temporary, _max.total_size()); + _aux_mem[InternalTensorIdx::TMP] = + MemoryInfo(offset_int_vec(InternalTensorIdx::TMP), MemoryLifetime::Temporary, _tmp.total_size()); - _aux_mem[InternalTensorIdx::PERMUTED_SRC] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), MemoryLifetime::Temporary, _input_permuted.total_size()); - _aux_mem[InternalTensorIdx::PERMUTED_DST] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_DST), MemoryLifetime::Temporary, _output_permuted.total_size()); + _aux_mem[InternalTensorIdx::PERMUTED_SRC] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), + MemoryLifetime::Temporary, _input_permuted.total_size()); + _aux_mem[InternalTensorIdx::PERMUTED_DST] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_DST), + MemoryLifetime::Temporary, _output_permuted.total_size()); } template <bool IS_LOG> @@ -123,7 +132,8 @@ Status CpuSoftmaxGeneric<IS_LOG>::validate(const ITensorInfo *src, const ITensor ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->num_dimensions() > 4, "Only up to 4 dimensions are supported"); ARM_COMPUTE_UNUSED(beta); - ARM_COMPUTE_RETURN_ERROR_ON(axis < static_cast<int32_t>(-src->num_dimensions()) || 
static_cast<int32_t>(src->num_dimensions()) <= axis); + ARM_COMPUTE_RETURN_ERROR_ON(axis < static_cast<int32_t>(-src->num_dimensions()) || + static_cast<int32_t>(src->num_dimensions()) <= axis); // Create intermediate tensor info DataType tmp_data_type = src->data_type(); @@ -131,25 +141,33 @@ Status CpuSoftmaxGeneric<IS_LOG>::validate(const ITensorInfo *src, const ITensor TensorShape max_sum_shape = src->tensor_shape(); max_sum_shape.set(0, 1); - const TensorInfo tensor_info_max_sum(src->clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type).set_quantization_info(src->quantization_info()).set_is_resizable(true)); + const TensorInfo tensor_info_max_sum(src->clone() + ->set_tensor_shape(max_sum_shape) + .set_data_type(tmp_data_type) + .set_quantization_info(src->quantization_info()) + .set_is_resizable(true)); const TensorInfo dont_care; - const unsigned int actual_axis = static_cast<unsigned int>(wrap_around(axis, static_cast<int32_t>(src->num_dimensions()))); + const unsigned int actual_axis = + static_cast<unsigned int>(wrap_around(axis, static_cast<int32_t>(src->num_dimensions()))); const bool needs_permute = actual_axis > 0; - if(needs_permute) + if (needs_permute) { - const PermutationVector permutation_vector = softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis); - const TensorShape permuted_shape = misc::shape_calculator::compute_permutation_output_shape(*src, permutation_vector); - TensorInfo input_permuted(src->clone()->set_tensor_shape(permuted_shape)); + const PermutationVector permutation_vector = + softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis); + const TensorShape permuted_shape = + misc::shape_calculator::compute_permutation_output_shape(*src, permutation_vector); + TensorInfo input_permuted(src->clone()->set_tensor_shape(permuted_shape)); ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(src, &input_permuted, permutation_vector)); TensorInfo output_permuted(dst->clone()->set_tensor_shape(permuted_shape)); ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(&output_permuted, dst, permutation_vector)); } ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuLogits1DMaxKernel::validate(src, &tensor_info_max_sum)); - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuLogits1DSoftmaxKernel<IS_LOG>::validate(&tensor_info_tmp, &tensor_info_max_sum, dst, beta, &dont_care)); + ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuLogits1DSoftmaxKernel<IS_LOG>::validate( + &tensor_info_tmp, &tensor_info_max_sum, dst, beta, &dont_care)); return Status{}; } @@ -166,43 +184,38 @@ void CpuSoftmaxGeneric<IS_LOG>::run(ITensorPack &tensors) CpuAuxTensorHandler max(offset_int_vec(InternalTensorIdx::MAX), _max, tensors, true); CpuAuxTensorHandler input_permuted(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), _input_permuted, tensors, true); - CpuAuxTensorHandler output_permuted(offset_int_vec(InternalTensorIdx::PERMUTED_DST), _output_permuted, tensors, true); + CpuAuxTensorHandler output_permuted(offset_int_vec(InternalTensorIdx::PERMUTED_DST), _output_permuted, tensors, + true); ITensorPack max_pack; ITensorPack softmax_pack; - if(_needs_permute) + if (_needs_permute) { - ITensorPack permute_in_pack = { { TensorType::ACL_SRC, src }, { TensorType::ACL_DST, input_permuted.get() } }; + ITensorPack permute_in_pack = {{TensorType::ACL_SRC, src}, {TensorType::ACL_DST, input_permuted.get()}}; _permute_input.run(permute_in_pack); - max_pack = { { TensorType::ACL_SRC, input_permuted.get() }, { TensorType::ACL_DST, max.get() } }; + max_pack = {{TensorType::ACL_SRC, 
input_permuted.get()}, {TensorType::ACL_DST, max.get()}}; - softmax_pack = - { - { TensorType::ACL_SRC_0, input_permuted.get() }, - { TensorType::ACL_SRC_1, max.get() }, - { TensorType::ACL_DST_0, output_permuted.get() }, - { TensorType::ACL_DST_1, tmp.get() } - }; + softmax_pack = {{TensorType::ACL_SRC_0, input_permuted.get()}, + {TensorType::ACL_SRC_1, max.get()}, + {TensorType::ACL_DST_0, output_permuted.get()}, + {TensorType::ACL_DST_1, tmp.get()}}; } else { - max_pack = { { TensorType::ACL_SRC, src }, { TensorType::ACL_DST, max.get() } }; - - softmax_pack = - { - { TensorType::ACL_SRC_0, src }, - { TensorType::ACL_SRC_1, max.get() }, - { TensorType::ACL_DST_0, dst }, - { TensorType::ACL_DST_1, tmp.get() } - }; + max_pack = {{TensorType::ACL_SRC, src}, {TensorType::ACL_DST, max.get()}}; + + softmax_pack = {{TensorType::ACL_SRC_0, src}, + {TensorType::ACL_SRC_1, max.get()}, + {TensorType::ACL_DST_0, dst}, + {TensorType::ACL_DST_1, tmp.get()}}; } NEScheduler::get().schedule_op(_max_kernel.get(), Window::DimY, _max_kernel->window(), max_pack); NEScheduler::get().schedule_op(_softmax_kernel.get(), Window::DimY, _softmax_kernel->window(), softmax_pack); - if(_needs_permute) + if (_needs_permute) { ITensorPack permute_out_pack; permute_out_pack.add_tensor(TensorType::ACL_SRC, output_permuted.get()); @@ -211,7 +224,7 @@ void CpuSoftmaxGeneric<IS_LOG>::run(ITensorPack &tensors) } } -template <bool IS_LOG> +template <bool IS_LOG> experimental::MemoryRequirements CpuSoftmaxGeneric<IS_LOG>::workspace() const { return _aux_mem; diff --git a/src/cpu/operators/CpuSoftmax.h b/src/cpu/operators/CpuSoftmax.h index 64df8704f9..8cab70e14f 100644 --- a/src/cpu/operators/CpuSoftmax.h +++ b/src/cpu/operators/CpuSoftmax.h @@ -24,11 +24,13 @@ #ifndef ARM_COMPUTE_CPU_SOFTMAX_H #define ARM_COMPUTE_CPU_SOFTMAX_H -#include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/experimental/Types.h" +#include "arm_compute/core/TensorInfo.h" + #include "src/cpu/ICpuKernel.h" #include "src/cpu/ICpuOperator.h" #include "src/cpu/operators/CpuPermute.h" + #include <memory> namespace arm_compute @@ -77,7 +79,7 @@ public: static Status validate(const ITensorInfo *src, const ITensorInfo *dst, float beta = 1.0f, int32_t axis = 0); // Inherited methods overridden: - void run(ITensorPack &tensors) override; + void run(ITensorPack &tensors) override; experimental::MemoryRequirements workspace() const override; private: diff --git a/src/cpu/operators/CpuSub.cpp b/src/cpu/operators/CpuSub.cpp index 91a5b6e63c..7d27efbc96 100644 --- a/src/cpu/operators/CpuSub.cpp +++ b/src/cpu/operators/CpuSub.cpp @@ -23,17 +23,20 @@ */ #include "src/cpu/operators/CpuSub.h" -#include "src/cpu/kernels/CpuSubKernel.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" #include "src/common/utils/Log.h" - -#include "arm_compute/runtime/NEON/NEScheduler.h" +#include "src/cpu/kernels/CpuSubKernel.h" namespace arm_compute { namespace cpu { -void CpuSub::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info) +void CpuSub::configure(const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(act_info); ARM_COMPUTE_LOG_PARAMS(src0, src1, dst, policy); @@ -42,7 +45,11 @@ void CpuSub::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensor _kernel = std::move(k); } -Status CpuSub::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, 
ConvertPolicy policy, const ActivationLayerInfo &act_info) +Status CpuSub::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); return kernels::CpuSubKernel::validate(src0, src1, dst, policy); diff --git a/src/cpu/operators/CpuSub.h b/src/cpu/operators/CpuSub.h index 88908637aa..d1782a1d3c 100644 --- a/src/cpu/operators/CpuSub.h +++ b/src/cpu/operators/CpuSub.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_SUB_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/cpu/ICpuOperator.h" namespace arm_compute @@ -53,14 +54,22 @@ public: * @param[in] policy Policy to use to handle overflow. Convert policy cannot be WRAP if datatype is quantized. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. */ - void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref CpuSub::configure() * * @return a status */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run(ITensorPack &tensors) override; diff --git a/src/cpu/operators/CpuTranspose.cpp b/src/cpu/operators/CpuTranspose.cpp index 4e7854fd6e..ea548e0511 100644 --- a/src/cpu/operators/CpuTranspose.cpp +++ b/src/cpu/operators/CpuTranspose.cpp @@ -23,9 +23,8 @@ */ #include "src/cpu/operators/CpuTranspose.h" -#include "src/cpu/kernels/CpuTransposeKernel.h" - #include "src/common/utils/Log.h" +#include "src/cpu/kernels/CpuTransposeKernel.h" namespace arm_compute { @@ -43,5 +42,5 @@ Status CpuTranspose::validate(const ITensorInfo *src, const ITensorInfo *dst) { return kernels::CpuTransposeKernel::validate(src, dst); } -} // namesapce cpu +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/operators/CpuWinogradConv2d.cpp b/src/cpu/operators/CpuWinogradConv2d.cpp index c4edd89964..9d07736c13 100644 --- a/src/cpu/operators/CpuWinogradConv2d.cpp +++ b/src/cpu/operators/CpuWinogradConv2d.cpp @@ -22,23 +22,25 @@ * SOFTWARE. 
*/ #include "src/cpu/operators/CpuWinogradConv2d.h" + #include "arm_compute/core/Error.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/FunctionDescriptors.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/CPP/Validate.h" +#include "src/core/helpers/MemoryHelpers.h" +#include "src/core/helpers/WindowHelpers.h" #include "src/core/NEON/kernels/assembly/winograd.hpp" #include "src/core/NEON/kernels/convolution/common/tensor.hpp" #include "src/core/NEON/kernels/convolution/common/utils.hpp" -#include "src/core/helpers/MemoryHelpers.h" -#include "src/core/helpers/WindowHelpers.h" #include "src/core/utils/AssemblyUtils.h" -#include "src/cpu/kernels/CpuWinogradConv2dKernel.h" #include "src/cpu/kernels/assembly/arm_gemm.hpp" +#include "src/cpu/kernels/CpuWinogradConv2dKernel.h" #include "src/cpu/operators/CpuActivation.h" #include "src/cpu/operators/CpuPermute.h" #include "src/cpu/utils/CpuAuxTensorHandler.h" @@ -56,21 +58,26 @@ namespace inline Tensor4DShape internal_get_shape(const ITensorInfo *in) { const DataLayout data_layout = in->data_layout(); - const int in_width = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH)); - const int in_height = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT)); - const int in_channels = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL)); - const int in_batches = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES)); + const int in_width = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH)); + const int in_height = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT)); + const int in_channels = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL)); + const int in_batches = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES)); - return Tensor4DShape{ in_batches, in_height, in_width, in_channels }; + return Tensor4DShape{in_batches, in_height, in_width, in_channels}; } -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info) +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info) { ARM_COMPUTE_UNUSED(dst, weights); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, "Winograd layer only supports unit strides."); - if(biases != nullptr) + ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, + "Winograd layer only supports unit strides."); + if (biases != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases); ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); @@ -80,43 +87,46 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co return Status{}; } -bool get_winograd_kernel_implementation(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, - const PadStrideInfo 
&conv_info, const ActivationLayerInfo &act_info, bool enable_fast_math, - arm_conv::winograd::WinogradImpl *winograd_impl, std::unique_ptr<arm_conv::ConvolutionArgs> &conv_args) +bool get_winograd_kernel_implementation(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + arm_conv::winograd::WinogradImpl *winograd_impl, + std::unique_ptr<arm_conv::ConvolutionArgs> &conv_args) { arm_conv::winograd::WinogradConfig winograd_cfg; arm_gemm::GemmConfig cfg; const DataType data_type = src->data_type(); - Tensor4DShape in_shape{ internal_get_shape(src) }; - Tensor4DShape out_shape{ internal_get_shape(dst) }; - Tensor4DShape kernel_shape{ internal_get_shape(weights) }; + Tensor4DShape in_shape{internal_get_shape(src)}; + Tensor4DShape out_shape{internal_get_shape(dst)}; + Tensor4DShape kernel_shape{internal_get_shape(weights)}; uint32_t nthreads = NEScheduler::get().num_threads(); // Get configuration arguments for Winograd winograd_cfg.output_rows = 0; winograd_cfg.output_cols = 0; conv_args = std::make_unique<arm_conv::ConvolutionArgs>( - in_shape.n_batches, - arm_conv::Shape2D{ static_cast<uint32_t>(in_shape.n_rows), static_cast<uint32_t>(in_shape.n_cols) }, - in_shape.n_channels, - conv_info.pad_top(), - conv_info.pad_left(), - arm_conv::Shape2D{ static_cast<uint32_t>(out_shape.n_rows), static_cast<uint32_t>(out_shape.n_cols) }, - out_shape.n_channels, - arm_conv::Shape2D{ static_cast<uint32_t>(kernel_shape.n_rows), static_cast<uint32_t>(kernel_shape.n_cols) }, - assembly_utils::map_to_arm_gemm_activation(act_info)); + in_shape.n_batches, + arm_conv::Shape2D{static_cast<uint32_t>(in_shape.n_rows), static_cast<uint32_t>(in_shape.n_cols)}, + in_shape.n_channels, conv_info.pad_top(), conv_info.pad_left(), + arm_conv::Shape2D{static_cast<uint32_t>(out_shape.n_rows), static_cast<uint32_t>(out_shape.n_cols)}, + out_shape.n_channels, + arm_conv::Shape2D{static_cast<uint32_t>(kernel_shape.n_rows), static_cast<uint32_t>(kernel_shape.n_cols)}, + assembly_utils::map_to_arm_gemm_activation(act_info)); bool success = false; - if(data_type == DataType::F32) + if (data_type == DataType::F32) { - success = arm_conv::winograd::get_implementation<float>( - *winograd_impl, &CPUInfo::get(), *conv_args, nthreads, enable_fast_math, &winograd_cfg, nullptr); + success = arm_conv::winograd::get_implementation<float>(*winograd_impl, &CPUInfo::get(), *conv_args, nthreads, + enable_fast_math, &winograd_cfg, nullptr); } #if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - else if(data_type == DataType::F16) + else if (data_type == DataType::F16) { - success = arm_conv::winograd::get_implementation<__fp16>( - *winograd_impl, &CPUInfo::get(), *conv_args, nthreads, enable_fast_math, &winograd_cfg, nullptr); + success = arm_conv::winograd::get_implementation<__fp16>(*winograd_impl, &CPUInfo::get(), *conv_args, nthreads, + enable_fast_math, &winograd_cfg, nullptr); } #endif // defined(__aarch64__) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) else @@ -127,7 +137,8 @@ bool get_winograd_kernel_implementation(const ITensorInfo *src, const ITensorInf } inline bool fuse_function_supported(const ActivationLayerInfo &act_info) { - return act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU || act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU; + return act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU || + 
act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU; } } // namespace @@ -141,7 +152,7 @@ CpuWinogradConv2d::CpuWinogradConv2d() _permute_output(std::make_unique<CpuPermute>()), _permute_weights(std::make_unique<CpuPermute>()), _aux_mem(AuxTensorIdx::Count), - _conv_args{ nullptr }, + _conv_args{nullptr}, _winograd_impl{}, _data_layout(), _winograd_transformed_input{}, @@ -152,15 +163,20 @@ CpuWinogradConv2d::CpuWinogradConv2d() _weights_hwio(), _input_nhwc(), _output_nhwc(), - _is_prepared{ false }, - _run_activation{ false } + _is_prepared{false}, + _run_activation{false} { } CpuWinogradConv2d::~CpuWinogradConv2d() = default; -void CpuWinogradConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, bool enable_fast_math) +void CpuWinogradConv2d::configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); ARM_COMPUTE_ERROR_THROW_ON(validate(src, weights, biases, dst, conv_info, act_info, enable_fast_math)); @@ -169,21 +185,29 @@ void CpuWinogradConv2d::configure(const ITensorInfo *src, const ITensorInfo *wei const DataType data_type = src->data_type(); uint32_t nthreads = NEScheduler::get().num_threads(); _data_layout = src->data_layout(); - const Tensor4DShape kernel_shape{ internal_get_shape(weights) }; - - bool success = get_winograd_kernel_implementation(src, weights, dst, conv_info, act_info, enable_fast_math, &_winograd_impl, _conv_args); - - ARM_COMPUTE_EXIT_ON_MSG_VAR(!success, "Unsupported kernel size: %d x %d.\n", kernel_shape.n_rows, kernel_shape.n_cols); - ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using input transform: %s\n", _winograd_impl.input_transform->get_name().c_str()); - ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using weight transform: %s\n", _winograd_impl.input_transform->get_name().c_str()); - ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using output transform: %s\n", _winograd_impl.input_transform->get_name().c_str()); - - const bool has_impl = ((_winograd_impl.input_transform != nullptr) && (_winograd_impl.output_transform != nullptr) && (_winograd_impl.gemm_args != nullptr)); - if(has_impl) + const Tensor4DShape kernel_shape{internal_get_shape(weights)}; + + bool success = get_winograd_kernel_implementation(src, weights, dst, conv_info, act_info, enable_fast_math, + &_winograd_impl, _conv_args); + + ARM_COMPUTE_EXIT_ON_MSG_VAR(!success, "Unsupported kernel size: %d x %d.\n", kernel_shape.n_rows, + kernel_shape.n_cols); + ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using input transform: %s\n", + _winograd_impl.input_transform->get_name().c_str()); + ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using weight transform: %s\n", + _winograd_impl.input_transform->get_name().c_str()); + ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using output transform: %s\n", + _winograd_impl.input_transform->get_name().c_str()); + + const bool has_impl = ((_winograd_impl.input_transform != nullptr) && + (_winograd_impl.output_transform != nullptr) && (_winograd_impl.gemm_args != nullptr)); + if (has_impl) { // Determine how much working space is 
required, allocate it. - const size_t input_workspace_size = _winograd_impl.input_transform->get_working_space_size(*_conv_args, nthreads); - const size_t output_workspace_size = _winograd_impl.output_transform->get_working_space_size(*_conv_args, nthreads); + const size_t input_workspace_size = + _winograd_impl.input_transform->get_working_space_size(*_conv_args, nthreads); + const size_t output_workspace_size = + _winograd_impl.output_transform->get_working_space_size(*_conv_args, nthreads); TensorInfo input_workspace_info(TensorShape(input_workspace_size), 1, DataType::U8); TensorInfo output_workspace_info(TensorShape(output_workspace_size), 1, DataType::U8); @@ -232,7 +256,7 @@ void CpuWinogradConv2d::configure(const ITensorInfo *src, const ITensorInfo *wei PermutationVector weights_permutation_vector(3U, 0U, 1U, 2U); // Configure the kernel to transform the input tensor from NCHW -> NHWC - if(_data_layout == DataLayout::NCHW) + if (_data_layout == DataLayout::NCHW) { _permute_input->configure(src, &_input_nhwc, PermutationVector(2U, 0U, 1U)); weights_permutation_vector = PermutationVector(3U, 2U, 0U, 1U); @@ -242,28 +266,30 @@ void CpuWinogradConv2d::configure(const ITensorInfo *src, const ITensorInfo *wei _permute_weights->configure(weights, &_weights_hwio, weights_permutation_vector); // Reorder the convoluted output to ACL's ordering NCHW - if(_data_layout == DataLayout::NCHW) + if (_data_layout == DataLayout::NCHW) { // configure and allocate dst tensor to be used to convert from winograd domain to spatial domain when calling to reshape_output() - TensorInfo info(TensorShape(dst->dimension(2), dst->dimension(0), - dst->dimension(1), dst->dimension(3)), - 1, dst->data_type()); + TensorInfo info(TensorShape(dst->dimension(2), dst->dimension(0), dst->dimension(1), dst->dimension(3)), 1, + dst->data_type()); _output_nhwc = info; _permute_output->configure(&_output_nhwc, dst, PermutationVector(1U, 2U, 0U)); } // Configure input transform kernel - _transform_input_kernel = std::make_unique<CpuWinogradConv2dTransformInputKernel>(_winograd_impl, *_conv_args, nthreads); + _transform_input_kernel = + std::make_unique<CpuWinogradConv2dTransformInputKernel>(_winograd_impl, *_conv_args, nthreads); // Configure GEMM function - _gemm_function->configure(&_winograd_transformed_input, &_winograd_transformed_weights, nullptr, &_winograd_transformed_output, 1.0f, 0.f); + _gemm_function->configure(&_winograd_transformed_input, &_winograd_transformed_weights, nullptr, + &_winograd_transformed_output, 1.0f, 0.f); // Configure output transform kernel - _transform_output_kernel = std::make_unique<CpuWinogradConv2dTransformOutputKernel>(_winograd_impl, *_conv_args, nthreads); + _transform_output_kernel = + std::make_unique<CpuWinogradConv2dTransformOutputKernel>(_winograd_impl, *_conv_args, nthreads); //Configure Activation Layer _run_activation = act_info.enabled() && !fuse_function_supported(act_info); - if(_run_activation) + if (_run_activation) { _activation_func->configure(dst, nullptr, act_info); } @@ -276,40 +302,55 @@ void CpuWinogradConv2d::configure(const ITensorInfo *src, const ITensorInfo *wei _aux_mem[TempResult] = asm_mem_req[TempResult]; // Request temporary memory. Overlap memory needed for Input/Output transformations as they run on different non-overlapping time-steps. 
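The comment above is the key to how this operator budgets memory: scratch needed by the input and output transforms is never live at the same time, so it is requested through a single Temporary slot sized for the larger of the two, weights permuted only as an intermediate step of prepare() get the Prepare lifetime, and the Winograd-transformed weights, which every later run() reuses, are requested as Persistent. A minimal sketch of that request pattern follows; it uses only helpers that already appear in this file (MemoryInfo, MemoryLifetime, offset_int_vec), while the slot names, sizes and the make_requirements() helper itself are hypothetical.

    // Sketch only: slot ids, sizes and this helper are illustrative, not the operator's real ones.
    #include "arm_compute/core/experimental/Types.h" // MemoryInfo, MemoryLifetime, MemoryRequirements
    #include "src/core/helpers/MemoryHelpers.h"      // offset_int_vec
    #include <algorithm>
    #include <cstddef>

    using namespace arm_compute;
    using namespace arm_compute::experimental;

    enum Slot : int { ScratchIO = 0, PermutedW = 1, TransformedW = 2, Count = 3 };

    MemoryRequirements make_requirements(size_t in_ws, size_t out_ws, size_t perm_w, size_t transf_w, size_t align)
    {
        MemoryRequirements reqs(Count);
        // Input- and output-transform scratch run on disjoint time-steps: one Temporary slot, max of both sizes.
        reqs[ScratchIO]    = MemoryInfo(offset_int_vec(ScratchIO), MemoryLifetime::Temporary, std::max(in_ws, out_ws));
        // Only alive while prepare() executes.
        reqs[PermutedW]    = MemoryInfo(offset_int_vec(PermutedW), MemoryLifetime::Prepare, perm_w);
        // Produced once in prepare(), consumed by every run(), so it must persist between calls.
        reqs[TransformedW] = MemoryInfo(offset_int_vec(TransformedW), MemoryLifetime::Persistent, transf_w, align);
        return reqs;
    }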
- _aux_mem[TransformedInput] = MemoryInfo(offset_int_vec(TransformedInput), MemoryLifetime::Temporary, wds.input_matrix_size_bytes, storage_alignment); - _aux_mem[TransformedOutput] = MemoryInfo(offset_int_vec(TransformedOutput), MemoryLifetime::Temporary, wds.output_matrix_size_bytes, storage_alignment); - _aux_mem[WorkspaceIO] = MemoryInfo(offset_int_vec(WorkspaceIO), MemoryLifetime::Temporary, std::max(input_workspace_size, output_workspace_size)); - _aux_mem[PermutedWeights] = MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Prepare, _weights_hwio.total_size()); - _aux_mem[TransformedWeights] = MemoryInfo(offset_int_vec(TransformedWeights), MemoryLifetime::Persistent, wds.weight_matrix_size_bytes, storage_alignment); - if(_data_layout == DataLayout::NCHW) + _aux_mem[TransformedInput] = MemoryInfo(offset_int_vec(TransformedInput), MemoryLifetime::Temporary, + wds.input_matrix_size_bytes, storage_alignment); + _aux_mem[TransformedOutput] = MemoryInfo(offset_int_vec(TransformedOutput), MemoryLifetime::Temporary, + wds.output_matrix_size_bytes, storage_alignment); + _aux_mem[WorkspaceIO] = MemoryInfo(offset_int_vec(WorkspaceIO), MemoryLifetime::Temporary, + std::max(input_workspace_size, output_workspace_size)); + _aux_mem[PermutedWeights] = + MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Prepare, _weights_hwio.total_size()); + _aux_mem[TransformedWeights] = MemoryInfo(offset_int_vec(TransformedWeights), MemoryLifetime::Persistent, + wds.weight_matrix_size_bytes, storage_alignment); + if (_data_layout == DataLayout::NCHW) { _aux_mem[PermutedInput].merge(offset_int_vec(PermutedInput), src->total_size()); _aux_mem[PermutedOutput].merge(offset_int_vec(PermutedOutput), dst->total_size()); } } } -Status CpuWinogradConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, bool enable_fast_math) +Status CpuWinogradConv2d::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, conv_info)); // Disable winograd for fp16 if fast math is false. 
- if(!enable_fast_math) + if (!enable_fast_math) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32); } - const Tensor4DShape kernel_shape{ internal_get_shape(weights) }; + const Tensor4DShape kernel_shape{internal_get_shape(weights)}; arm_conv::winograd::WinogradImpl winograd_impl{}; std::unique_ptr<arm_conv::ConvolutionArgs> conv_args; - const bool success = get_winograd_kernel_implementation(src, weights, dst, conv_info, act_info, enable_fast_math, &winograd_impl, conv_args); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(success == false, "Unsupported kernel size: %d x %d.\n", kernel_shape.n_rows, kernel_shape.n_cols); - ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using input transform: %s\n", winograd_impl.input_transform->get_name().c_str()); - ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using weight transform: %s\n", winograd_impl.input_transform->get_name().c_str()); - ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using output transform: %s\n", winograd_impl.input_transform->get_name().c_str()); + const bool success = get_winograd_kernel_implementation(src, weights, dst, conv_info, act_info, enable_fast_math, + &winograd_impl, conv_args); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(success == false, "Unsupported kernel size: %d x %d.\n", kernel_shape.n_rows, + kernel_shape.n_cols); + ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using input transform: %s\n", + winograd_impl.input_transform->get_name().c_str()); + ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using weight transform: %s\n", + winograd_impl.input_transform->get_name().c_str()); + ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using output transform: %s\n", + winograd_impl.input_transform->get_name().c_str()); return Status{}; } @@ -328,24 +369,29 @@ void CpuWinogradConv2d::run(ITensorPack &tensors) // Wrap the winograd-domain tensorInfos created in configuration in tensors and allocate the required memory. CpuAuxTensorHandler input_nhwc(offset_int_vec(PermutedInput), _input_nhwc, tensors, true); - CpuAuxTensorHandler winograd_input_transformed(offset_int_vec(TransformedInput), _winograd_transformed_input, tensors, true); + CpuAuxTensorHandler winograd_input_transformed(offset_int_vec(TransformedInput), _winograd_transformed_input, + tensors, true); CpuAuxTensorHandler input_workspace(offset_int_vec(WorkspaceIO), _input_workspace, tensors, true); const bool is_nchw = _data_layout == DataLayout::NCHW; - if(is_nchw) + if (is_nchw) { //Bring channels to the front as Winograd code expects the tensor to be in the format NHWC - ITensorPack pack{ { ACL_SRC, src }, { ACL_DST, input_nhwc.get() } }; + ITensorPack pack{{ACL_SRC, src}, {ACL_DST, input_nhwc.get()}}; _permute_input->run(pack); } - CpuAuxTensorHandler winograd_output_transformed(offset_int_vec(TransformedOutput), _winograd_transformed_output, tensors, true); + CpuAuxTensorHandler winograd_output_transformed(offset_int_vec(TransformedOutput), _winograd_transformed_output, + tensors, true); CpuAuxTensorHandler output_workspace(offset_int_vec(WorkspaceIO), _output_workspace, tensors, true); CpuAuxTensorHandler output_nhwc(offset_int_vec(PermutedOutput), _output_nhwc, tensors, true); - ITensorPack transform_input_pack{ { ACL_SRC, is_nchw ? 
input_nhwc.get() : src }, { ACL_DST, winograd_input_transformed.get() }, { ACL_INT, input_workspace.get() } }; + ITensorPack transform_input_pack{{ACL_SRC, is_nchw ? input_nhwc.get() : src}, + {ACL_DST, winograd_input_transformed.get()}, + {ACL_INT, input_workspace.get()}}; NEScheduler::get().schedule_op(_transform_input_kernel.get(), Window::DimX, win, transform_input_pack); - CpuAuxTensorHandler winograd_weights_transformed(offset_int_vec(TransformedWeights), _winograd_transformed_weights, tensors, true); + CpuAuxTensorHandler winograd_weights_transformed(offset_int_vec(TransformedWeights), _winograd_transformed_weights, + tensors, true); // Run 16 GEMMs in multiple threads, each kernel runs one or more GEMMs ITensorPack gemm_pack = tensors; @@ -356,30 +402,34 @@ void CpuWinogradConv2d::run(ITensorPack &tensors) _gemm_function->run(gemm_pack); // Output transform - ITensorPack transform_output_pack{ { ACL_SRC_0, winograd_output_transformed.get() }, { ACL_DST, is_nchw ? output_nhwc.get() : output }, { ACL_SRC_1, biases }, { ACL_INT, output_workspace.get() } }; + ITensorPack transform_output_pack{{ACL_SRC_0, winograd_output_transformed.get()}, + {ACL_DST, is_nchw ? output_nhwc.get() : output}, + {ACL_SRC_1, biases}, + {ACL_INT, output_workspace.get()}}; NEScheduler::get().schedule_op(_transform_output_kernel.get(), Window::DimX, win, transform_output_pack); - if(is_nchw) + if (is_nchw) { // Reorder the convoluted output to ACL's ordering NCHW - ITensorPack pack{ { ACL_SRC, output_nhwc.get() }, { ACL_DST, output } }; + ITensorPack pack{{ACL_SRC, output_nhwc.get()}, {ACL_DST, output}}; _permute_output->run(pack); } - if(_run_activation) + if (_run_activation) { - ITensorPack pack{ { ACL_SRC, output }, { ACL_DST, output } }; + ITensorPack pack{{ACL_SRC, output}, {ACL_DST, output}}; _activation_func->run(pack); } } void CpuWinogradConv2d::prepare(ITensorPack &tensors) { - if(!_is_prepared) + if (!_is_prepared) { - const ITensor *weights = tensors.get_const_tensor(ACL_SRC_1); - ITensor *weights_aux = utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(PermutedWeights))); + const ITensor *weights = tensors.get_const_tensor(ACL_SRC_1); + ITensor *weights_aux = + utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(PermutedWeights))); CpuAuxTensorHandler permuted_weights(_weights_hwio, *weights_aux); - ITensorPack permute_tensors{ { ACL_SRC, weights }, { ACL_DST, permuted_weights.get() } }; + ITensorPack permute_tensors{{ACL_SRC, weights}, {ACL_DST, permuted_weights.get()}}; _permute_weights->run(permute_tensors); const int element_size_in_bytes = permuted_weights.get()->info()->element_size(); // Weights were in OHWI format, before being permuted "permuted_weights" to be in HWIO format. 
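The hunks on either side of this point follow the library's stateless-operator convention: configure() and validate() see only ITensorInfo descriptors, while prepare() and run() receive the concrete buffers through an ITensorPack and materialise the auxiliary slots declared in configure() with CpuAuxTensorHandler. A short caller-side sketch of that contract, in which op, src, dst, scratch_buffer, scratch_info and the slot id ScratchIO are all assumed rather than taken from this file:

    // Caller-side sketch only; every identifier below except the ACL types is hypothetical.
    ITensorPack tensors;
    tensors.add_tensor(TensorType::ACL_SRC, src);                  // caller-owned input
    tensors.add_tensor(TensorType::ACL_DST, dst);                  // caller-owned output
    tensors.add_tensor(offset_int_vec(ScratchIO), scratch_buffer); // backing memory for a declared aux slot

    op.prepare(tensors); // one-off work, such as the weight permutation shown just above
    op.run(tensors);     // per-inference work; internally the operator wraps the slot, e.g.
                         //   CpuAuxTensorHandler ws(offset_int_vec(ScratchIO), scratch_info, tensors, true);
                         //   ... and then uses ws.get() like any other ITensor ...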
@@ -387,31 +437,32 @@ void CpuWinogradConv2d::prepare(ITensorPack &tensors) const unsigned int width_idx = 2; // W in HWIO const unsigned int channel_idx = 1; // I in HWIO - const int permuted_weight_row_stride = permuted_weights.get()->info()->strides_in_bytes()[height_idx] / element_size_in_bytes; - const int permuted_weight_col_stride = permuted_weights.get()->info()->strides_in_bytes()[width_idx] / element_size_in_bytes; - const int permuted_weight_channel_stride = permuted_weights.get()->info()->strides_in_bytes()[channel_idx] / element_size_in_bytes; + const int permuted_weight_row_stride = + permuted_weights.get()->info()->strides_in_bytes()[height_idx] / element_size_in_bytes; + const int permuted_weight_col_stride = + permuted_weights.get()->info()->strides_in_bytes()[width_idx] / element_size_in_bytes; + const int permuted_weight_channel_stride = + permuted_weights.get()->info()->strides_in_bytes()[channel_idx] / element_size_in_bytes; // Wrap the winograd-domain transformed weight TensorInfo in Auxiliary tensor and allocate the required memory. - ITensor *weights_transf = utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(TransformedWeights))); + ITensor *weights_transf = + utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(TransformedWeights))); ARM_COMPUTE_ERROR_ON_NULLPTR(weights_transf); CpuAuxTensorHandler winograd_transformed_weights(_winograd_transformed_weights, *weights_transf); const void *permuted_weights_ptr; void *win_wght_transf_ptr; - permuted_weights_ptr = reinterpret_cast<const void *>(permuted_weights.get()->buffer() + permuted_weights.get()->info()->offset_first_element_in_bytes()); - win_wght_transf_ptr = reinterpret_cast<void *>(winograd_transformed_weights.get()->buffer() + winograd_transformed_weights.get()->info()->offset_first_element_in_bytes()); + permuted_weights_ptr = reinterpret_cast<const void *>( + permuted_weights.get()->buffer() + permuted_weights.get()->info()->offset_first_element_in_bytes()); + win_wght_transf_ptr = + reinterpret_cast<void *>(winograd_transformed_weights.get()->buffer() + + winograd_transformed_weights.get()->info()->offset_first_element_in_bytes()); // Prepare Weights _winograd_impl.weight_transform->execute( - *_conv_args, - permuted_weights_ptr, - permuted_weight_row_stride, - permuted_weight_col_stride, - permuted_weight_channel_stride, - win_wght_transf_ptr, - _winograd_impl.winograd_spec, - 0, 1 // Thread 1 of 1 + *_conv_args, permuted_weights_ptr, permuted_weight_row_stride, permuted_weight_col_stride, + permuted_weight_channel_stride, win_wght_transf_ptr, _winograd_impl.winograd_spec, 0, 1 // Thread 1 of 1 ); ITensorPack gemm_pack = tensors; gemm_pack.add_const_tensor(ACL_SRC_1, winograd_transformed_weights.get()); diff --git a/src/cpu/operators/CpuWinogradConv2d.h b/src/cpu/operators/CpuWinogradConv2d.h index e0df34e2db..7e1d952462 100644 --- a/src/cpu/operators/CpuWinogradConv2d.h +++ b/src/cpu/operators/CpuWinogradConv2d.h @@ -26,10 +26,11 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/runtime/FunctionDescriptors.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuOperator.h" -#include "src/cpu/kernels/CpuWinogradConv2dKernel.h" #include "src/cpu/kernels/assembly/gemm_common.hpp" +#include "src/cpu/kernels/CpuWinogradConv2dKernel.h" #include "src/cpu/operators/CpuActivation.h" #include "src/cpu/operators/CpuGemm.h" #include "src/cpu/operators/CpuPermute.h" @@ -73,7 +74,11 @@ public: * @param[in] enable_fast_math (Optional) Enable fast 
math computation. In case this flag were set, the function could dispatch the fastest implementation * available which may introduce a drop of accuracy as well. Default is false */ - void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info, + void configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false); /** Static function to check if given info will lead to a valid configuration of @ref CpuWinogradConv2d @@ -82,13 +87,17 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info, + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false); // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &constants) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &constants) override; experimental::MemoryRequirements workspace() const override; private: @@ -108,27 +117,28 @@ private: PermutedOutput = TransformedInput, Count = 10 }; - std::unique_ptr<CpuGemm> _gemm_function; - std::unique_ptr<CpuActivation> _activation_func; - std::unique_ptr<ICPPKernel> _transform_input_kernel; - std::unique_ptr<ICPPKernel> _transform_output_kernel; - std::unique_ptr<CpuPermute> _permute_input; - std::unique_ptr<CpuPermute> _permute_output; - std::unique_ptr<CpuPermute> _permute_weights; - experimental::MemoryRequirements _aux_mem{ Count }; - std::unique_ptr<arm_conv::ConvolutionArgs> _conv_args; // Make it unique ptr because this type does not have a default constructor - arm_conv::winograd::WinogradImpl _winograd_impl; - DataLayout _data_layout; - TensorInfo _winograd_transformed_input; - TensorInfo _winograd_transformed_output; - TensorInfo _winograd_transformed_weights; - TensorInfo _input_workspace; - TensorInfo _output_workspace; - TensorInfo _weights_hwio; - TensorInfo _input_nhwc; - TensorInfo _output_nhwc; - bool _is_prepared; - bool _run_activation; + std::unique_ptr<CpuGemm> _gemm_function; + std::unique_ptr<CpuActivation> _activation_func; + std::unique_ptr<ICPPKernel> _transform_input_kernel; + std::unique_ptr<ICPPKernel> _transform_output_kernel; + std::unique_ptr<CpuPermute> _permute_input; + std::unique_ptr<CpuPermute> _permute_output; + std::unique_ptr<CpuPermute> _permute_weights; + experimental::MemoryRequirements _aux_mem{Count}; + std::unique_ptr<arm_conv::ConvolutionArgs> + _conv_args; // Make it unique ptr because this type does not have a default constructor + arm_conv::winograd::WinogradImpl _winograd_impl; + DataLayout _data_layout; + TensorInfo _winograd_transformed_input; + TensorInfo _winograd_transformed_output; + TensorInfo _winograd_transformed_weights; + TensorInfo _input_workspace; + TensorInfo _output_workspace; + TensorInfo _weights_hwio; + TensorInfo _input_nhwc; + TensorInfo _output_nhwc; + bool _is_prepared; + bool _run_activation; }; } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp 
index 3069d6b541..343ef21c0b 100644 --- a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp +++ b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp @@ -24,12 +24,13 @@ #include "src/cpu/operators/internal/CpuGemmAssemblyDispatch.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/core/CPP/Validate.h" -#include "src/core/NEON/kernels/arm_gemm/utils.hpp" #include "src/core/helpers/MemoryHelpers.h" +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" #include "src/core/utils/AssemblyUtils.h" -#include "src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h" #include "src/cpu/kernels/assembly/arm_gemm.hpp" +#include "src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h" #include "src/cpu/utils/CpuAuxTensorHandler.h" #include <arm_neon.h> @@ -53,7 +54,12 @@ namespace * @param[in] num_threads Number of threads to run this method. Must be >= 1 */ template <typename TypeInput, typename TypeOutput> -void run_parallel_pretranspose_B_array(arm_gemm::GemmCommon<TypeInput, TypeOutput> *gemm_asm, ITensor *dst, const TypeInput *src, int src_ld, int src_multi_stride, unsigned int num_threads) +void run_parallel_pretranspose_B_array(arm_gemm::GemmCommon<TypeInput, TypeOutput> *gemm_asm, + ITensor *dst, + const TypeInput *src, + int src_ld, + int src_multi_stride, + unsigned int num_threads) { ARM_COMPUTE_ERROR_ON(gemm_asm == nullptr); ARM_COMPUTE_ERROR_ON(num_threads == 0); @@ -61,14 +67,14 @@ void run_parallel_pretranspose_B_array(arm_gemm::GemmCommon<TypeInput, TypeOutpu const unsigned int wsize = gemm_asm->get_B_pretranspose_window_size(); std::vector<IScheduler::Workload> workloads(num_threads); - for(unsigned int t = 0; t < num_threads; ++t) + for (unsigned int t = 0; t < num_threads; ++t) { - workloads[t] = [ = ](const ThreadInfo & info) + workloads[t] = [=](const ThreadInfo &info) { const unsigned int start = (info.thread_id * wsize) / num_threads; const unsigned int end = ((info.thread_id + 1) * wsize) / num_threads; - if(start < end) + if (start < end) { gemm_asm->pretranspose_B_array_part(dst->buffer(), src, src_ld, src_multi_stride, start, end); } @@ -113,7 +119,7 @@ Params extract_parameters(const ITensorInfo *a, const ITensorInfo *b, const ITen p.sections = 1; p.indirect = false; - if(info.method == AsmConvMethod::Conv || info.method == AsmConvMethod::Indirect) + if (info.method == AsmConvMethod::Conv || info.method == AsmConvMethod::Indirect) { p.indirect = true; p.sections = b->tensor_shape()[2] * b->tensor_shape()[3]; @@ -125,7 +131,7 @@ Params extract_parameters(const ITensorInfo *a, const ITensorInfo *b, const ITen } // Update M in case of GEMM3D for output - if(info.depth_output_gemm3d != 0) + if (info.depth_output_gemm3d != 0) { p.M = d->tensor_shape().y() * d->tensor_shape().z(); p.batches = d->tensor_shape().total_size_upper(3) / p.multis; @@ -139,19 +145,24 @@ IScheduler::Hints scheduling_hint_heuristic(arm_gemm::GemmMethod method, DataTyp // Schedule assembly kernel const int granule_threshold = 200; IScheduler::Hints scheduling_hint = IScheduler::Hints(Window::DimX); - if(method == arm_gemm::GemmMethod::GEMM_INTERLEAVED && data_type == DataType::F32) + if (method == arm_gemm::GemmMethod::GEMM_INTERLEAVED && data_type == DataType::F32) { scheduling_hint = IScheduler::Hints(Window::DimX, IScheduler::StrategyHint::DYNAMIC, granule_threshold); } - else if(method == arm_gemm::GemmMethod::GEMM_INTERLEAVED_2D && (data_type == DataType::F32 || data_type == DataType::F16 || data_type == DataType::U8 || data_type == DataType::S8)) + else if (method == 
arm_gemm::GemmMethod::GEMM_INTERLEAVED_2D && + (data_type == DataType::F32 || data_type == DataType::F16 || data_type == DataType::U8 || + data_type == DataType::S8)) { //GEMM_INTERLEAVED supports 2D parallelism, IScheduler::split_dimensions_all signals to parallelise over all window dimensions - scheduling_hint = IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold); + scheduling_hint = + IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold); } - else if(method == arm_gemm::GemmMethod::QUANTIZE_WRAPPER_2D && (data_type == DataType::QASYMM8 || data_type == DataType::QASYMM8_SIGNED)) + else if (method == arm_gemm::GemmMethod::QUANTIZE_WRAPPER_2D && + (data_type == DataType::QASYMM8 || data_type == DataType::QASYMM8_SIGNED)) { //special case for QASYMM8 to support 2D parallelism, scheduler here may be tweaked differently compared to FP32 case - scheduling_hint = IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold); + scheduling_hint = + IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold); } return scheduling_hint; @@ -175,8 +186,12 @@ public: * @param[in] gemm_info GEMM meta-data * @param[in] os Output stage meta-data. */ - void configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, - arm_gemm::GemmArgs args, const AsmGemmInfo &gemm_info, + void configure(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + ITensorInfo *d, + arm_gemm::GemmArgs args, + const AsmGemmInfo &gemm_info, const OutputStage &os = {}); /** Set requantization shifts to be used @@ -193,19 +208,20 @@ public: * * @return A tuple with the pointers to the shift and multiplier data respectively */ - std::tuple<bool, const int32_t *, const int32_t *, const int32_t *> set_requantize_data(const std::vector<int32_t> &shifts, - const std::vector<int32_t> &multipliers); + std::tuple<bool, const int32_t *, const int32_t *, const int32_t *> + set_requantize_data(const std::vector<int32_t> &shifts, const std::vector<int32_t> &multipliers); // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &tensors) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &tensors) override; bool is_configured() const override; experimental::MemoryRequirements workspace() const override; bool isVarWeightsKernel() const override { - if(!_gemm_kernel_asm) + if (!_gemm_kernel_asm) return false; - const arm_compute::WeightFormat wf = assembly_utils::map_to_arm_compute_weight_format(_gemm_kernel_asm->get_config().weight_format); + const arm_compute::WeightFormat wf = + assembly_utils::map_to_arm_compute_weight_format(_gemm_kernel_asm->get_config().weight_format); return wf != arm_compute::WeightFormat::UNSPECIFIED && wf != arm_compute::WeightFormat::ANY; } @@ -229,15 +245,15 @@ private: void prepare_indirect_buffer(ITensorPack &tensors); /** Assembly Gemm kernel */ - std::shared_ptr<arm_gemm::GemmCommon<TypeInput, TypeOutput>> _gemm_kernel_asm{ nullptr }; + std::shared_ptr<arm_gemm::GemmCommon<TypeInput, TypeOutput>> _gemm_kernel_asm{nullptr}; /** Optimised Arm® Neon™ kernel */ - std::unique_ptr<INEKernel> _optimised_kernel{ nullptr }; + std::unique_ptr<INEKernel> _optimised_kernel{nullptr}; /** Assembly GEMM workspace tensor info */ TensorInfo _workspace_info{}; /** Pre-transpose tensor info */ TensorInfo 
_pretranspose_info{}; /** Prepared flag */ - bool _is_prepared{ false }; + bool _is_prepared{false}; /** GEMM meta-data */ AsmGemmInfo _gemm_info{}; /** GEMM kernel description */ @@ -251,26 +267,27 @@ private: /** Indirect buffer */ std::unique_ptr<const TypeInput *const *, free_delete> _indirect_arg{}; std::unique_ptr<const TypeInput *, free_delete> _indirect_buf{}; - std::vector<TypeInput> _indirect_pad{}; - arm_gemm::ConvolutionParameters _cp{}; - experimental::MemoryRequirements _aux_mem{ Count }; - bool _B_pretranspose_required{ false }; - bool _is_b_constant{ true }; - bool _is_c_constant{ true }; + std::vector<TypeInput> _indirect_pad{}; + arm_gemm::ConvolutionParameters _cp{}; + experimental::MemoryRequirements _aux_mem{Count}; + bool _B_pretranspose_required{false}; + bool _is_b_constant{true}; + bool _is_c_constant{true}; }; template <typename TypeInput, typename TypeOutput, class OutputStage> std::tuple<bool, const int32_t *, const int32_t *, const int32_t *> -Fallback<TypeInput, TypeOutput, OutputStage>::set_requantize_data(const std::vector<int32_t> &shifts, const std::vector<int32_t> &multipliers) +Fallback<TypeInput, TypeOutput, OutputStage>::set_requantize_data(const std::vector<int32_t> &shifts, + const std::vector<int32_t> &multipliers) { _multipliers = multipliers; _shifts = shifts; bool need_left = false; - for(const auto s : _shifts) + for (const auto s : _shifts) { left_shifts.push_back(std::max(-s, int32_t(0))); right_shifts.push_back(std::min(-s, int32_t(0))); - if(s < 0 && !need_left) + if (s < 0 && !need_left) { need_left = true; } @@ -295,32 +312,35 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::prepare_indirect_buffer(ITens const int multi_size = batch_size * batches; const size_t multi_stride = multi_size / sizeof(TypeInput); - for(int64_t m = 0; m < multis; m++) + for (int64_t m = 0; m < multis; m++) { - for(int64_t b = 0; b < batches; b++) + for (int64_t b = 0; b < batches; b++) { - for(int64_t output_y = 0; output_y < _cp.output_height; output_y++) + for (int64_t output_y = 0; output_y < _cp.output_height; output_y++) { - for(int64_t output_x = 0; output_x < _cp.output_width; output_x++) + for (int64_t output_x = 0; output_x < _cp.output_width; output_x++) { int64_t output_xy = (output_y * _cp.output_width) + output_x; - for(int64_t kernel_y = 0; kernel_y < _cp.kernel_height; kernel_y++) + for (int64_t kernel_y = 0; kernel_y < _cp.kernel_height; kernel_y++) { - for(int64_t kernel_x = 0; kernel_x < _cp.kernel_width; kernel_x++) + for (int64_t kernel_x = 0; kernel_x < _cp.kernel_width; kernel_x++) { int64_t input_x = (output_x * _cp.output_stride_w) + kernel_x - _cp.padding_left; int64_t input_y = (output_y * _cp.output_stride_h) + kernel_y - _cp.padding_top; int64_t kernel_xy = (kernel_y * _cp.kernel_width) + kernel_x; int64_t input_xy = (input_y * _cp.input_width) + input_x; - if(input_x < 0 || input_x >= _cp.input_width || input_y < 0 || input_y >= _cp.input_height) + if (input_x < 0 || input_x >= _cp.input_width || input_y < 0 || input_y >= _cp.input_height) { - _indirect_buf.get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] = _indirect_pad.data(); + _indirect_buf + .get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] = + _indirect_pad.data(); } else { - _indirect_buf.get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] = + _indirect_buf + .get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] = A_ptr + (m * multi_stride_A + b * batch_stride_A 
+ input_xy * stride_A); } } @@ -332,12 +352,15 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::prepare_indirect_buffer(ITens } template <typename TypeInput, typename TypeOutput, class OutputStage> -void Fallback<TypeInput, TypeOutput, OutputStage>::configure_indirect(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *d, const AsmGemmInfo &info) +void Fallback<TypeInput, TypeOutput, OutputStage>::configure_indirect(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *d, + const AsmGemmInfo &info) { ARM_COMPUTE_ERROR_ON(!(info.method == AsmConvMethod::Conv || info.method == AsmConvMethod::Indirect)); float zeropad = 0.f; - if(is_data_type_quantized(a->data_type())) + if (is_data_type_quantized(a->data_type())) { zeropad = a->quantization_info().uniform().offset; } @@ -350,16 +373,25 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::configure_indirect(const ITen const int64_t output_width = static_cast<int64_t>(d->tensor_shape()[1]); const int64_t output_height = static_cast<int64_t>(d->tensor_shape()[2]); - _cp = { input_width, input_height, input_channels, kernel_width, kernel_height, output_width, output_height, - info.ps_info.stride().first, info.ps_info.stride().second, info.padding_top, info.padding_left, zeropad - }; - - if(info.method == AsmConvMethod::Conv) + _cp = {input_width, + input_height, + input_channels, + kernel_width, + kernel_height, + output_width, + output_height, + info.ps_info.stride().first, + info.ps_info.stride().second, + info.padding_top, + info.padding_left, + zeropad}; + + if (info.method == AsmConvMethod::Conv) { _gemm_kernel_asm->set_convolution_parameters(_cp); } - if(info.method == AsmConvMethod::Indirect) + if (info.method == AsmConvMethod::Indirect) { const unsigned int multis = 1; const unsigned int batches = a->tensor_shape().total_size_upper(3); @@ -372,19 +404,22 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::configure_indirect(const ITen const int multi_size = batch_size * batches; const size_t multi_stride = multi_size / sizeof(TypeInputPtr); - _indirect_buf = std::unique_ptr<const TypeInput *, free_delete>(reinterpret_cast<const TypeInput **>(malloc(multi_size * multis))); - _indirect_arg = std::unique_ptr<const TypeInput *const *, free_delete>(reinterpret_cast<const TypeInput *const **>(malloc(sizeof(TypeInput **) * kernel_hw * multis * batches))); + _indirect_buf = std::unique_ptr<const TypeInput *, free_delete>( + reinterpret_cast<const TypeInput **>(malloc(multi_size * multis))); + _indirect_arg = std::unique_ptr<const TypeInput *const *, free_delete>( + reinterpret_cast<const TypeInput *const **>(malloc(sizeof(TypeInput **) * kernel_hw * multis * batches))); _indirect_pad = std::vector<TypeInput>(_cp.input_channels, TypeInput(zeropad)); // Set indirect argument int64_t pos = 0; - for(int64_t m = 0; m < multis; m++) + for (int64_t m = 0; m < multis; m++) { - for(int64_t b = 0; b < batches; b++) + for (int64_t b = 0; b < batches; b++) { - for(int64_t kernel_xy = 0; kernel_xy < kernel_hw; kernel_xy++) + for (int64_t kernel_xy = 0; kernel_xy < kernel_hw; kernel_xy++) { - (_indirect_arg.get())[pos++] = _indirect_buf.get() + m * multi_stride + b * batch_stride + kernel_xy * output_hw; + (_indirect_arg.get())[pos++] = + _indirect_buf.get() + m * multi_stride + b * batch_stride + kernel_xy * output_hw; } } } @@ -394,8 +429,12 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::configure_indirect(const ITen } template <typename TypeInput, typename TypeOutput, class OutputStage> -void Fallback<TypeInput, 
TypeOutput, OutputStage>::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, - arm_gemm::GemmArgs args, const AsmGemmInfo &gemm_info, +void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + ITensorInfo *d, + arm_gemm::GemmArgs args, + const AsmGemmInfo &gemm_info, const OutputStage &os) { ARM_COMPUTE_UNUSED(c); @@ -404,7 +443,7 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensorInfo * _is_c_constant = c ? c->are_values_constant() : true; _gemm_kernel_asm = arm_gemm::gemm<TypeInput, TypeOutput, OutputStage>(args, os); - if(_gemm_kernel_asm == nullptr) + if (_gemm_kernel_asm == nullptr) { //configuration not supported: Leave function unconfigured: return; @@ -419,13 +458,14 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensorInfo * const size_t workspace_size = _gemm_kernel_asm->get_working_size(); const unsigned int alignment = 4096; _workspace_info = TensorInfo(TensorShape(workspace_size), 1, DataType::U8); - _aux_mem[AsmGemmWorkspace] = MemoryInfo(offset_int_vec(AsmGemmWorkspace), MemoryLifetime::Temporary, workspace_size, alignment); + _aux_mem[AsmGemmWorkspace] = + MemoryInfo(offset_int_vec(AsmGemmWorkspace), MemoryLifetime::Temporary, workspace_size, alignment); //if we disable this code below in brackets then ConvLayer deadlocks when threads > 1 and //the shapes are In=1x1x1024 Weights=1x1x1024x1001 Biases=1001 Out=1x1x1001 { const unsigned int window_size = _gemm_kernel_asm->get_window_size().total_size(); - if(window_size < static_cast<unsigned int>(args._maxthreads)) + if (window_size < static_cast<unsigned int>(args._maxthreads)) { _gemm_kernel_asm->set_nthreads(window_size); } @@ -434,18 +474,19 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensorInfo * _optimised_kernel = std::move(acl_gemm_wrapper); _gemm_info = gemm_info; // Check for pre-transposed support - if(_gemm_kernel_asm->B_pretranspose_required()) + if (_gemm_kernel_asm->B_pretranspose_required()) { // Forcing 128-byte alignment (required by 32-bit kernels) const unsigned int alignment = 128; const size_t B_pretranspose_size = _gemm_kernel_asm->get_B_pretransposed_array_size(); _pretranspose_info = TensorInfo(TensorShape(B_pretranspose_size), 1, DataType::U8); - _aux_mem[Pretranspose] = MemoryInfo(offset_int_vec(Pretranspose), MemoryLifetime::Persistent, B_pretranspose_size, alignment); - _B_pretranspose_required = true; + _aux_mem[Pretranspose] = + MemoryInfo(offset_int_vec(Pretranspose), MemoryLifetime::Persistent, B_pretranspose_size, alignment); + _B_pretranspose_required = true; } // Handle indirect GEMM convolution - if(gemm_info.method == AsmConvMethod::Conv || gemm_info.method == AsmConvMethod::Indirect) + if (gemm_info.method == AsmConvMethod::Conv || gemm_info.method == AsmConvMethod::Indirect) { configure_indirect(a, b, d, gemm_info); } @@ -454,34 +495,39 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensorInfo * template <typename TypeInput, typename TypeOutput, class OutputStage> void Fallback<TypeInput, TypeOutput, OutputStage>::prepare(ITensorPack &tensors) { - if(!_is_prepared) + if (!_is_prepared) { auto b = tensors.get_const_tensor(TensorType::ACL_SRC_1); auto c = tensors.get_const_tensor(TensorType::ACL_SRC_2); // Setup up matrix bias in the assembly kernel, it's just a pointer to matrix C. 
- if(c && c->info()->data_type() == DataType::S32) + if (c && c->info()->data_type() == DataType::S32) { - _gemm_kernel_asm->set_quantized_bias(reinterpret_cast<const int32_t *>(c->buffer() + c->info()->offset_first_element_in_bytes()), 0); + _gemm_kernel_asm->set_quantized_bias( + reinterpret_cast<const int32_t *>(c->buffer() + c->info()->offset_first_element_in_bytes()), 0); } // Pretranspose B if required - if(_gemm_kernel_asm->B_pretranspose_required()) + if (_gemm_kernel_asm->B_pretranspose_required()) { // Fixed format kernels need no pretranspose. - ARM_COMPUTE_ERROR_ON(arm_compute::is_fixed_format(assembly_utils::map_to_arm_compute_weight_format(_gemm_kernel_asm->get_config().weight_format))); - const int ldb = b->info()->strides_in_bytes().y() / b->info()->element_size(); - const auto in1_ptr = reinterpret_cast<const TypeInput *>(b->buffer() + b->info()->offset_first_element_in_bytes()); - const int multi_stride_b = b->info()->strides_in_bytes().z() / b->info()->element_size(); + ARM_COMPUTE_ERROR_ON(arm_compute::is_fixed_format( + assembly_utils::map_to_arm_compute_weight_format(_gemm_kernel_asm->get_config().weight_format))); + const int ldb = b->info()->strides_in_bytes().y() / b->info()->element_size(); + const auto in1_ptr = + reinterpret_cast<const TypeInput *>(b->buffer() + b->info()->offset_first_element_in_bytes()); + const int multi_stride_b = b->info()->strides_in_bytes().z() / b->info()->element_size(); CpuAuxTensorHandler pretranspose(offset_int_vec(Pretranspose), _pretranspose_info, tensors, false); ARM_COMPUTE_ERROR_ON(pretranspose.get()->buffer() == nullptr); - run_parallel_pretranspose_B_array<TypeInput, TypeOutput>(_gemm_kernel_asm.get(), pretranspose.get(), in1_ptr, ldb, multi_stride_b, NEScheduler::get().num_threads()); + run_parallel_pretranspose_B_array<TypeInput, TypeOutput>(_gemm_kernel_asm.get(), pretranspose.get(), + in1_ptr, ldb, multi_stride_b, + NEScheduler::get().num_threads()); b->mark_as_unused(); } - if(_gemm_info.method == AsmConvMethod::Indirect) + if (_gemm_info.method == AsmConvMethod::Indirect) { prepare_indirect_buffer(tensors); } @@ -526,12 +572,12 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run(ITensorPack &tensors) int multi_stride_b = 0; const int multi_stride_d = d->info()->strides_in_bytes()[d_multi_idx] / d->info()->element_size(); - auto in0_ptr = reinterpret_cast<const TypeInput *>(a->buffer() + a->info()->offset_first_element_in_bytes()); + auto in0_ptr = reinterpret_cast<const TypeInput *>(a->buffer() + a->info()->offset_first_element_in_bytes()); const TypeInput *in1_ptr = nullptr; auto out_ptr = reinterpret_cast<TypeOutput *>(d->buffer() + d->info()->offset_first_element_in_bytes()); // Check if B is pre-tranposed and de-reference if not - if(!_gemm_kernel_asm->B_is_pretransposed()) + if (!_gemm_kernel_asm->B_is_pretransposed()) { ldb = b->info()->strides_in_bytes().y() / b->info()->element_size(); multi_stride_b = b->info()->strides_in_bytes().z() / b->info()->element_size(); @@ -539,30 +585,34 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run(ITensorPack &tensors) } // If necessary, run pretranspose every time if either weights or biases are non-constant - if((b && !_is_b_constant) || (c && !_is_c_constant && c->info()->data_type() == DataType::S32)) + if ((b && !_is_b_constant) || (c && !_is_c_constant && c->info()->data_type() == DataType::S32)) { - if(c && c->info()->data_type() == DataType::S32) + if (c && c->info()->data_type() == DataType::S32) { - 
_gemm_kernel_asm->set_quantized_bias(reinterpret_cast<const int32_t *>(c->buffer() + c->info()->offset_first_element_in_bytes()), 0); + _gemm_kernel_asm->set_quantized_bias( + reinterpret_cast<const int32_t *>(c->buffer() + c->info()->offset_first_element_in_bytes()), 0); } // Pretranspose B if required - if(_B_pretranspose_required) + if (_B_pretranspose_required) { - const int ldb = b->info()->strides_in_bytes().y() / b->info()->element_size(); - const auto b_ptr = reinterpret_cast<const TypeInput *>(b->buffer() + b->info()->offset_first_element_in_bytes()); - const int multi_stride_b = b->info()->strides_in_bytes().z() / b->info()->element_size(); + const int ldb = b->info()->strides_in_bytes().y() / b->info()->element_size(); + const auto b_ptr = + reinterpret_cast<const TypeInput *>(b->buffer() + b->info()->offset_first_element_in_bytes()); + const int multi_stride_b = b->info()->strides_in_bytes().z() / b->info()->element_size(); CpuAuxTensorHandler pretranspose(offset_int_vec(Pretranspose), _pretranspose_info, tensors, true); ARM_COMPUTE_ERROR_ON(pretranspose.get()->buffer() == nullptr); - if(_is_b_constant) + if (_is_b_constant) { _gemm_kernel_asm->requantize_bias(pretranspose.get()->buffer(), b_ptr, ldb, multi_stride_b); } else { - run_parallel_pretranspose_B_array<TypeInput, TypeOutput>(_gemm_kernel_asm.get(), pretranspose.get(), b_ptr, ldb, multi_stride_b, NEScheduler::get().num_threads()); + run_parallel_pretranspose_B_array<TypeInput, TypeOutput>(_gemm_kernel_asm.get(), pretranspose.get(), + b_ptr, ldb, multi_stride_b, + NEScheduler::get().num_threads()); } } } @@ -571,17 +621,17 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run(ITensorPack &tensors) // Set workspace if needed and reset number of threads as buffer manager gets re-created with max_threads CpuAuxTensorHandler workspace(offset_int_vec(AsmGemmWorkspace), _workspace_info, tensors, false); - if(workspace.get()->buffer() != nullptr) + if (workspace.get()->buffer() != nullptr) { _gemm_kernel_asm->set_working_space(reinterpret_cast<void *>(workspace.get()->buffer())); const unsigned int split_dim = scheduling_hint.split_dimension(); const unsigned int window_size = _gemm_kernel_asm->get_window_size().total_size(); unsigned int num_threads = NEScheduler::get().num_threads(); - if(window_size < num_threads) + if (window_size < num_threads) { num_threads = window_size; } - if(split_dim != IScheduler::split_dimensions_all) + if (split_dim != IScheduler::split_dimensions_all) { // Make sure the kernel does not expect more threads than we can actually spawn const unsigned int num_iterations = _optimised_kernel.get()->window().num_iterations(split_dim); @@ -595,12 +645,12 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run(ITensorPack &tensors) // Setup up matrix bias in the assembly kernel, it's just a pointer to matrix C. 
TypeOutput *bias = nullptr; - if(c && c->info()->data_type() != DataType::S32) + if (c && c->info()->data_type() != DataType::S32) { bias = reinterpret_cast<TypeOutput *>(c->buffer() + c->info()->offset_first_element_in_bytes()); } - if(_gemm_info.method == AsmConvMethod::Indirect) + if (_gemm_info.method == AsmConvMethod::Indirect) { in0_ptr = nullptr; lda = 0; @@ -609,18 +659,20 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run(ITensorPack &tensors) } // Set gemm parameters - _gemm_kernel_asm->set_arrays(in0_ptr, lda, batch_stride_a, multi_stride_a, - in1_ptr, ldb, multi_stride_b, - out_ptr, ldd, batch_stride_d, multi_stride_d, - bias, 0); + _gemm_kernel_asm->set_arrays(in0_ptr, lda, batch_stride_a, multi_stride_a, in1_ptr, ldb, multi_stride_b, out_ptr, + ldd, batch_stride_d, multi_stride_d, bias, 0); // Schedule NEScheduler::get().schedule(_optimised_kernel.get(), scheduling_hint); } template <typename TypeInput, typename TypeOutput> void create_arm_gemm(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_gemm, - const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, - arm_gemm::Activation activation, const AsmGemmInfo &info) + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + ITensorInfo *d, + arm_gemm::Activation activation, + const AsmGemmInfo &info) { Params p = extract_parameters(a, b, d, info); const CPUInfo &ci = NEScheduler::get().cpu_info(); @@ -628,7 +680,8 @@ void create_arm_gemm(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_ge arm_gemm::GemmConfig cfg; cfg.weight_format = assembly_utils::map_to_arm_gemm_weight_format(info.weight_format); - arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads, info.fixed_format, info.fast_mode, &cfg); + arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads, + info.fixed_format, info.fast_mode, &cfg); // Create arm_gemm fallback auto fallback = std::make_unique<Fallback<TypeInput, TypeOutput>>(); @@ -638,8 +691,12 @@ void create_arm_gemm(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_ge template <typename TypeInput, typename TypeOutput> void create_arm_gemm_quant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_gemm, - const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, - arm_gemm::Activation activation, const AsmGemmInfo &info) + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + ITensorInfo *d, + arm_gemm::Activation activation, + const AsmGemmInfo &info) { ARM_COMPUTE_UNUSED(activation); Params p = extract_parameters(a, b, d, info); @@ -648,7 +705,8 @@ void create_arm_gemm_quant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> & arm_gemm::GemmConfig cfg; cfg.weight_format = assembly_utils::map_to_arm_gemm_weight_format(info.weight_format); - arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads, info.fixed_format, info.fast_mode, &cfg); + arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads, + info.fixed_format, info.fast_mode, &cfg); // Create arm_gemm fallback auto fallback = std::make_unique<Fallback<TypeInput, TypeOutput, arm_gemm::Requantize32>>(); @@ -660,22 +718,20 @@ void create_arm_gemm_quant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> & const GEMMLowpOutputStageInfo os_info = info.output_stage; arm_gemm::Requantize32 gemm_requant_info{}; - 
if(os_info.gemmlowp_shifts.size() > 1) + if (os_info.gemmlowp_shifts.size() > 1) { - const auto requantize_data = fallback->set_requantize_data(os_info.gemmlowp_shifts, os_info.gemmlowp_multipliers); - gemm_requant_info = arm_gemm::Requantize32(nullptr, 0, - a_offset, b_offset, os_info.gemmlowp_offset, - (std::get<0>(requantize_data)) ? std::get<1>(requantize_data) : nullptr, - std::get<2>(requantize_data), - std::get<3>(requantize_data), - os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound); + const auto requantize_data = + fallback->set_requantize_data(os_info.gemmlowp_shifts, os_info.gemmlowp_multipliers); + gemm_requant_info = arm_gemm::Requantize32( + nullptr, 0, a_offset, b_offset, os_info.gemmlowp_offset, + (std::get<0>(requantize_data)) ? std::get<1>(requantize_data) : nullptr, std::get<2>(requantize_data), + std::get<3>(requantize_data), os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound); } else { - gemm_requant_info = arm_gemm::Requantize32(nullptr, 0, - a_offset, b_offset, os_info.gemmlowp_offset, - -os_info.gemmlowp_shift, os_info.gemmlowp_multiplier, - os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound); + gemm_requant_info = + arm_gemm::Requantize32(nullptr, 0, a_offset, b_offset, os_info.gemmlowp_offset, -os_info.gemmlowp_shift, + os_info.gemmlowp_multiplier, os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound); } // Configure fallback @@ -684,13 +740,16 @@ void create_arm_gemm_quant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> & } } //namespace -CpuGemmAssemblyDispatch::CpuGemmAssemblyDispatch() - : _arm_gemm(nullptr) +CpuGemmAssemblyDispatch::CpuGemmAssemblyDispatch() : _arm_gemm(nullptr) { } -Status CpuGemmAssemblyDispatch::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, - const AsmGemmInfo &info) +Status CpuGemmAssemblyDispatch::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *d, + const AsmGemmInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d); ARM_COMPUTE_UNUSED(c); @@ -701,53 +760,61 @@ Status CpuGemmAssemblyDispatch::has_opt_impl(arm_compute::WeightFormat &expected arm_gemm::GemmConfig cfg; cfg.weight_format = assembly_utils::map_to_arm_gemm_weight_format(info.weight_format); arm_gemm::WeightFormat arm_gemm_expected_wf = assembly_utils::map_to_arm_gemm_weight_format(expected_weight_format); - arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, act, num_threads, info.fixed_format, info.fast_mode, &cfg); - switch(a->data_type()) + arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, act, num_threads, + info.fixed_format, info.fast_mode, &cfg); + switch (a->data_type()) { case DataType::F32: - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(arm_gemm::has_opt_gemm<float, float, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})), - "We could not find an optimized kernel for F32 input"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + !(arm_gemm::has_opt_gemm<float, float, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})), + "We could not find an optimized kernel for F32 input"); break; #ifdef __aarch64__ case DataType::U8: case DataType::QASYMM8: - if(d->data_type() == DataType::S32) + if (d->data_type() == DataType::S32) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(arm_gemm::has_opt_gemm<uint8_t, uint32_t, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})), - "We could not find an optimized 
kernel for U8/QASYMM8 input and U32 output"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + !(arm_gemm::has_opt_gemm<uint8_t, uint32_t, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})), + "We could not find an optimized kernel for U8/QASYMM8 input and U32 output"); } else { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(arm_gemm::has_opt_gemm<uint8_t, uint8_t, arm_gemm::Requantize32>(arm_gemm_expected_wf, args, {})), - "We could not find an optimized kernel for U8 input and U8 output"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + !(arm_gemm::has_opt_gemm<uint8_t, uint8_t, arm_gemm::Requantize32>(arm_gemm_expected_wf, args, {})), + "We could not find an optimized kernel for U8 input and U8 output"); } break; case DataType::S8: case DataType::QASYMM8_SIGNED: - if(d->data_type() == DataType::S32) + if (d->data_type() == DataType::S32) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(arm_gemm::has_opt_gemm<int8_t, int32_t, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})), - "We could not find an optimized kernel for S8/QASYMM8_SIGNED input and S32 output"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + !(arm_gemm::has_opt_gemm<int8_t, int32_t, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})), + "We could not find an optimized kernel for S8/QASYMM8_SIGNED input and S32 output"); } else { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(arm_gemm::has_opt_gemm<int8_t, int8_t, arm_gemm::Requantize32>(arm_gemm_expected_wf, args, {})), - "We could not find an optimized kernel for S8 input and S8 output"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + !(arm_gemm::has_opt_gemm<int8_t, int8_t, arm_gemm::Requantize32>(arm_gemm_expected_wf, args, {})), + "We could not find an optimized kernel for S8 input and S8 output"); } break; #endif /* __aarch64__ */ #if defined(ARM_COMPUTE_ENABLE_BF16) case DataType::BFLOAT16: { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(arm_gemm::has_opt_gemm<bfloat16, float, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})), - "We could not find an optimized kernel for BFLOAT16 input and F32 output"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + !(arm_gemm::has_opt_gemm<bfloat16, float, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})), + "We could not find an optimized kernel for BFLOAT16 input and F32 output"); break; } #endif /* defined(ARM_COMPUTE_ENABLE_BF16) */ #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(arm_gemm::has_opt_gemm<float16_t, float16_t, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})), - "We could not find an optimized kernel for F16 input and F16 output"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + !(arm_gemm::has_opt_gemm<float16_t, float16_t, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})), + "We could not find an optimized kernel for F16 input and F16 output"); break; #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ default: @@ -759,26 +826,30 @@ Status CpuGemmAssemblyDispatch::has_opt_impl(arm_compute::WeightFormat &expected return Status{}; } -Status CpuGemmAssemblyDispatch::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const AsmGemmInfo &info) +Status CpuGemmAssemblyDispatch::validate( + const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const AsmGemmInfo &info) { ARM_COMPUTE_UNUSED(c, info); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(a, b, d); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(a); ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(a); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(info.reshape_b_only_on_first_run), "Assembly kernel will not be executed when reshape_b_only_on_first_run 
is false"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(info.reshape_b_only_on_first_run), + "Assembly kernel will not be executed when reshape_b_only_on_first_run is false"); #ifndef __aarch64__ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->element_size() == 1, "8bit integer types only supported for aarch64"); #endif /* __aarch64__ */ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S8, - DataType::BFLOAT16, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::S8, - DataType::BFLOAT16, DataType::F16, DataType::F32); - if(is_data_type_quantized_per_channel(b->data_type())) + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::U8, DataType::QASYMM8, + DataType::QASYMM8_SIGNED, DataType::S8, DataType::BFLOAT16, + DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + b, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::S8, + DataType::BFLOAT16, DataType::F16, DataType::F32); + if (is_data_type_quantized_per_channel(b->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8_SIGNED, DataType::S8); } - else if(is_fixed_format_fast_math(info.weight_format)) + else if (is_fixed_format_fast_math(info.weight_format)) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(a, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(b, DataType::BFLOAT16); @@ -787,22 +858,29 @@ Status CpuGemmAssemblyDispatch::validate(const ITensorInfo *a, const ITensorInfo { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b); } - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F32 && d->data_type() != DataType::F32, "Only F32 output supported for F32 input"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F16 && d->data_type() != DataType::F16, "Only F16 output supported for F16 input"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::BFLOAT16 && d->data_type() != DataType::F32, "Only F32 output supported for BFLOAT16 input"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::U8 && d->data_type() != DataType::U32, "Only U32 output supported for U8 input"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::S8 && d->data_type() != DataType::S32, "Only S32 output supported for S8 input"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::QASYMM8 && (d->data_type() != DataType::QASYMM8 && d->data_type() != DataType::S32), + ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F32 && d->data_type() != DataType::F32, + "Only F32 output supported for F32 input"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F16 && d->data_type() != DataType::F16, + "Only F16 output supported for F16 input"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::BFLOAT16 && d->data_type() != DataType::F32, + "Only F32 output supported for BFLOAT16 input"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::U8 && d->data_type() != DataType::U32, + "Only U32 output supported for U8 input"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::S8 && d->data_type() != DataType::S32, + "Only S32 output supported for S8 input"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::QASYMM8 && + (d->data_type() != DataType::QASYMM8 && d->data_type() != DataType::S32), "Only 
QASYMM8/S32 output supported for QASYMM8 input"); arm_compute::WeightFormat expected_weight_format = arm_compute::WeightFormat::UNSPECIFIED; const Status ret = CpuGemmAssemblyDispatch::has_opt_impl(expected_weight_format, a, b, c, d, info); - if((bool)ret && expected_weight_format != arm_compute::WeightFormat::ANY) + if ((bool)ret && expected_weight_format != arm_compute::WeightFormat::ANY) { // Correctness check: if the format expected by the kernel is // not "any", make sure that the one found matches the format // intended by the caller. - ARM_COMPUTE_RETURN_ERROR_ON_MSG((expected_weight_format != info.weight_format), - "The format expected by the kernel does not correspond with the one requested by the user."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + (expected_weight_format != info.weight_format), + "The format expected by the kernel does not correspond with the one requested by the user."); } return ret; } @@ -813,18 +891,19 @@ bool CpuGemmAssemblyDispatch::is_activation_supported(const ActivationLayerInfo return act.type != arm_gemm::Activation::Type::None; } -void CpuGemmAssemblyDispatch::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, const AsmGemmInfo &info) +void CpuGemmAssemblyDispatch::configure( + const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, const AsmGemmInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d); arm_gemm::Activation act = assembly_utils::map_to_arm_gemm_activation(info.activation_info); //If we don't support a combination of data types, silently return: it is the caller's responsibility to check if configure() was successful via is_configured() - if(!CpuGemmAssemblyDispatch::validate(a, b, c, d, info)) + if (!CpuGemmAssemblyDispatch::validate(a, b, c, d, info)) { return; } - switch(a->data_type()) + switch (a->data_type()) { case DataType::F32: create_arm_gemm<float, float>(_arm_gemm, a, b, c, d, act, info); @@ -832,7 +911,7 @@ void CpuGemmAssemblyDispatch::configure(const ITensorInfo *a, const ITensorInfo #ifdef __aarch64__ case DataType::U8: case DataType::QASYMM8: - if(d->data_type() == DataType::S32) + if (d->data_type() == DataType::S32) { create_arm_gemm<uint8_t, uint32_t>(_arm_gemm, a, b, c, d, act, info); } @@ -843,7 +922,7 @@ void CpuGemmAssemblyDispatch::configure(const ITensorInfo *a, const ITensorInfo break; case DataType::S8: case DataType::QASYMM8_SIGNED: - if(d->data_type() == DataType::S32) + if (d->data_type() == DataType::S32) { create_arm_gemm<int8_t, int32_t>(_arm_gemm, a, b, c, d, act, info); } diff --git a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.h b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.h index ceb7a3f775..5be39a54c0 100644 --- a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.h +++ b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CPU_INTERNAL_CPU_GEMM_ASSEMBLY_DISPATCH_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/common/Macros.h" #include "src/cpu/ICpuOperator.h" @@ -42,20 +43,20 @@ enum class AsmConvMethod struct AsmGemmInfo { - AsmConvMethod method{ AsmConvMethod::Im2Col }; + AsmConvMethod method{AsmConvMethod::Im2Col}; PadStrideInfo ps_info{}; ActivationLayerInfo activation_info{}; GEMMLowpOutputStageInfo output_stage{}; - bool negated_offsets{ true }; - bool reinterpret_input_as_3d{ false }; - bool depth_output_gemm3d{ false }; - int64_t padding_top{ 0 }; - int64_t padding_left{ 0 }; - float padding_value{ 0.f }; - bool fast_mode{ false }; - bool 
fixed_format{ false }; - arm_compute::WeightFormat weight_format{ arm_compute::WeightFormat::UNSPECIFIED }; - bool reshape_b_only_on_first_run{ true }; + bool negated_offsets{true}; + bool reinterpret_input_as_3d{false}; + bool depth_output_gemm3d{false}; + int64_t padding_top{0}; + int64_t padding_left{0}; + float padding_value{0.f}; + bool fast_mode{false}; + bool fixed_format{false}; + arm_compute::WeightFormat weight_format{arm_compute::WeightFormat::UNSPECIFIED}; + bool reshape_b_only_on_first_run{true}; }; /** Assembly kernel glue */ @@ -72,12 +73,12 @@ public: class IFallback { public: - virtual void run(ITensorPack &tensors) = 0; - virtual void prepare(ITensorPack &tensors) = 0; - virtual experimental::MemoryRequirements workspace() const = 0; - virtual bool is_configured() const = 0; - virtual bool isVarWeightsKernel() const = 0; - virtual ~IFallback() = default; + virtual void run(ITensorPack &tensors) = 0; + virtual void prepare(ITensorPack &tensors) = 0; + virtual experimental::MemoryRequirements workspace() const = 0; + virtual bool is_configured() const = 0; + virtual bool isVarWeightsKernel() const = 0; + virtual ~IFallback() = default; }; public: @@ -121,7 +122,8 @@ public: * @param[out] d Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0. * @param[in] info GEMM meta-data */ - void configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, const AsmGemmInfo &info); + void configure( + const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, const AsmGemmInfo &info); /** Indicates whether or not this function can be used to process the given parameters. * @@ -133,7 +135,11 @@ public: * * @return a status. */ - static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const AsmGemmInfo &info); + static Status validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *d, + const AsmGemmInfo &info); /** Indicates whether or not there is an optimal assembly implementation that can be used to process the given parameters. * @@ -144,7 +150,12 @@ public: * * @return a status. 
*/ - static Status has_opt_impl(arm_compute::WeightFormat &weight_format, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const AsmGemmInfo &info); + static Status has_opt_impl(arm_compute::WeightFormat &weight_format, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *d, + const AsmGemmInfo &info); /** Checks if activation is supported by the gemm assembly dispatcher * * @param[in] activation Activation to check @@ -167,8 +178,8 @@ public: } // Inherited methods overridden: - void prepare(ITensorPack &tensors) override; - void run(ITensorPack &tensors) override; + void prepare(ITensorPack &tensors) override; + void run(ITensorPack &tensors) override; experimental::MemoryRequirements workspace() const override; private: diff --git a/src/cpu/utils/CpuAuxTensorHandler.h b/src/cpu/utils/CpuAuxTensorHandler.h index ae1cffb659..e23b88a777 100644 --- a/src/cpu/utils/CpuAuxTensorHandler.h +++ b/src/cpu/utils/CpuAuxTensorHandler.h @@ -39,25 +39,26 @@ namespace cpu class CpuAuxTensorHandler { public: - CpuAuxTensorHandler(int slot_id, TensorInfo &info, ITensorPack &pack, bool pack_inject = false, bool bypass_alloc = false) + CpuAuxTensorHandler( + int slot_id, TensorInfo &info, ITensorPack &pack, bool pack_inject = false, bool bypass_alloc = false) : _tensor() { - if(info.total_size() == 0) + if (info.total_size() == 0) { return; } _tensor.allocator()->soft_init(info); ITensor *packed_tensor = utils::cast::polymorphic_downcast<ITensor *>(pack.get_tensor(slot_id)); - if((packed_tensor == nullptr) || (info.total_size() > packed_tensor->info()->total_size())) + if ((packed_tensor == nullptr) || (info.total_size() > packed_tensor->info()->total_size())) { - if(!bypass_alloc) + if (!bypass_alloc) { _tensor.allocator()->allocate(); ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Allocating auxiliary tensor"); } - if(pack_inject) + if (pack_inject) { pack.add_tensor(slot_id, &_tensor); _injected_tensor_pack = &pack; @@ -70,22 +71,21 @@ public: } } - CpuAuxTensorHandler(TensorInfo &info, ITensor &tensor) - : _tensor() + CpuAuxTensorHandler(TensorInfo &info, ITensor &tensor) : _tensor() { _tensor.allocator()->soft_init(info); - if(info.total_size() <= tensor.info()->total_size()) + if (info.total_size() <= tensor.info()->total_size()) { _tensor.allocator()->import_memory(tensor.buffer()); } } - CpuAuxTensorHandler(const CpuAuxTensorHandler &) = delete; + CpuAuxTensorHandler(const CpuAuxTensorHandler &) = delete; CpuAuxTensorHandler &operator=(const CpuAuxTensorHandler) = delete; ~CpuAuxTensorHandler() { - if(_injected_tensor_pack) + if (_injected_tensor_pack) { _injected_tensor_pack->remove_tensor(_injected_slot_id); } @@ -103,9 +103,9 @@ public: private: Tensor _tensor{}; - ITensorPack *_injected_tensor_pack{ nullptr }; - int _injected_slot_id{ TensorType::ACL_UNKNOWN }; + ITensorPack *_injected_tensor_pack{nullptr}; + int _injected_slot_id{TensorType::ACL_UNKNOWN}; }; } // namespace cpu } // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_UTILS_CPU_AUX_TENSOR_HANDLER_H */
\ No newline at end of file +#endif /* ARM_COMPUTE_CPU_UTILS_CPU_AUX_TENSOR_HANDLER_H */