Diffstat (limited to 'src/core/cpu/kernels')
-rw-r--r-- src/core/cpu/kernels/CpuActivationKernel.cpp | 257
-rw-r--r-- src/core/cpu/kernels/CpuActivationKernel.h | 73
-rw-r--r-- src/core/cpu/kernels/CpuAddKernel.cpp | 342
-rw-r--r-- src/core/cpu/kernels/CpuAddKernel.h | 85
-rw-r--r-- src/core/cpu/kernels/CpuCastKernel.cpp | 1367
-rw-r--r-- src/core/cpu/kernels/CpuCastKernel.h | 82
-rw-r--r-- src/core/cpu/kernels/CpuConcatenateBatchKernel.cpp | 216
-rw-r--r-- src/core/cpu/kernels/CpuConcatenateBatchKernel.h | 78
-rw-r--r-- src/core/cpu/kernels/CpuConcatenateDepthKernel.cpp | 212
-rw-r--r-- src/core/cpu/kernels/CpuConcatenateDepthKernel.h | 83
-rw-r--r-- src/core/cpu/kernels/CpuConcatenateHeightKernel.cpp | 183
-rw-r--r-- src/core/cpu/kernels/CpuConcatenateHeightKernel.h | 72
-rw-r--r-- src/core/cpu/kernels/CpuConcatenateWidthKernel.cpp | 180
-rw-r--r-- src/core/cpu/kernels/CpuConcatenateWidthKernel.h | 72
-rw-r--r-- src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp | 137
-rw-r--r-- src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h | 89
-rw-r--r-- src/core/cpu/kernels/CpuCopyKernel.cpp | 166
-rw-r--r-- src/core/cpu/kernels/CpuCopyKernel.h | 69
-rw-r--r-- src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp | 919
-rw-r--r-- src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h | 109
-rw-r--r-- src/core/cpu/kernels/CpuDequantizeKernel.cpp | 400
-rw-r--r-- src/core/cpu/kernels/CpuDequantizeKernel.h | 64
-rw-r--r-- src/core/cpu/kernels/CpuDirectConv2dKernel.cpp | 1385
-rw-r--r-- src/core/cpu/kernels/CpuDirectConv2dKernel.h | 93
-rw-r--r-- src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp | 513
-rw-r--r-- src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.h | 87
-rw-r--r-- src/core/cpu/kernels/CpuElementwiseKernel.cpp | 354
-rw-r--r-- src/core/cpu/kernels/CpuElementwiseKernel.h | 239
-rw-r--r-- src/core/cpu/kernels/CpuElementwiseUnaryKernel.cpp | 174
-rw-r--r-- src/core/cpu/kernels/CpuElementwiseUnaryKernel.h | 90
-rw-r--r-- src/core/cpu/kernels/CpuFillKernel.cpp | 90
-rw-r--r-- src/core/cpu/kernels/CpuFillKernel.h | 60
-rw-r--r-- src/core/cpu/kernels/CpuFloorKernel.cpp | 175
-rw-r--r-- src/core/cpu/kernels/CpuFloorKernel.h | 72
-rw-r--r-- src/core/cpu/kernels/CpuMulKernel.cpp | 1729
-rw-r--r-- src/core/cpu/kernels/CpuMulKernel.h | 150
-rw-r--r-- src/core/cpu/kernels/CpuPermuteKernel.cpp | 301
-rw-r--r-- src/core/cpu/kernels/CpuPermuteKernel.h | 73
-rw-r--r-- src/core/cpu/kernels/CpuPool2dKernel.cpp | 514
-rw-r--r-- src/core/cpu/kernels/CpuPool2dKernel.h | 78
-rw-r--r-- src/core/cpu/kernels/CpuQuantizeKernel.cpp | 266
-rw-r--r-- src/core/cpu/kernels/CpuQuantizeKernel.h | 90
-rw-r--r-- src/core/cpu/kernels/CpuReshapeKernel.cpp | 140
-rw-r--r-- src/core/cpu/kernels/CpuReshapeKernel.h | 65
-rw-r--r-- src/core/cpu/kernels/CpuScaleKernel.cpp | 621
-rw-r--r-- src/core/cpu/kernels/CpuScaleKernel.h | 111
-rw-r--r-- src/core/cpu/kernels/CpuSoftmaxKernel.cpp | 389
-rw-r--r-- src/core/cpu/kernels/CpuSoftmaxKernel.h | 107
-rw-r--r-- src/core/cpu/kernels/CpuSubKernel.cpp | 246
-rw-r--r-- src/core/cpu/kernels/CpuSubKernel.h | 98
-rw-r--r-- src/core/cpu/kernels/CpuTransposeKernel.cpp | 510
-rw-r--r-- src/core/cpu/kernels/CpuTransposeKernel.h | 64
-rw-r--r-- src/core/cpu/kernels/activation/list.h | 49
-rw-r--r-- src/core/cpu/kernels/activation/neon/fp16.cpp | 217
-rw-r--r-- src/core/cpu/kernels/activation/neon/fp32.cpp | 212
-rw-r--r-- src/core/cpu/kernels/activation/neon/qasymm8.cpp | 262
-rw-r--r-- src/core/cpu/kernels/activation/neon/qasymm8_signed.cpp | 261
-rw-r--r-- src/core/cpu/kernels/activation/neon/qsymm16.cpp | 138
-rw-r--r-- src/core/cpu/kernels/activation/sve/fp16.cpp | 130
-rw-r--r-- src/core/cpu/kernels/activation/sve/fp32.cpp | 131
-rw-r--r-- src/core/cpu/kernels/activation/sve/qasymm8.cpp | 254
-rw-r--r-- src/core/cpu/kernels/activation/sve/qasymm8_signed.cpp | 253
-rw-r--r-- src/core/cpu/kernels/activation/sve/qsymm16.cpp | 120
-rw-r--r-- src/core/cpu/kernels/add/neon/integer.cpp | 170
-rw-r--r-- src/core/cpu/kernels/add/neon/list.h | 146
-rw-r--r-- src/core/cpu/kernels/add/neon/qasymm8.cpp | 209
-rw-r--r-- src/core/cpu/kernels/add/neon/qasymm8_signed.cpp | 208
-rw-r--r-- src/core/cpu/kernels/add/neon/qsymm16.cpp | 174
-rw-r--r-- src/core/cpu/kernels/add/sve/impl.cpp | 139
-rw-r--r-- src/core/cpu/kernels/add/sve/impl.h | 40
-rw-r--r-- src/core/cpu/kernels/add/sve/integer.cpp | 201
-rw-r--r-- src/core/cpu/kernels/add/sve/list.h | 54
-rw-r--r-- src/core/cpu/kernels/add/sve/qasymm8.cpp | 182
-rw-r--r-- src/core/cpu/kernels/add/sve/qasymm8_signed.cpp | 181
-rw-r--r-- src/core/cpu/kernels/add/sve/qsymm16.cpp | 156
-rw-r--r-- src/core/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h | 126
-rw-r--r-- src/core/cpu/kernels/assembly/arm_gemm.hpp | 190
-rw-r--r-- src/core/cpu/kernels/assembly/arm_gemm_compute_iface.hpp | 130
-rw-r--r-- src/core/cpu/kernels/assembly/arm_gemm_local.hpp | 31
-rw-r--r-- src/core/cpu/kernels/assembly/convolution_parameters.hpp | 65
-rw-r--r-- src/core/cpu/kernels/assembly/gemm_common.hpp | 229
-rw-r--r-- src/core/cpu/kernels/assembly/ndrange.hpp | 199
-rw-r--r-- src/core/cpu/kernels/elementwise/neon/elementwise_list.h | 486
-rw-r--r-- src/core/cpu/kernels/elementwise/neon/elementwise_quantized_list.h | 654
-rw-r--r-- src/core/cpu/kernels/elementwise/neon/elementwise_unary_list.h | 116
-rw-r--r-- src/core/cpu/kernels/elementwise/sve/elementwise.cpp | 311
-rw-r--r-- src/core/cpu/kernels/elementwise/sve/elementwise_list.h | 171
-rw-r--r-- src/core/cpu/kernels/elementwise/sve/elementwise_quantized_list.h | 366
-rw-r--r-- src/core/cpu/kernels/elementwise/sve/elementwise_unary.cpp | 113
-rw-r--r-- src/core/cpu/kernels/elementwise/sve/elementwise_unary_list.h | 39
-rw-r--r-- src/core/cpu/kernels/floor/list.h | 41
-rw-r--r-- src/core/cpu/kernels/floor/neon/fp16.cpp | 64
-rw-r--r-- src/core/cpu/kernels/floor/neon/fp32.cpp | 61
-rw-r--r-- src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp | 276
-rw-r--r-- src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h | 119
-rw-r--r-- src/core/cpu/kernels/pooling/neon/fp16.cpp | 317
-rw-r--r-- src/core/cpu/kernels/pooling/neon/fp32.cpp | 314
-rw-r--r-- src/core/cpu/kernels/pooling/neon/list.h | 97
-rw-r--r-- src/core/cpu/kernels/pooling/neon/nchw/all.cpp | 700
-rw-r--r-- src/core/cpu/kernels/pooling/neon/qasymm8.cpp | 41
-rw-r--r-- src/core/cpu/kernels/pooling/neon/qasymm8_signed.cpp | 41
-rw-r--r-- src/core/cpu/kernels/pooling/neon/quantized.h | 863
-rw-r--r-- src/core/cpu/kernels/scale/neon/fp16.cpp | 174
-rw-r--r-- src/core/cpu/kernels/scale/neon/integer.cpp | 293
-rw-r--r-- src/core/cpu/kernels/scale/neon/list.h | 185
-rw-r--r-- src/core/cpu/kernels/scale/neon/qasymm8.cpp | 145
-rw-r--r-- src/core/cpu/kernels/scale/neon/qasymm8_signed.cpp | 145
-rw-r--r-- src/core/cpu/kernels/scale/sve/fp16.cpp | 176
-rw-r--r-- src/core/cpu/kernels/scale/sve/fp32.cpp | 174
-rw-r--r-- src/core/cpu/kernels/scale/sve/integer.cpp | 300
-rw-r--r-- src/core/cpu/kernels/scale/sve/list.h | 47
-rw-r--r-- src/core/cpu/kernels/scale/sve/qasymm8.cpp | 207
-rw-r--r-- src/core/cpu/kernels/scale/sve/qasymm8_signed.cpp | 207
-rw-r--r-- src/core/cpu/kernels/softmax/impl/neon/list.h | 388
-rw-r--r-- src/core/cpu/kernels/softmax/impl/sve/impl.cpp | 185
-rw-r--r-- src/core/cpu/kernels/softmax/impl/sve/list.h | 223
-rw-r--r-- src/core/cpu/kernels/sub/neon/integer.cpp | 183
-rw-r--r-- src/core/cpu/kernels/sub/neon/list.h | 162
-rw-r--r-- src/core/cpu/kernels/sub/neon/qasymm8.cpp | 230
-rw-r--r-- src/core/cpu/kernels/sub/neon/qasymm8_signed.cpp | 229
-rw-r--r-- src/core/cpu/kernels/sub/neon/qsymm16.cpp | 201
121 files changed, 0 insertions, 28040 deletions
diff --git a/src/core/cpu/kernels/CpuActivationKernel.cpp b/src/core/cpu/kernels/CpuActivationKernel.cpp
deleted file mode 100644
index 8a57a3b529..0000000000
--- a/src/core/cpu/kernels/CpuActivationKernel.cpp
+++ /dev/null
@@ -1,257 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/cpu/kernels/CpuActivationKernel.h"
-
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include "src/core/common/Registrars.h"
-#include "src/core/cpu/kernels/activation/list.h"
-
-#include <array>
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-namespace
-{
-struct ActivationSelectorData
-{
- DataType dt;
-};
-
-using ActivationSelectorPtr = std::add_pointer<bool(const ActivationSelectorData &data)>::type;
-using ActivationKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, const ActivationLayerInfo &, const Window &)>::type;
-
-struct ActivationKernel
-{
- const char *name;
- const ActivationSelectorPtr is_selected;
- ActivationKernelPtr ukernel;
-};
-
-static const ActivationKernel available_kernels[] =
-{
-#if defined(ENABLE_SVE)
- {
- "fp16_sve_activation",
- [](const ActivationSelectorData & data) { return data.dt == DataType::F16; },
- REGISTER_FP16_SVE(arm_compute::cpu::fp16_sve_activation)
- },
- {
- "fp32_sve_activation",
- [](const ActivationSelectorData & data) { return data.dt == DataType::F32; },
- REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_activation)
- },
-#endif /* defined(ENABLE_SVE) */
-#if defined(ENABLE_NEON)
- {
- "fp16_neon_activation",
- [](const ActivationSelectorData & data) { return data.dt == DataType::F16; },
- REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_activation)
- },
- {
- "fp32_neon_activation",
- [](const ActivationSelectorData & data) { return data.dt == DataType::F32; },
- REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_activation)
- },
-#endif /* defined(ENABLE_NEON) */
-#if defined(__ARM_FEATURE_SVE2)
- {
- "qasymm8_sve_activation",
- [](const ActivationSelectorData & data) { return data.dt == DataType::QASYMM8; },
- REGISTER_QASYMM8_SVE(arm_compute::cpu::qasymm8_sve_activation)
- },
- {
- "qasymm8_signed_sve_activation",
- [](const ActivationSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; },
- REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::qasymm8_signed_sve_activation)
- },
- {
- "qsymm16_sve_activation",
- [](const ActivationSelectorData & data) { return data.dt == DataType::QSYMM16; },
- REGISTER_QSYMM16_SVE(arm_compute::cpu::qsymm16_sve_activation)
- },
-#else /* !defined(__ARM_FEATURE_SVE2) */
- {
- "qasymm8_neon_activation",
- [](const ActivationSelectorData & data) { return data.dt == DataType::QASYMM8; },
- REGISTER_QASYMM8_NEON(arm_compute::cpu::qasymm8_neon_activation)
- },
- {
- "qasymm8_signed_neon_activation",
- [](const ActivationSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; },
- REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::qasymm8_signed_neon_activation)
- },
- {
- "qsymm16_neon_activation",
- [](const ActivationSelectorData & data) { return data.dt == DataType::QSYMM16; },
- REGISTER_QSYMM16_NEON(arm_compute::cpu::qsymm16_neon_activation)
- },
-#endif /* defined(__ARM_FEATURE_SVE2) */
-};
-
-const ActivationKernel *get_implementation(const ActivationSelectorData &data)
-{
- for(const auto &uk : available_kernels)
- {
- if(uk.is_selected(data))
- {
- return &uk;
- }
- }
- return nullptr;
-}
-
-/* Supported activations in the 8-bit integer domain */
-static const std::array<ActivationLayerInfo::ActivationFunction, 7> qasymm8_activations =
-{
- ActivationLayerInfo::ActivationFunction::RELU,
- ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
- ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
- ActivationLayerInfo::ActivationFunction::LOGISTIC,
- ActivationLayerInfo::ActivationFunction::TANH,
- ActivationLayerInfo::ActivationFunction::HARD_SWISH,
- ActivationLayerInfo::ActivationFunction::LEAKY_RELU,
-};
-/* Supported activations in the 16-bit integer domain */
-static const std::array<ActivationLayerInfo::ActivationFunction, 3> qsymm16_activations =
-{
- ActivationLayerInfo::ActivationFunction::LOGISTIC,
- ActivationLayerInfo::ActivationFunction::TANH,
- ActivationLayerInfo::ActivationFunction::HARD_SWISH
-};
-
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &activation_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::QSYMM16, DataType::F16, DataType::F32);
-
- const auto *uk = get_implementation(ActivationSelectorData{ src->data_type() });
- ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
-
- const DataType data_type = src->data_type();
- const QuantizationInfo &oq_info = (dst != nullptr) ? dst->quantization_info() : src->quantization_info();
- const ActivationLayerInfo::ActivationFunction f_act = activation_info.activation();
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized_asymmetric(data_type) && (std::find(std::begin(qasymm8_activations), std::end(qasymm8_activations), f_act) == std::end(qasymm8_activations)),
- "For QASYMM8 only hard swish, leaky relu, tanh, logistic, relu and lower/upper bounded relu are supported");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized_symmetric(data_type) && (std::find(std::begin(qsymm16_activations), std::end(qsymm16_activations), f_act) == std::end(qsymm16_activations)),
- "For QSYMM16 only tanh and logistic are supported");
- ARM_COMPUTE_RETURN_ERROR_ON((data_type == DataType::QASYMM8 || data_type == DataType::QASYMM16) && (f_act == ActivationLayerInfo::ActivationFunction::TANH)
- && (oq_info != QuantizationInfo(1.f / 128.f, 128)));
- ARM_COMPUTE_RETURN_ERROR_ON((data_type == DataType::QASYMM8 || data_type == DataType::QASYMM16) && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
- && (oq_info != QuantizationInfo(1.f / 256.f, 0)));
-
- ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && (f_act == ActivationLayerInfo::ActivationFunction::TANH) && (oq_info != QuantizationInfo(1.f / 128.f, 0)));
- ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && (oq_info != QuantizationInfo(1.f / 256.f, -128)));
-
- ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && (f_act == ActivationLayerInfo::ActivationFunction::TANH) && (oq_info != QuantizationInfo(1.f / 32768.f, 0)));
- ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && (oq_info != QuantizationInfo(1.f / 32768.f, 0)));
-
- // Checks performed when dst is configured
- if((dst != nullptr) && (dst->total_size() != 0))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src, ITensorInfo *dst)
-{
- // Configure kernel window
- Window win = calculate_max_window(*src, Steps());
-
- if(dst != nullptr)
- {
- // dst auto initialization if not yet initialized
- auto_init_if_empty(*dst, *src->clone());
- }
-
- return std::make_pair(Status{}, win);
-}
-} // namespace
-
-void CpuActivationKernel::configure(const ITensorInfo *src, ITensorInfo *dst, ActivationLayerInfo activation_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src);
-
- _act_info = activation_info;
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, activation_info));
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(src, dst);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICPPKernel::configure(win_config.second);
-}
-
-Status CpuActivationKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_UNUSED(act_info);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, act_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), (dst != nullptr) ? dst->clone().get() : nullptr).first);
-
- return Status{};
-}
-
-void CpuActivationKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
- // Early exit on disabled activation
- if(!_act_info.enabled())
- {
- return;
- }
-
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- ARM_COMPUTE_ERROR_ON(tensors.empty());
-
- const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC);
- ITensor *dst = tensors.get_tensor(TensorType::ACL_DST);
-
- const auto *uk = get_implementation(ActivationSelectorData{ src->info()->data_type() });
-
- uk->ukernel(src, dst, _act_info, window);
-}
-
-const char *CpuActivationKernel::name() const
-{
- return "CpuActivationKernel";
-}
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
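
The file above is built around a small runtime-dispatch table: available_kernels is an ordered array of { name, selector predicate, function pointer } entries, and get_implementation() returns the first entry whose predicate accepts the data type, so SVE entries listed ahead of NEON ones take priority when both are compiled in. Below is a minimal, self-contained sketch of that selector pattern; all names (Kernel, fp32_relu, get_impl) are generic stand-ins, not the Compute Library API.

    #include <cstdio>

    enum class DataType { F32, F16 };
    struct SelectorData { DataType dt; };

    using SelectorPtr = bool (*)(const SelectorData &);
    using KernelPtr   = void (*)(const float *, float *, int);

    struct Kernel
    {
        const char *name;
        SelectorPtr is_selected;
        KernelPtr   ukernel;
    };

    // Stand-in micro-kernel: scalar ReLU over n elements.
    void fp32_relu(const float *src, float *dst, int n)
    {
        for(int i = 0; i < n; ++i) { dst[i] = src[i] > 0.f ? src[i] : 0.f; }
    }

    // Declaration order encodes priority, exactly as in available_kernels above.
    static const Kernel available[] =
    {
        { "fp32_relu", [](const SelectorData &d) { return d.dt == DataType::F32; }, fp32_relu },
    };

    const Kernel *get_impl(const SelectorData &data)
    {
        for(const auto &uk : available)
        {
            if(uk.is_selected(data)) { return &uk; } // first match wins
        }
        return nullptr; // no kernel for this configuration; validate() turns this into an error
    }

    int main()
    {
        float in[4] = { -1.f, 2.f, -3.f, 4.f }, out[4];
        if(const Kernel *uk = get_impl({ DataType::F32 }))
        {
            uk->ukernel(in, out, 4);
            std::printf("%s: %g %g %g %g\n", uk->name, out[0], out[1], out[2], out[3]);
        }
        return 0;
    }
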
diff --git a/src/core/cpu/kernels/CpuActivationKernel.h b/src/core/cpu/kernels/CpuActivationKernel.h
deleted file mode 100644
index de71014303..0000000000
--- a/src/core/cpu/kernels/CpuActivationKernel.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_ACTIVATION_KERNEL_H
-#define ARM_COMPUTE_CPU_ACTIVATION_KERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/cpu/ICpuKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-/** Interface for the activation kernel */
-class CpuActivationKernel : public ICpuKernel
-{
-public:
- CpuActivationKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuActivationKernel);
- /** Configure kernel for a given list of arguments
- *
- * @note If the output tensor is a nullptr, the activation function will be performed in-place
- *
- * @param[in, out] src Source tensor info. In case of @p dst tensor = nullptr, this tensor will store the result
- * of the activation function. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32.
- * @param[out] dst Destination tensor info. Data type supported: same as @p src
- * @param[in] activation_info Activation layer information.
- */
- void configure(const ITensorInfo *src, ITensorInfo *dst, ActivationLayerInfo activation_info);
- /** Static function to check if given info will lead to a valid configuration of @ref CpuActivationKernel
- *
- * @param[in] src Source tensor info. In case of @p dst tensor info = nullptr, this tensor will store the result
- * of the activation function. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32.
- * @param[in] dst Destination tensor info. Data type supported: same as @p src
- * @param[in] act_info Activation layer information.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
- const char *name() const override;
-
-private:
- ActivationLayerInfo _act_info{};
-};
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_ACTIVATION_KERNEL_H */
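
The header shows the stateless-operator convention used throughout these kernels: configure() and validate() consume only ITensorInfo metadata, while the actual tensors are bound at run time through an ITensorPack. A hedged usage sketch of that flow follows, single-threaded and with error handling elided; run_relu is a hypothetical caller and assumes an already-allocated src/dst Tensor pair plus the usual arm_compute headers.

    #include "arm_compute/core/ITensorPack.h"
    #include "arm_compute/runtime/Tensor.h"
    #include "src/core/cpu/kernels/CpuActivationKernel.h"

    void run_relu(arm_compute::Tensor &src, arm_compute::Tensor &dst) // hypothetical helper
    {
        using namespace arm_compute;
        const ActivationLayerInfo act(ActivationLayerInfo::ActivationFunction::RELU);

        // 1. Validate on metadata only; nothing needs to be allocated for this step.
        ARM_COMPUTE_ERROR_THROW_ON(cpu::kernels::CpuActivationKernel::validate(src.info(), dst.info(), act));

        // 2. Configure once: the kernel stores the window and act_info, never the tensors.
        cpu::kernels::CpuActivationKernel kernel;
        kernel.configure(src.info(), dst.info(), act);

        // 3. Bind the real tensors through a pack at run time and execute over the full window.
        ITensorPack pack;
        pack.add_const_tensor(TensorType::ACL_SRC, &src);
        pack.add_tensor(TensorType::ACL_DST, &dst);
        kernel.run_op(pack, kernel.window(), ThreadInfo{});
    }
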
diff --git a/src/core/cpu/kernels/CpuAddKernel.cpp b/src/core/cpu/kernels/CpuAddKernel.cpp
deleted file mode 100644
index 7afdceae38..0000000000
--- a/src/core/cpu/kernels/CpuAddKernel.cpp
+++ /dev/null
@@ -1,342 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/cpu/kernels/CpuAddKernel.h"
-
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/common/Registrars.h"
-#include "src/core/cpu/kernels/add/neon/list.h"
-#include "src/core/cpu/kernels/add/sve/list.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <array>
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-namespace
-{
-struct AddSelectorData
-{
- DataType dt1;
- DataType dt2;
- DataType dt3;
-};
-
-using AddSelectorPtr = std::add_pointer<bool(const AddSelectorData &data)>::type;
-using AddKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, ITensor *, const ConvertPolicy &, const Window &)>::type;
-struct AddKernel
-{
- const char *name;
- const AddSelectorPtr is_selected;
- AddKernelPtr ukernel;
-};
-
-static const AddKernel available_kernels[] =
-{
-#if defined(ENABLE_SVE)
- {
- "add_same_sve",
- [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::F32)); },
- REGISTER_FP32_SVE(arm_compute::cpu::add_same_sve<float>)
- },
- {
- "add_same_sve",
- [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::F16)); },
- REGISTER_FP16_SVE(arm_compute::cpu::add_same_sve<float16_t>)
- },
- {
- "add_same_sve",
- [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == data.dt3) && (data.dt1 == DataType::U8)); },
- REGISTER_INTEGER_SVE(arm_compute::cpu::add_same_sve<uint8_t>)
- },
- {
- "add_same_sve",
- [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == data.dt3) && (data.dt1 == DataType::S16)); },
- REGISTER_INTEGER_SVE(arm_compute::cpu::add_same_sve<int16_t>)
- },
- {
- "add_same_sve",
- [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == data.dt3) && (data.dt1 == DataType::S32)); },
- REGISTER_INTEGER_SVE(arm_compute::cpu::add_same_sve<int32_t>)
- },
- {
- "add_u8_s16_s16_sve",
- [](const AddSelectorData & data) { return ((data.dt1 == DataType::U8) && (data.dt2 == DataType::S16)); },
- REGISTER_INTEGER_SVE(arm_compute::cpu::add_u8_s16_s16_sve)
- },
- {
- "add_s16_u8_s16_sve",
- [](const AddSelectorData & data) { return ((data.dt1 == DataType::S16) && (data.dt2 == DataType::U8)); },
- REGISTER_INTEGER_SVE(arm_compute::cpu::add_s16_u8_s16_sve)
- },
- {
- "add_u8_u8_s16_sve",
- [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt3 == DataType::S16)); },
- REGISTER_INTEGER_SVE(arm_compute::cpu::add_u8_u8_s16_sve)
- },
-#endif /* defined(ENABLE_SVE) */
-#if defined(ENABLE_NEON)
- {
- "add_same_neon",
- [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::F32)); },
- REGISTER_FP32_NEON(arm_compute::cpu::add_same_neon<float>)
- },
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
- {
- "add_same_neon",
- [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::F16)); },
- REGISTER_FP16_NEON(arm_compute::cpu::add_same_neon<float16_t>)
- },
-#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
- {
- "add_same_neon",
- [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == data.dt3) && (data.dt1 == DataType::U8)); },
- REGISTER_INTEGER_NEON(arm_compute::cpu::add_same_neon<uint8_t>)
- },
- {
- "add_same_neon",
- [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == data.dt3) && (data.dt1 == DataType::S16)); },
- REGISTER_INTEGER_NEON(arm_compute::cpu::add_same_neon<int16_t>)
- },
- {
- "add_same_neon",
- [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == data.dt3) && (data.dt1 == DataType::S32)); },
- REGISTER_INTEGER_NEON(arm_compute::cpu::add_same_neon<int32_t>)
- },
- {
- "add_u8_s16_s16_neon",
- [](const AddSelectorData & data) { return ((data.dt1 == DataType::U8) && (data.dt2 == DataType::S16)); },
- REGISTER_INTEGER_NEON(arm_compute::cpu::add_u8_s16_s16_neon)
- },
- {
- "add_s16_u8_s16_neon",
- [](const AddSelectorData & data) { return ((data.dt1 == DataType::S16) && (data.dt2 == DataType::U8)); },
- REGISTER_INTEGER_NEON(arm_compute::cpu::add_s16_u8_s16_neon)
- },
- {
- "add_u8_u8_s16_neon",
- [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt3 == DataType::S16)); },
- REGISTER_INTEGER_NEON(arm_compute::cpu::add_u8_u8_s16_neon)
- },
-#endif /* defined(ENABLE_NEON) */
-#if defined(__ARM_FEATURE_SVE2)
- {
- "add_qasymm8_sve",
- [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QASYMM8)); },
- REGISTER_QASYMM8_SVE(arm_compute::cpu::add_qasymm8_sve)
- },
- {
- "add_qasymm8_signed_sve",
- [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QASYMM8_SIGNED)); },
- REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::add_qasymm8_signed_sve)
- },
- {
- "add_qsymm16_sve",
- [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QSYMM16)); },
- REGISTER_QSYMM16_SVE(arm_compute::cpu::add_qsymm16_sve)
- },
-#else /* !defined(__ARM_FEATURE_SVE2) */
- {
- "add_qasymm8_neon",
- [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QASYMM8)); },
- REGISTER_QASYMM8_NEON(arm_compute::cpu::add_qasymm8_neon)
- },
- {
- "add_qasymm8_signed_neon",
- [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QASYMM8_SIGNED)); },
- REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::add_qasymm8_signed_neon)
- },
- {
- "add_qsymm16_neon",
- [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QSYMM16)); },
- REGISTER_QSYMM16_NEON(arm_compute::cpu::add_qsymm16_neon)
- },
-#endif /* defined(__ARM_FEATURE_SVE2) */
-
-};
-
-/** Micro-kernel selector
- *
- * @param[in] dt1 Data type of the first source tensor
- * @param[in] dt2 Data type of the second source tensor
- * @param[in] dt3 Data type of the destination tensor
- *
- * @return A matching micro-kernel else nullptr
- */
-const AddKernel *get_implementation(DataType dt1, DataType dt2, DataType dt3)
-{
- for(const auto &uk : available_kernels)
- {
- if(uk.is_selected({ dt1, dt2, dt3 }))
- {
- return &uk;
- }
- }
- return nullptr;
-}
-
-Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst, ConvertPolicy policy)
-{
- ARM_COMPUTE_UNUSED(policy);
-
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src0);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
- DataType::S16, DataType::QSYMM16, DataType::F16,
- DataType::S32, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
- DataType::S16, DataType::QSYMM16, DataType::F16,
- DataType::S32, DataType::F32);
-
- const TensorShape out_shape = TensorShape::broadcast_shape(src0.tensor_shape(), src1.tensor_shape());
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((src0.tensor_shape().x() != src1.tensor_shape().x()) && ((src0.data_type() != src1.data_type()) || (src0.data_type() != dst.data_type())
- || (src1.data_type() != dst.data_type())),
- "Broadcasting across width is supported on configurations where all tensors have the same data type");
-
- // Validate in case of configured dst
- if(dst.total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(
- !(src0.data_type() == DataType::U8 && src1.data_type() == DataType::U8 && dst.data_type() == DataType::U8)
- && !(src0.data_type() == DataType::U8 && src1.data_type() == DataType::U8 && dst.data_type() == DataType::S16)
- && !(src0.data_type() == DataType::U8 && src1.data_type() == DataType::S16 && dst.data_type() == DataType::S16)
- && !(src0.data_type() == DataType::S16 && src1.data_type() == DataType::U8 && dst.data_type() == DataType::S16)
- && !(src0.data_type() == DataType::S16 && src1.data_type() == DataType::S16 && dst.data_type() == DataType::S16)
- && !(src0.data_type() == DataType::S32 && src1.data_type() == DataType::S32 && dst.data_type() == DataType::S32)
- && !(src0.data_type() == DataType::F32 && src1.data_type() == DataType::F32 && dst.data_type() == DataType::F32)
- && !(src0.data_type() == DataType::F16 && src1.data_type() == DataType::F16 && dst.data_type() == DataType::F16)
- && !(src0.data_type() == DataType::QASYMM8 && src1.data_type() == DataType::QASYMM8 && dst.data_type() == DataType::QASYMM8)
- && !(src0.data_type() == DataType::QASYMM8_SIGNED && src1.data_type() == DataType::QASYMM8_SIGNED && dst.data_type() == DataType::QASYMM8_SIGNED)
- && !(src0.data_type() == DataType::QSYMM16 && src1.data_type() == DataType::QSYMM16 && dst.data_type() == DataType::QSYMM16),
- "You called addition with the wrong image formats");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0),
- "Wrong shape for dst");
- }
-
- const auto *uk = get_implementation(src0.data_type(), src1.data_type(), dst.data_type());
- ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(const ITensorInfo &src0, const ITensorInfo &src1, ITensorInfo &dst)
-{
- const TensorShape &out_shape = TensorShape::broadcast_shape(src0.tensor_shape(), src1.tensor_shape());
-
- // Auto initialize dst if not initialized
- {
- set_shape_if_empty(dst, out_shape);
-
- if(src0.data_type() == DataType::S16 || src1.data_type() == DataType::S16)
- {
- set_format_if_unknown(dst, Format::S16);
- }
- else if(src0.data_type() == DataType::S32 || src1.data_type() == DataType::S32)
- {
- set_format_if_unknown(dst, Format::S32);
- }
- else if(src0.data_type() == DataType::F16 || src1.data_type() == DataType::F16)
- {
- set_format_if_unknown(dst, Format::F16);
- }
- else if(src0.data_type() == DataType::F32 || src1.data_type() == DataType::F32)
- {
- set_format_if_unknown(dst, Format::F32);
- }
- else if(src0.data_type() == DataType::QASYMM8 || src1.data_type() == DataType::QASYMM8)
- {
- set_data_type_if_unknown(dst, DataType::QASYMM8);
- }
- else if(src0.data_type() == DataType::QASYMM8_SIGNED || src1.data_type() == DataType::QASYMM8_SIGNED)
- {
- set_data_type_if_unknown(dst, DataType::QASYMM8_SIGNED);
- }
- else if(src0.data_type() == DataType::QSYMM16 || src1.data_type() == DataType::QSYMM16)
- {
- set_data_type_if_unknown(dst, DataType::QSYMM16);
- }
- }
-
- Window win = calculate_max_window(out_shape, Steps());
-
- // CpuAddKernel doesn't need padding so update_window_and_padding() can be skipped
- return std::make_pair(Status{}, win);
-}
-} // namespace
-
-void CpuAddKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst, policy));
-
- _policy = policy;
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(*src0, *src1, *dst);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICpuKernel::configure(win_config.second);
-}
-
-Status CpuAddKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
-
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*src0, *src1, *dst, policy));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(*src0->clone(), *src1->clone(), *dst->clone()).first);
-
- return Status{};
-}
-
-void CpuAddKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
-
- ARM_COMPUTE_ERROR_ON(tensors.empty());
-
- const ITensor *src0 = tensors.get_const_tensor(TensorType::ACL_SRC_0);
- const ITensor *src1 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
- ITensor *dst = tensors.get_tensor(TensorType::ACL_DST);
-
- const auto *uk = get_implementation(src0->info()->data_type(), src1->info()->data_type(), dst->info()->data_type());
- ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
-
- uk->ukernel(src0, src1, dst, _policy, window);
-}
-
-const char *CpuAddKernel::name() const
-{
- return "CpuAddKernel";
-}
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
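
The micro-kernels this file dispatches to (add_same_neon<T> and the SVE variants under add/) share one loop shape, also visible in CpuCastKernel below: a vector body that advances window_step_x elements per iteration, followed by a scalar tail for the left-over elements so the kernel needs no tensor padding. A minimal sketch of that idiom for the same-type F32 add, using plain arm_neon intrinsics rather than the Compute Library iterators; add_f32 and n are illustrative names.

    #include <arm_neon.h>

    // Hedged sketch of the vector-body + scalar-tail pattern; n plays the role
    // of the window extent and 4 the role of window_step_x for 32-bit floats.
    void add_f32(const float *src0, const float *src1, float *dst, int n)
    {
        int x = 0;
        for(; x <= n - 4; x += 4) // vector body: 4 lanes per iteration
        {
            vst1q_f32(dst + x, vaddq_f32(vld1q_f32(src0 + x), vld1q_f32(src1 + x)));
        }
        for(; x < n; ++x) // scalar tail: left-over elements, mirroring the kernels above
        {
            dst[x] = src0[x] + src1[x];
        }
    }
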
diff --git a/src/core/cpu/kernels/CpuAddKernel.h b/src/core/cpu/kernels/CpuAddKernel.h
deleted file mode 100644
index a36ec7ad65..0000000000
--- a/src/core/cpu/kernels/CpuAddKernel.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPUADDKERNEL_H
-#define ARM_COMPUTE_CPUADDKERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/cpu/ICpuKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-/** Interface for the kernel to perform addition between two tensors */
-class CpuAddKernel : public ICpuKernel
-{
-public:
- CpuAddKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuAddKernel);
- /** Initialise the kernel's src0, src1 and dst.
- *
- * Valid configurations (src0,src1) -> dst :
- *
- * - (U8,U8) -> U8
- * - (U8,U8) -> S16
- * - (S16,U8) -> S16
- * - (U8,S16) -> S16
- * - (S16,S16) -> S16
- * - (S32,S32) -> S32
- * - (F16,F16) -> F16
- * - (F32,F32) -> F32
- * - (QASYMM8,QASYMM8) -> QASYMM8
- * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
- * - (QSYMM16,QSYMM16) -> QSYMM16
- *
- * @param[in] src0 First input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
- * @param[in] src1 Second input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
- * @param[out] dst The dst tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32.
- * @param[in] policy Overflow policy.
- */
- void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy);
- /** Static function to check if given info will lead to a valid configuration of @ref CpuAddKernel
- *
- * @param[in] src0 First input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
- * @param[in] src1 Second input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
- * @param[in] dst The dst tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32.
- * @param[in] policy Overflow policy.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
- const char *name() const override;
-
-private:
- ConvertPolicy _policy{};
-};
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPUADDKERNEL_H */
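
Because the (src0,src1) -> dst table above is asymmetric (for example (U8,S16) -> S16 is valid while (S16,S16) -> U8 is not), validate() is the cheap way to probe a combination before allocating anything. A hedged sketch, assuming the standard TensorInfo constructor; the helper name and shapes are illustrative.

    #include "arm_compute/core/TensorInfo.h"
    #include "src/core/cpu/kernels/CpuAddKernel.h"

    bool u8_s16_addition_is_supported()
    {
        using namespace arm_compute;
        // Metadata-only probe: validate() needs no allocated tensor memory.
        const TensorInfo u8_info(TensorShape(27U, 13U), 1, DataType::U8);
        const TensorInfo s16_info(TensorShape(27U, 13U), 1, DataType::S16);

        // (U8,S16) -> S16 is in the supported list above, so this should pass...
        const Status ok = cpu::kernels::CpuAddKernel::validate(&u8_info, &s16_info, &s16_info, ConvertPolicy::SATURATE);
        // ...while (S16,S16) -> U8 is not listed and comes back as an error Status
        // instead of asserting, which suits dry-run configuration checks.
        const Status bad = cpu::kernels::CpuAddKernel::validate(&s16_info, &s16_info, &u8_info, ConvertPolicy::SATURATE);

        return (ok.error_code() == ErrorCode::OK) && (bad.error_code() != ErrorCode::OK);
    }
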
diff --git a/src/core/cpu/kernels/CpuCastKernel.cpp b/src/core/cpu/kernels/CpuCastKernel.cpp
deleted file mode 100644
index 46f3c330ef..0000000000
--- a/src/core/cpu/kernels/CpuCastKernel.cpp
+++ /dev/null
@@ -1,1367 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/cpu/kernels/CpuCastKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/NEFixedPoint.h"
-#include "src/core/NEON/NEMath.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/SaturateCast.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(dst);
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(src);
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(dst);
- ARM_COMPUTE_UNUSED(policy);
- ARM_COMPUTE_RETURN_ERROR_ON(src == dst);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8,
- DataType::S16, DataType::U16, DataType::BFLOAT16, DataType::F16,
- DataType::F32, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8,
- DataType::S16, DataType::U16, DataType::BFLOAT16, DataType::F16,
- DataType::U32, DataType::S32, DataType::F32);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::QASYMM8_SIGNED && (dst->data_type() != DataType::S16 && dst->data_type() != DataType::S32
- && dst->data_type() != DataType::F16 && dst->data_type() != DataType::F32),
- "Only data_types supported [in] QASYMM8 -> [out] U16, S16, S32, F16, F32");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::QASYMM8 && (dst->data_type() != DataType::S16 && dst->data_type() != DataType::U16
- && dst->data_type() != DataType::S32 && dst->data_type() != DataType::F16 && dst->data_type() != DataType::F32),
- "Only data_types supported [in] QASYMM8 -> [out] U16, S16, S32, F16, F32");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::U8 && (dst->data_type() != DataType::S16 && dst->data_type() != DataType::U16
- && dst->data_type() != DataType::S32 && dst->data_type() != DataType::F16 && dst->data_type() != DataType::F32),
- "Only data_types supported [in] U8 -> [out] U16, S16, S32, F16, F32");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::U16 && (dst->data_type() != DataType::U8 && dst->data_type() != DataType::U32),
- "Only data_types supported [in] U16 -> [out] U8, U32");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S16 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::U8 && dst->data_type() != DataType::S32),
- "Only data_types supported [in] S16 -> [out] U8, S32");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::BFLOAT16 && dst->data_type() != DataType::F32,
- "Only data_types supported [in] BFLOAT16 -> [out] F32");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::F16 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::QASYMM8
- && dst->data_type() != DataType::U8
- && dst->data_type() != DataType::F32 && dst->data_type() != DataType::S32),
- "Only data_types supported [in] F16 -> [out] QASYMM8, F32, S32, U8");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::F32 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::QASYMM8
- && dst->data_type() != DataType::F16 && dst->data_type() != DataType::BFLOAT16
- && dst->data_type() != DataType::S32 && dst->data_type() != DataType::U8),
- "Only data_types supported [in] F32 -> [out] QASYMM8, BFLOAT16, F16, S32, U8");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S32 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::QASYMM8
- && dst->data_type() != DataType::F16
- && dst->data_type() != DataType::F32 && dst->data_type() != DataType::U8),
- "Only data_types supported [in] S32 -> [out] QASYMM8, F16, F32, U8");
-
- // Validate in case of configured dst
- if(dst->total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
- }
-
- return Status{};
-}
-} // namespace
-
-void CpuCastKernel::configure(const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
-
- // Auto initialize dst shape if not initialized (We can only auto-configure the shape, datatype must be given)
- set_shape_if_empty(*dst, src->tensor_shape());
-
- _policy = policy;
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, policy));
-
- // Configure kernel window
- Window win = calculate_max_window(*src, Steps());
-
- ICPPKernel::configure(win);
-}
-
-Status CpuCastKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, policy));
- return Status{};
-}
-
-void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const int window_step_x = 16;
-
- const ITensor *_src = tensors.get_const_tensor(TensorType::ACL_SRC);
- ITensor *_dst = tensors.get_tensor(TensorType::ACL_DST);
- ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
- ARM_COMPUTE_ERROR_ON(_src == _dst);
-
- Window win{ window };
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator src(_src, win);
- Iterator dst(_dst, win);
-
- switch(_src->info()->data_type())
- {
- case DataType::QASYMM8_SIGNED:
- {
- switch(_dst->info()->data_type())
- {
- case DataType::S16:
- {
- /* Up-conversion QASYMM8_SIGNED -> S16 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<int16_t *>(dst.ptr());
- int x = window_start_x;
-
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);
-
- const int16x8x2_t texels =
- {
- {
- vmovl_s8(vget_low_s8(texels_s8)),
- vmovl_s8(vget_high_s8(texels_s8))
- }
- };
-
- vst1q_s16(dst_ptr + x, texels.val[0]);
- vst1q_s16(dst_ptr + x + 8, texels.val[1]);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<int16_t>(*(src_ptr + x));
- }
- },
- src, dst);
- break;
- }
- case DataType::S32:
- {
- /* Up-conversion QASYMM8_SIGNED -> S32 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
- int x = window_start_x;
-
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);
-
- const int16x8x2_t texels =
- {
- {
- vmovl_s8(vget_low_s8(texels_s8)),
- vmovl_s8(vget_high_s8(texels_s8))
- }
- };
-
- vst1q_s32(dst_ptr + x, vmovl_s16(vget_low_s16(texels.val[0])));
- vst1q_s32(dst_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0])));
- vst1q_s32(dst_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1])));
- vst1q_s32(dst_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1])));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
- }
- },
- src, dst);
- break;
- }
- case DataType::F32:
- {
- /* Up-conversion QASYMM8_SIGNED -> F32 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);
-
- const int16x8x2_t texels =
- {
- {
- vmovl_s8(vget_low_s8(texels_s8)),
- vmovl_s8(vget_high_s8(texels_s8))
- }
- };
- vst1q_f32(dst_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0]))));
- vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0]))));
- vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1]))));
- vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1]))));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<float>(*(src_ptr + x));
- }
- },
- src, dst);
- break;
- }
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- {
- /* Up-conversion QASYMM8_SIGNED -> F16 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr());
- int x = window_start_x;
-
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);
-
- const int16x8x2_t texels =
- {
- {
- vmovl_s8(vget_low_s8(texels_s8)),
- vmovl_s8(vget_high_s8(texels_s8))
- }
- };
- vst1q_f16(dst_ptr + x, vcvtq_f16_s16(texels.val[0]));
- vst1q_f16(dst_ptr + x + 8, vcvtq_f16_s16(texels.val[1]));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x));
- }
- },
- src, dst);
- break;
- }
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
- default:
- ARM_COMPUTE_ERROR("dst data type not supported");
- }
- break;
- }
-
- case DataType::QASYMM8:
- case DataType::U8:
- {
- switch(_dst->info()->data_type())
- {
- case DataType::S16:
- {
- /* Up-conversion U8 -> S16 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<int16_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
-
- const int16x8x2_t texels =
- {
- {
- vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
- vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))
- }
- };
-
- vst1q_s16(dst_ptr + x, texels.val[0]);
- vst1q_s16(dst_ptr + x + 8, texels.val[1]);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<int16_t>(*(src_ptr + x));
- }
- },
- src, dst);
- break;
- }
- case DataType::S32:
- {
- /* Up-conversion U8 -> S32 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
-
- const int16x8x2_t texels =
- {
- {
- vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
- vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))
- }
- };
-
- vst1q_s32(dst_ptr + x, vmovl_s16(vget_low_s16(texels.val[0])));
- vst1q_s32(dst_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0])));
- vst1q_s32(dst_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1])));
- vst1q_s32(dst_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1])));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
- }
- },
- src, dst);
- break;
- }
- case DataType::F32:
- {
- /* Up-conversion U8 -> F32 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
-
- const int16x8x2_t texels =
- {
- {
- vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
- vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))
- }
- };
- vst1q_f32(dst_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0]))));
- vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0]))));
- vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1]))));
- vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1]))));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<float>(*(src_ptr + x));
- }
- },
- src, dst);
- break;
- }
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- {
- /* Up-conversion U8 -> F16 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
-
- const int16x8x2_t texels =
- {
- {
- vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
- vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))
- }
- };
- vst1q_f16(dst_ptr + x, vcvtq_f16_s16(texels.val[0]));
- vst1q_f16(dst_ptr + x + 8, vcvtq_f16_s16(texels.val[1]));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x));
- }
- },
- src, dst);
- break;
- }
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::U16:
- {
- /* Up-conversion U8 -> U16 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<uint16_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
-
- const uint16x8x2_t texels =
- {
- {
- vmovl_u8(vget_low_u8(texels_u8)),
- vmovl_u8(vget_high_u8(texels_u8))
- }
- };
-
- vst1q_u16(dst_ptr + x, texels.val[0]);
- vst1q_u16(dst_ptr + x + 8, texels.val[1]);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<uint16_t>(*(src_ptr + x));
- }
- },
- src, dst);
- break;
- }
- default:
- ARM_COMPUTE_ERROR("dst data type not supported");
- }
- break;
- }
- case DataType::S16:
- {
- switch(_dst->info()->data_type())
- {
- case DataType::QASYMM8_SIGNED:
- {
- /* Down-conversion S16 -> QASYMM8_SIGNED */
- if(ConvertPolicy::SATURATE == _policy)
- {
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const int16x8x2_t texels =
- {
- {
- vld1q_s16(src_ptr + x),
- vld1q_s16(src_ptr + x + 8)
- }
- };
-
- vst1q_s8(dst_ptr + x, vcombine_s8(vqmovn_s16(texels.val[0]), vqmovn_s16(texels.val[1])));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
- }
- },
- src, dst);
- }
- else
- {
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const int16x8x2_t texels =
- {
- {
- vld1q_s16(src_ptr + x),
- vld1q_s16(src_ptr + x + 8)
- }
- };
-
- vst1q_s8(dst_ptr + x, vcombine_s8(vmovn_s16(texels.val[0]), vmovn_s16(texels.val[1])));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<int8_t>(*(src_ptr + x));
- }
- },
- src, dst);
- }
- break;
- }
- case DataType::U8:
- {
- /* Down-conversion S16 -> U8 */
- if(ConvertPolicy::SATURATE == _policy)
- {
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const int16x8x2_t texels =
- {
- {
- vld1q_s16(src_ptr + x),
- vld1q_s16(src_ptr + x + 8)
- }
- };
-
- vst1q_u8(dst_ptr + x, vcombine_u8(vqmovun_s16(texels.val[0]), vqmovun_s16(texels.val[1])));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
- }
- },
- src, dst);
- }
- else
- {
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const int16x8x2_t texels =
- {
- {
- vld1q_s16(src_ptr + x),
- vld1q_s16(src_ptr + x + 8)
- }
- };
-
- vst1q_u8(dst_ptr + x, vcombine_u8(vmovn_u16(vreinterpretq_u16_s16(texels.val[0])),
- vmovn_u16(vreinterpretq_u16_s16(texels.val[1]))));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x));
- }
- },
- src, dst);
- }
- break;
- }
- case DataType::S32:
- {
- /* Up-conversion S16 -> S32 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const int16x8x2_t texels =
- {
- {
- vld1q_s16(src_ptr + x),
- vld1q_s16(src_ptr + x + 8)
- }
- };
-
- const int32x4x4_t texels_s32 =
- {
- {
- vmovl_s16(vget_low_s16(texels.val[0])),
- vmovl_s16(vget_high_s16(texels.val[0])),
- vmovl_s16(vget_low_s16(texels.val[1])),
- vmovl_s16(vget_high_s16(texels.val[1]))
- }
- };
-
- vst1q_s32(dst_ptr + x, texels_s32.val[0]);
- vst1q_s32(dst_ptr + x + 4, texels_s32.val[1]);
- vst1q_s32(dst_ptr + x + 8, texels_s32.val[2]);
- vst1q_s32(dst_ptr + x + 12, texels_s32.val[3]);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
- }
- },
- src, dst);
- break;
- }
- default:
- ARM_COMPUTE_ERROR("dst data type not supported");
- }
- break;
- }
- case DataType::U16:
- {
- switch(_dst->info()->data_type())
- {
- case DataType::U8:
- {
- /* Down-conversion U16 -> U8 */
- if(ConvertPolicy::SATURATE == _policy)
- {
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const uint16x8x2_t texels =
- {
- {
- vld1q_u16(src_ptr + x),
- vld1q_u16(src_ptr + x + 8)
- }
- };
-
- vst1q_u8(dst_ptr + x, vcombine_u8(vqmovn_u16(texels.val[0]), vqmovn_u16(texels.val[1])));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
- }
- },
- src, dst);
- }
- else
- {
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const uint16x8x2_t texels =
- {
- {
- vld1q_u16(src_ptr + x),
- vld1q_u16(src_ptr + x + 8)
- }
- };
-
- vst1q_u8(dst_ptr + x, vcombine_u8(vmovn_u16(texels.val[0]), vmovn_u16(texels.val[1])));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x));
- }
- },
- src, dst);
- }
- break;
- }
- case DataType::U32:
- {
- /* Up-conversion U16 -> U32 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<uint32_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const uint16x8x2_t texels =
- {
- {
- vld1q_u16(src_ptr + x),
- vld1q_u16(src_ptr + x + 8)
- }
- };
-
- vst1q_u32(dst_ptr + x, vmovl_u16(vget_low_u16(texels.val[0])));
- vst1q_u32(dst_ptr + x + 4, vmovl_u16(vget_high_u16(texels.val[0])));
- vst1q_u32(dst_ptr + x + 8, vmovl_u16(vget_low_u16(texels.val[1])));
- vst1q_u32(dst_ptr + x + 12, vmovl_u16(vget_high_u16(texels.val[1])));
- }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<uint32_t>(*(src_ptr + x));
- }
- },
- src, dst);
- break;
- }
- default:
- ARM_COMPUTE_ERROR("dst data type not supported");
- }
- break;
- }
-#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16)
- case DataType::BFLOAT16:
- switch(_dst->info()->data_type())
- {
- case DataType::F32:
- {
- /* Up-conversion BFLOAT16 -> F32 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const bfloat16 *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const uint16x8x2_t texels =
- {
- {
- vld1q_u16(reinterpret_cast<const uint16_t *>(src_ptr) + x),
- vld1q_u16(reinterpret_cast<const uint16_t *>(src_ptr) + x + 8)
- }
- };
-
- vst1q_f32(dst_ptr + x,
- vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_low_u16(texels.val[0])), 16)));
- vst1q_f32(dst_ptr + x + 4,
- vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_high_u16(texels.val[0])), 16)));
- vst1q_f32(dst_ptr + x + 8,
- vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_low_u16(texels.val[1])), 16)));
- vst1q_f32(dst_ptr + x + 12,
- vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_high_u16(texels.val[1])), 16)));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<float>(*(src_ptr + x));
- }
- },
- src, dst);
- break;
- }
- default:
- ARM_COMPUTE_ERROR("dst data type unsupported");
- }
- break;
-#endif /* defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) */
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- switch(_dst->info()->data_type())
- {
- case DataType::QASYMM8_SIGNED:
- {
- /* Down-conversion F16 -> QASYMM8_SIGNED (Always saturating) */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const float16x8x2_t texels =
- {
- {
- vld1q_f16(src_ptr + x),
- vld1q_f16(src_ptr + x + 8),
- }
- };
-
- vst1q_s8(dst_ptr + x, vcombine_s8(vqmovn_s16(vcvtq_s16_f16(texels.val[0])), vqmovn_s16(vcvtq_s16_f16(texels.val[1]))));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
- }
- },
- src, dst);
- break;
- }
- case DataType::QASYMM8:
- case DataType::U8:
- {
- /* Down-conversion F16 -> QASYMM8/U8 (Always saturating) */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const float16x8x2_t texels =
- {
- {
- vld1q_f16(src_ptr + x),
- vld1q_f16(src_ptr + x + 8),
- }
- };
-
- vst1q_u8(dst_ptr + x, vcombine_u8(vqmovun_s16(vcvtq_s16_f16(texels.val[0])), vqmovun_s16(vcvtq_s16_f16(texels.val[1]))));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
- }
- },
- src, dst);
- break;
- }
- case DataType::F32:
- {
- /* Up-conversion F16 -> F32 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const float16x8x2_t texels =
- {
- {
- vld1q_f16(src_ptr + x),
- vld1q_f16(src_ptr + x + 8)
- }
- };
- vst1q_f32(dst_ptr + x, vcvt_f32_f16(vget_low_f16(texels.val[0])));
- vst1q_f32(dst_ptr + x + 4, vcvt_f32_f16(vget_high_f16(texels.val[0])));
- vst1q_f32(dst_ptr + x + 8, vcvt_f32_f16(vget_low_f16(texels.val[1])));
- vst1q_f32(dst_ptr + x + 12, vcvt_f32_f16(vget_high_f16(texels.val[1])));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<float>(*(src_ptr + x));
- }
- },
- src, dst);
- break;
- }
- case DataType::S32:
- {
- /* Up-conversion F16 -> S32 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const float16x8x2_t texels =
- {
- {
- vld1q_f16(src_ptr + x),
- vld1q_f16(src_ptr + x + 8)
- }
- };
-
- vst1q_s32(dst_ptr + x, vcvtq_s32_f32(vcvt_f32_f16(vget_low_f16(texels.val[0]))));
- vst1q_s32(dst_ptr + x + 4, vcvtq_s32_f32(vcvt_f32_f16(vget_high_f16(texels.val[0]))));
- vst1q_s32(dst_ptr + x + 8, vcvtq_s32_f32(vcvt_f32_f16(vget_low_f16(texels.val[1]))));
- vst1q_s32(dst_ptr + x + 12, vcvtq_s32_f32(vcvt_f32_f16(vget_high_f16(texels.val[1]))));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
- }
- },
- src, dst);
- break;
- }
- default:
- ARM_COMPUTE_ERROR("dst data type not supported");
- }
- break;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- case DataType::F32:
- switch(_dst->info()->data_type())
- {
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- {
- /* Down-conversion F32 -> F16 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const float32x4x4_t texels =
- {
- {
- vld1q_f32(src_ptr + x),
- vld1q_f32(src_ptr + x + 4),
- vld1q_f32(src_ptr + x + 8),
- vld1q_f32(src_ptr + x + 12)
- }
- };
-
- vst1q_f16(dst_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1])));
- vst1q_f16(dst_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3])));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x));
- }
- },
- src, dst);
- break;
- }
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16)
- case DataType::BFLOAT16:
- {
- /* Down-conversion F32 -> BFLOAT16 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<bfloat16 *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- wrapper::vcvt_bf16_f32(src_ptr + x,
- reinterpret_cast<uint16_t *>(dst_ptr) + x);
- wrapper::vcvt_bf16_f32(src_ptr + x + 8,
- reinterpret_cast<uint16_t *>(dst_ptr) + x + 8);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = *(src_ptr + x);
- }
- },
- src, dst);
- break;
- }
-#endif /* defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) */
- case DataType::S32:
- {
- /* Conversion F32 -> S32 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const float32x4x4_t texels =
- {
- {
- vld1q_f32(src_ptr + x),
- vld1q_f32(src_ptr + x + 4),
- vld1q_f32(src_ptr + x + 8),
- vld1q_f32(src_ptr + x + 12),
- }
- };
-
- vst1q_s32(dst_ptr + x, vcvtq_s32_f32(texels.val[0]));
- vst1q_s32(dst_ptr + x + 4, vcvtq_s32_f32(texels.val[1]));
- vst1q_s32(dst_ptr + x + 8, vcvtq_s32_f32(texels.val[2]));
- vst1q_s32(dst_ptr + x + 12, vcvtq_s32_f32(texels.val[3]));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
- }
- },
- src, dst);
- break;
- }
- case DataType::QASYMM8:
- case DataType::U8:
- {
- /* Down-conversion F32 -> U8 (Always saturating) */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const float32x4x4_t texels =
- {
- {
- vld1q_f32(src_ptr + x),
- vld1q_f32(src_ptr + x + 4),
- vld1q_f32(src_ptr + x + 8),
- vld1q_f32(src_ptr + x + 12),
- }
- };
-
- vst1_u8(dst_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[0])), vqmovun_s32(vcvtq_s32_f32(texels.val[1])))));
- vst1_u8(dst_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[2])), vqmovun_s32(vcvtq_s32_f32(texels.val[3])))));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
- }
- },
- src, dst);
- break;
- }
- case DataType::QASYMM8_SIGNED:
- {
- /* Down-conversion F32 -> QASYMM8_SIGNED (Always saturating) */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const float32x4x4_t texels =
- {
- {
- vld1q_f32(src_ptr + x),
- vld1q_f32(src_ptr + x + 4),
- vld1q_f32(src_ptr + x + 8),
- vld1q_f32(src_ptr + x + 12),
- }
- };
-
- vst1_s8(dst_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[0])), vqmovn_s32(vcvtq_s32_f32(texels.val[1])))));
- vst1_s8(dst_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[2])), vqmovn_s32(vcvtq_s32_f32(texels.val[3])))));
- }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
- }
- },
- src, dst);
- break;
- }
- default:
- ARM_COMPUTE_ERROR("dst data type not supported");
- }
- break;
-
- case DataType::S32:
- switch(_dst->info()->data_type())
- {
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- {
- /* Down-conversion S32 -> F16 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const float32x4x4_t texels =
- {
- {
- vcvtq_f32_s32(vld1q_s32(src_ptr + x)),
- vcvtq_f32_s32(vld1q_s32(src_ptr + x + 4)),
- vcvtq_f32_s32(vld1q_s32(src_ptr + x + 8)),
- vcvtq_f32_s32(vld1q_s32(src_ptr + x + 12))
- }
- };
-
- vst1q_f16(dst_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1])));
- vst1q_f16(dst_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3])));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x));
- }
- },
- src, dst);
- break;
- }
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- case DataType::F32:
- {
- /* Conversion S32 -> F32 */
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const int32x4x4_t texels =
- {
- {
- vld1q_s32(src_ptr + x),
- vld1q_s32(src_ptr + x + 4),
- vld1q_s32(src_ptr + x + 8),
- vld1q_s32(src_ptr + x + 12),
- }
- };
-
- vst1q_f32(dst_ptr + x, vcvtq_f32_s32(texels.val[0]));
- vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(texels.val[1]));
- vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(texels.val[2]));
- vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(texels.val[3]));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<float>(*(src_ptr + x));
- }
- },
- src, dst);
- break;
- }
- case DataType::QASYMM8_SIGNED:
- {
- /* Down-conversion S32 -> QASYMM8_SIGNED */
- if(ConvertPolicy::SATURATE == _policy)
- {
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const int32x4x4_t texels =
- {
- {
- vld1q_s32(src_ptr + x),
- vld1q_s32(src_ptr + x + 4),
- vld1q_s32(src_ptr + x + 8),
- vld1q_s32(src_ptr + x + 12),
- }
- };
- vst1_s8(dst_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[0]), vqmovn_s32(texels.val[1]))));
- vst1_s8(dst_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[2]), vqmovn_s32(texels.val[3]))));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
- }
- },
- src, dst);
- }
- else
- {
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const int32x4x4_t texels =
- {
- {
- vld1q_s32(src_ptr + x),
- vld1q_s32(src_ptr + x + 4),
- vld1q_s32(src_ptr + x + 8),
- vld1q_s32(src_ptr + x + 12)
- }
- };
-
- vst1_s8(dst_ptr + x, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[0]), vmovn_s32(texels.val[1]))));
- vst1_s8(dst_ptr + x + 8, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[2]), vmovn_s32(texels.val[3]))));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<int8_t>(*(src_ptr + x));
- }
- },
- src, dst);
- }
- break;
- }
- case DataType::QASYMM8:
- case DataType::U8:
- {
- /* Down-conversion S32 -> U8 */
- if(ConvertPolicy::SATURATE == _policy)
- {
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const int32x4x4_t texels =
- {
- {
- vld1q_s32(src_ptr + x),
- vld1q_s32(src_ptr + x + 4),
- vld1q_s32(src_ptr + x + 8),
- vld1q_s32(src_ptr + x + 12)
- }
- };
- vst1_u8(dst_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[0]), vqmovun_s32(texels.val[1]))));
- vst1_u8(dst_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[2]), vqmovun_s32(texels.val[3]))));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
- }
- },
- src, dst);
- }
- else
- {
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
- const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const int32x4x4_t texels =
- {
- {
- vld1q_s32(src_ptr + x),
- vld1q_s32(src_ptr + x + 4),
- vld1q_s32(src_ptr + x + 8),
- vld1q_s32(src_ptr + x + 12)
- }
- };
-
- vst1_u8(dst_ptr + x, vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[0])), vmovn_u32(vreinterpretq_u32_s32(texels.val[1])))));
- vst1_u8(dst_ptr + x + 8, vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[2])), vmovn_u32(vreinterpretq_u32_s32(texels.val[3])))));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x));
- }
- },
- src, dst);
- }
- break;
- }
- default:
- ARM_COMPUTE_ERROR("dst data type not supported");
- }
- break;
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
-}
-
-const char *CpuCastKernel::name() const
-{
- return "CpuCastKernel.cpp";
-}
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
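For reference, the ConvertPolicy split that recurs throughout the kernel above reduces to which NEON narrowing intrinsic is chosen: vqmovn/vqmovun clamp to the destination range (SATURATE), while vmovn simply drops the high bits (WRAP). A minimal scalar sketch of the two behaviours for S16 -> S8, assuming only the C++ standard library (the helper names are made up):

    #include <algorithm>
    #include <cstdint>

    // SATURATE: clamp to [-128, 127], the scalar analogue of vqmovn_s16.
    int8_t narrow_saturate(int16_t v)
    {
        return static_cast<int8_t>(std::min<int16_t>(std::max<int16_t>(v, -128), 127));
    }

    // WRAP: keep only the low 8 bits, the scalar analogue of vmovn_s16.
    int8_t narrow_wrap(int16_t v)
    {
        return static_cast<int8_t>(static_cast<uint8_t>(v));
    }

    // narrow_saturate(300) == 127, narrow_wrap(300) == 44 (300 mod 256).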
diff --git a/src/core/cpu/kernels/CpuCastKernel.h b/src/core/cpu/kernels/CpuCastKernel.h
deleted file mode 100644
index 2a75c5850e..0000000000
--- a/src/core/cpu/kernels/CpuCastKernel.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_CAST_KERNEL_H
-#define ARM_COMPUTE_CPU_CAST_KERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/cpu/ICpuKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-/** Casts a given tensor to a new type
- *
- * @note When casting between quantized types the scale and zeroPoint are ignored
- */
-class CpuCastKernel : public ICpuKernel
-{
-public:
- CpuCastKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuCastKernel);
- /** Set the src and dst of the kernel
- *
- * Valid conversions src -> dst:
- *
- * - QASYMM8_SIGNED -> S16, S32, F32, F16
- * - QASYMM8 -> U16, S16, S32, F32, F16
- * - U8 -> U16, S16, S32, F32, F16
- * - U16 -> U8, U32
- * - S16 -> QASYMM8_SIGNED, U8, S32
- * - BFLOAT16 -> F32
- * - F16 -> QASYMM8_SIGNED, QASYMM8, F32, S32, U8
- * - S32 -> QASYMM8_SIGNED, QASYMM8, F16, F32, U8
- * - F32 -> QASYMM8_SIGNED, QASYMM8, BFLOAT16, F16, S32, U8
- *
- * @param[in] src The src tensor to convert. Data types supported: QASYMM8_SIGNED/QASYMM8/U8/U16/S16/S32/BFLOAT16/F16/F32.
- * @param[out] dst The dst tensor. Data types supported: QASYMM8_SIGNED/QASYMM8/U8/U16/S16/U32/S32/BFLOAT16/F16/F32.
- * @param[in] policy Conversion policy.
- */
- void configure(const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref CpuCastKernel::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
- const char *name() const override;
-
-private:
- ConvertPolicy _policy{ ConvertPolicy::SATURATE };
-};
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_CAST_KERNEL_H */
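A hypothetical configure-time usage sketch of the interface above; the shapes are invented and the commented-out run-time binding assumes the ITensorPack/run_op dispatch declared in the header, not a documented example:

    using namespace arm_compute;

    TensorInfo src_info(TensorShape(16U, 4U), 1, DataType::U8);
    TensorInfo dst_info(TensorShape(16U, 4U), 1, DataType::S32);

    cpu::kernels::CpuCastKernel cast;
    if(bool(cpu::kernels::CpuCastKernel::validate(&src_info, &dst_info, ConvertPolicy::SATURATE)))
    {
        cast.configure(&src_info, &dst_info, ConvertPolicy::SATURATE);
        // At run time the caller binds real tensors and dispatches:
        //   ITensorPack pack{ { TensorType::ACL_SRC, &src }, { TensorType::ACL_DST, &dst } };
        //   cast.run_op(pack, cast.window(), ThreadInfo{});
    }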
diff --git a/src/core/cpu/kernels/CpuConcatenateBatchKernel.cpp b/src/core/cpu/kernels/CpuConcatenateBatchKernel.cpp
deleted file mode 100644
index 5df5ac3dd0..0000000000
--- a/src/core/cpu/kernels/CpuConcatenateBatchKernel.cpp
+++ /dev/null
@@ -1,216 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/cpu/kernels/CpuConcatenateBatchKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/NEON/NEAsymm.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-namespace
-{
-template <typename T>
-void batch_concat(const ITensor *src, ITensor *dst, unsigned int batch_offset, const Window &window)
-{
- // Offset src
- uint8_t *src_ptr = src->buffer() + src->info()->offset_first_element_in_bytes();
-
- // Offset dst
- uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() + batch_offset * dst->info()->strides_in_bytes()[3];
-
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const int window_step_x = 16 / dst->info()->element_size();
-
- Window win{ window };
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- win.set(3, Window::Dimension(0, src->info()->tensor_shape()[3], 1));
-
- Iterator src_it(src, win);
- Iterator dst_it(dst, win);
-
- const DataType dt = src->info()->data_type();
- const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform();
- const UniformQuantizationInfo dst_qinfo = dst->info()->quantization_info().uniform();
- if(dt == DataType::QASYMM8 && src_qinfo != dst_qinfo)
- {
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto in_ptr = reinterpret_cast<const uint8_t *>(src_ptr + src_it.offset());
- const auto out_ptr = reinterpret_cast<uint8_t *>(dst_ptr + dst_it.offset());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- wrapper::vstore(out_ptr + x, vquantize(vdequantize(wrapper::vloadq(in_ptr + x), src_qinfo), dst_qinfo));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(out_ptr + x) = quantize_qasymm8(dequantize_qasymm8(*(in_ptr + x), src_qinfo), dst_qinfo);
- }
- },
- src_it, dst_it);
- }
- else if(dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo)
- {
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto in_ptr = reinterpret_cast<const int8_t *>(src_ptr + src_it.offset());
- const auto out_ptr = reinterpret_cast<int8_t *>(dst_ptr + dst_it.offset());
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- wrapper::vstore(out_ptr + x, vquantize_signed(vdequantize(wrapper::vloadq(in_ptr + x), src_qinfo), dst_qinfo));
- }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(out_ptr + x) = quantize_qasymm8_signed(dequantize_qasymm8_signed(*(in_ptr + x), src_qinfo), dst_qinfo);
- }
- },
- src_it, dst_it);
- }
- else
- {
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto in_ptr = reinterpret_cast<const T *>(src_ptr + src_it.offset());
- const auto out_ptr = reinterpret_cast<T *>(dst_ptr + dst_it.offset());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(out_ptr + x) = *(in_ptr + x);
- }
- },
- src_it, dst_it);
- }
-}
-
-Status validate_arguments(const ITensorInfo *src, unsigned int batch_offset, const ITensorInfo *dst)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
- // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions.
- ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
-
- ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimX) != dst->dimension(Window::DimX));
- ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimY) != dst->dimension(Window::DimY));
- ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimZ) != dst->dimension(Window::DimZ));
- ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(3) + batch_offset > dst->dimension(3));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(4, src, dst);
-
- return Status{};
-}
-} // namespace
-
-CpuConcatenateBatchKernel::CpuConcatenateBatchKernel()
- : _func(nullptr), _batch_offset(0)
-{
-}
-
-void CpuConcatenateBatchKernel::configure(const ITensorInfo *src, unsigned int batch_offset, ITensorInfo *dst)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, batch_offset, dst));
-
- _func = nullptr;
- _batch_offset = batch_offset;
-
- switch(src->data_type())
- {
- case DataType::S8:
- case DataType::U8:
- case DataType::QASYMM8:
- case DataType::QASYMM8_SIGNED:
- _func = &batch_concat<uint8_t>;
- break;
- case DataType::S16:
- case DataType::U16:
- case DataType::F16:
- _func = &batch_concat<uint16_t>;
- break;
- case DataType::S32:
- case DataType::U32:
- case DataType::F32:
- _func = &batch_concat<uint32_t>;
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported data type.");
- }
-
- // Configure kernel window
- Window win = calculate_max_window(*dst, Steps());
- ICpuKernel::configure(win);
-}
-
-Status CpuConcatenateBatchKernel::validate(const arm_compute::ITensorInfo *src,
- unsigned int batch_offset,
- const arm_compute::ITensorInfo *dst)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, batch_offset, dst));
- return Status{};
-}
-
-void CpuConcatenateBatchKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
- ARM_COMPUTE_ERROR_ON(_func == nullptr);
-
- (*_func)(tensors.get_const_tensor(TensorType::ACL_SRC),
- tensors.get_tensor(TensorType::ACL_DST),
- _batch_offset,
- window);
-}
-
-const char *CpuConcatenateBatchKernel::name() const
-{
- return "CpuConcatenateBatchKernel";
-}
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
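When the source and destination quantization infos differ, the QASYMM8 paths in batch_concat above requantize every element through a dequantize/quantize round trip. A scalar model of that round trip, assuming round-to-nearest as in quantize_qasymm8 (the helper name is made up):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // (v - src_offset) * src_scale recovers the real value, which is then
    // re-encoded with the destination scale/offset and clamped to U8 range.
    uint8_t requantize_qasymm8(uint8_t v, float src_scale, int32_t src_offset,
                               float dst_scale, int32_t dst_offset)
    {
        const float   real = (static_cast<int32_t>(v) - src_offset) * src_scale;
        const int32_t q    = static_cast<int32_t>(std::lround(real / dst_scale)) + dst_offset;
        return static_cast<uint8_t>(std::min(std::max(q, 0), 255));
    }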
diff --git a/src/core/cpu/kernels/CpuConcatenateBatchKernel.h b/src/core/cpu/kernels/CpuConcatenateBatchKernel.h
deleted file mode 100644
index 99e8d84d99..0000000000
--- a/src/core/cpu/kernels/CpuConcatenateBatchKernel.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_CONCATENATEBATCH_KERNEL_H
-#define ARM_COMPUTE_CPU_CONCATENATEBATCH_KERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/cpu/ICpuKernel.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-namespace cpu
-{
-namespace kernels
-{
-/** Interface for the batch concatenate kernel.
- * The input tensor will be concatenated into the output tensor.
- */
-class CpuConcatenateBatchKernel : public ICpuKernel
-{
-public:
- CpuConcatenateBatchKernel();
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuConcatenateBatchKernel);
- /** Configure kernel for a given list of arguments
- *
- * @param[in] src Source tensor info. Data types supported: All.
- * @param[in] batch_offset The offset on axis 3 (the batch axis).
- * @param[in,out] dst Destination tensor info. Data types supported: Same as @p src.
- */
- void configure(const ITensorInfo *src, unsigned int batch_offset, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration of @ref CpuConcatenateBatchKernel
- *
- * @param[in] src Source tensor info. Data types supported: All.
- * @param[in] batch_offset The offset on axis 3 (the batch axis).
- * @param[in] dst Destination tensor info. Data types supported: Same as @p src.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, unsigned int batch_offset, const ITensorInfo *dst);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
- const char *name() const override;
-
-private:
- using BatchConcatFunction = void(const ITensor *, ITensor *, unsigned int, const Window &);
-
-private:
- BatchConcatFunction *_func;
- unsigned int _batch_offset;
-};
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_CONCATENATEBATCH_KERNEL_H */
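The validate() contract above pins every dimension except axis 3; a sketch of an accepted configuration, with illustrative shapes:

    // src: [W, H, Z, 2] concatenated into dst: [W, H, Z, 5] at batch_offset = 3
    // passes because src->dimension(3) + batch_offset <= dst->dimension(3)  (2 + 3 <= 5)
    // and the W/H/Z dimensions match, as checked in validate_arguments().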
diff --git a/src/core/cpu/kernels/CpuConcatenateDepthKernel.cpp b/src/core/cpu/kernels/CpuConcatenateDepthKernel.cpp
deleted file mode 100644
index a7e5cd8c60..0000000000
--- a/src/core/cpu/kernels/CpuConcatenateDepthKernel.cpp
+++ /dev/null
@@ -1,212 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/cpu/kernels/CpuConcatenateDepthKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/NEON/NEAsymm.h"
-#include "src/core/NEON/NEFixedPoint.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-namespace
-{
-template <typename T>
-void depth_concat(const ITensor *src, ITensor *dst, unsigned int depth_offset, const Window &window)
-{
- // Offset source
- uint8_t *src_ptr = src->buffer() + src->info()->offset_first_element_in_bytes();
-
- // Offset destination
- uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() + depth_offset * dst->info()->strides_in_bytes()[2];
-
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const int window_step_x = 16 / dst->info()->element_size();
-
- Window win{ window };
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- win.set(Window::DimZ, Window::Dimension(0, src->info()->tensor_shape().z(), 1));
-
- Iterator src_it(src, win);
- Iterator dst_it(dst, win);
-
- const DataType dt = src->info()->data_type();
- const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform();
- const UniformQuantizationInfo dst_qinfo = dst->info()->quantization_info().uniform();
- if(dt == DataType::QASYMM8 && src_qinfo != dst_qinfo)
- {
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto in_ptr = reinterpret_cast<const uint8_t *>(src_ptr + src_it.offset());
- const auto out_ptr = reinterpret_cast<uint8_t *>(dst_ptr + dst_it.offset());
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- wrapper::vstore(out_ptr + x, vquantize(vdequantize(wrapper::vloadq(in_ptr + x), src_qinfo), dst_qinfo));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(out_ptr + x) = quantize_qasymm8(dequantize_qasymm8(*(in_ptr + x), src_qinfo), dst_qinfo);
- }
- },
- src_it, dst_it);
- }
- else if(dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo)
- {
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto in_ptr = reinterpret_cast<const int8_t *>(src_ptr + src_it.offset());
- const auto out_ptr = reinterpret_cast<int8_t *>(dst_ptr + dst_it.offset());
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- wrapper::vstore(out_ptr + x, vquantize_signed(vdequantize(wrapper::vloadq(in_ptr + x), src_qinfo), dst_qinfo));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(out_ptr + x) = quantize_qasymm8_signed(dequantize_qasymm8_signed(*(in_ptr + x), src_qinfo), dst_qinfo);
- }
- },
- src_it, dst_it);
- }
- else
- {
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto in_ptr = reinterpret_cast<const T *>(src_ptr + src_it.offset());
- const auto out_ptr = reinterpret_cast<T *>(dst_ptr + dst_it.offset());
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x));
- }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(out_ptr + x) = *(in_ptr + x);
- }
- },
- src_it, dst_it);
- }
-}
-
-Status validate_arguments(const ITensorInfo *input, unsigned int depth_offset, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use CPU FP16 instructions.
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(Window::DimX) != output->dimension(Window::DimX));
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(Window::DimY) != output->dimension(Window::DimY));
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(2) + depth_offset > output->dimension(2));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(3, input, output);
-
- return Status{};
-}
-} // namespace
-
-CpuConcatenateDepthKernel::CpuConcatenateDepthKernel()
- : _func(nullptr), _depth_offset(0)
-{
-}
-
-void CpuConcatenateDepthKernel::configure(const ITensorInfo *src, unsigned int depth_offset, ITensorInfo *dst)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, depth_offset, dst));
-
- _func = nullptr;
- _depth_offset = depth_offset;
-
- switch(src->data_type())
- {
- case DataType::QASYMM8:
- _func = &depth_concat<uint8_t>;
- break;
- case DataType::QASYMM8_SIGNED:
- _func = &depth_concat<int8_t>;
- break;
- case DataType::F16:
- _func = &depth_concat<uint16_t>;
- break;
- case DataType::F32:
- _func = &depth_concat<uint32_t>;
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported data type.");
- }
-
- // Configure kernel window
- Window win = calculate_max_window(*dst, Steps());
- ICpuKernel::configure(win);
-}
-
-Status CpuConcatenateDepthKernel::validate(const arm_compute::ITensorInfo *src,
- unsigned int depth_offset,
- const arm_compute::ITensorInfo *dst)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, depth_offset, dst));
- return Status{};
-}
-
-void CpuConcatenateDepthKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
- ARM_COMPUTE_ERROR_ON(_func == nullptr);
-
- (*_func)(tensors.get_const_tensor(TensorType::ACL_SRC),
- tensors.get_tensor(TensorType::ACL_DST),
- _depth_offset,
- window);
-}
-
-const char *CpuConcatenateDepthKernel::name() const
-{
- return "CpuConcatenateDepthKernel";
-}
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/core/cpu/kernels/CpuConcatenateDepthKernel.h b/src/core/cpu/kernels/CpuConcatenateDepthKernel.h
deleted file mode 100644
index af89c2464f..0000000000
--- a/src/core/cpu/kernels/CpuConcatenateDepthKernel.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ARM_COMPUTE_CPU_CONCATENATEDEPTH_KERNEL_H
-#define ARM_COMPUTE_CPU_CONCATENATEDEPTH_KERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/cpu/ICpuKernel.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-namespace cpu
-{
-namespace kernels
-{
-/** Interface for the depth concatenate kernel.
- * The input tensor will be concatenated into the output tensor.
- */
-class CpuConcatenateDepthKernel : public ICpuKernel
-{
-public:
- CpuConcatenateDepthKernel();
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuConcatenateDepthKernel);
- /** Configure kernel for a given list of arguments
- *
- * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] depth_offset The offset on the Z axis.
- * @param[in,out] dst Destination tensor info. Data types supported: Same as @p src.
- *
- * @note The two lowest dimensions of the destination tensor can't be smaller than those of the source tensor.
- * @note The difference between the source and destination tensors in each of the two lowest dimensions must be divisible by 2.
- *
- */
- void configure(const ITensorInfo *src, unsigned int depth_offset, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration of @ref CpuConcatenateDepthKernel
- *
- * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] depth_offset The offset on the Z axis.
- * @param[in] dst Destination tensor info. Data types supported: Same as @p src.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, unsigned int depth_offset, const ITensorInfo *dst);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
- const char *name() const override;
-
-private:
- using DepthConcatFunction = void(const ITensor *, ITensor *, unsigned int, const Window &);
-
-private:
- DepthConcatFunction *_func;
- unsigned int _depth_offset;
-};
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_CONCATENATEDEPTH_KERNEL_H */
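The concatenation itself is plain pointer arithmetic: depth_concat offsets the destination base pointer by depth_offset Z-planes before iterating, so each source tensor lands in its own slice. A worked example with invented sizes, assuming a dense (unpadded) destination:

    // element_size = 4 (F32), W = 8, H = 4, so one Z-plane is
    //   strides_in_bytes()[2] = 8 * 4 * 4 = 128 bytes.
    // With depth_offset = 3 the kernel computes
    //   dst_ptr = dst->buffer() + offset_first_element_in_bytes + 3 * 128;
    // i.e. writing starts at the fourth plane of the destination tensor.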
diff --git a/src/core/cpu/kernels/CpuConcatenateHeightKernel.cpp b/src/core/cpu/kernels/CpuConcatenateHeightKernel.cpp
deleted file mode 100644
index 54b972662b..0000000000
--- a/src/core/cpu/kernels/CpuConcatenateHeightKernel.cpp
+++ /dev/null
@@ -1,183 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/cpu/kernels/CpuConcatenateHeightKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/NEON/NEAsymm.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *src, unsigned int height_offset, const ITensorInfo *dst)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
- // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions.
- ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimX) != dst->dimension(Window::DimX));
- ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimY) + height_offset > dst->dimension(Window::DimY));
- for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(i) != dst->dimension(i));
- }
-
- return Status{};
-}
-} // namespace
-
-CpuConcatenateHeightKernel::CpuConcatenateHeightKernel()
- : _height_offset(0)
-{
-}
-
-void CpuConcatenateHeightKernel::configure(const ITensorInfo *src, unsigned int height_offset, ITensorInfo *dst)
-{
- ARM_COMPUTE_UNUSED(src);
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, height_offset, dst));
-
- _height_offset = height_offset;
-
- // Configure kernel window
- Window win = calculate_max_window(*dst, Steps());
- ICpuKernel::configure(win);
-}
-
-Status CpuConcatenateHeightKernel::validate(const ITensorInfo *src, unsigned int height_offset, const ITensorInfo *dst)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, height_offset, dst));
- return Status{};
-}
-
-void CpuConcatenateHeightKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
-
- const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
- auto dst = tensors.get_tensor(TensorType::ACL_DST);
-
- // Offset destination pointer to the correct position
- uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() + _height_offset * dst->info()->strides_in_bytes()[Window::DimY];
-
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end()) * static_cast<int>(dst->info()->element_size());
- const int window_step_x = 16;
-
- Window win{ window };
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- win.set(Window::DimY, Window::Dimension(0, src->info()->tensor_shape().y(), 1));
-
- // Create iterators
- Iterator src_it(src, win);
- Iterator dst_it(dst, win);
-
- const DataType dt = src->info()->data_type();
- const UniformQuantizationInfo &src_qinfo = src->info()->quantization_info().uniform();
- const UniformQuantizationInfo &dst_qinfo = dst->info()->quantization_info().uniform();
- if(dt == DataType::QASYMM8 && src_qinfo != dst_qinfo)
- {
- execute_window_loop(win, [&](const Coordinates &)
- {
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- vst1q_u8(dst_ptr + dst_it.offset() + x, vquantize(vdequantize(vld1q_u8(src_it.ptr() + x), src_qinfo), dst_qinfo));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + dst_it.offset() + x) = quantize_qasymm8(dequantize_qasymm8(*(src_it.ptr() + x), src_qinfo), dst_qinfo);
- }
- },
- src_it, dst_it);
- }
- else if(dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo)
- {
- execute_window_loop(win, [&](const Coordinates &)
- {
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- vst1q_s8(reinterpret_cast<int8_t *>(dst_ptr + dst_it.offset() + x),
- vquantize_signed(vdequantize(vld1q_s8(reinterpret_cast<int8_t *>(src_it.ptr()) + x), src_qinfo), dst_qinfo));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + dst_it.offset() + x) = quantize_qasymm8_signed(dequantize_qasymm8_signed(*(src_it.ptr() + x), src_qinfo), dst_qinfo);
- }
- },
- src_it, dst_it);
- }
- else
- {
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto in_ptr = src_it.ptr();
- const auto out_ptr = dst_ptr + dst_it.offset();
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(out_ptr + x) = *(in_ptr + x);
- }
- },
- src_it, dst_it);
- }
-}
-
-const char *CpuConcatenateHeightKernel::name() const
-{
- return "CpuConcatenateHeightKernel";
-}
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
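Note that run_op above rescales the window's X range into bytes (window_end_x is multiplied by the element size) and then copies 16 bytes per step, which is what lets a single kernel serve every data type. The vector/scalar split, with illustrative numbers:

    // A row of n elements of s bytes each spans n*s bytes:
    //   full 16-byte vector stores: (n*s) / 16
    //   trailing scalar byte copies: (n*s) % 16
    // e.g. n = 10, s = 4 (F32): 40 bytes -> two vector stores + 8 scalar bytes.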
diff --git a/src/core/cpu/kernels/CpuConcatenateHeightKernel.h b/src/core/cpu/kernels/CpuConcatenateHeightKernel.h
deleted file mode 100644
index 609bb21da7..0000000000
--- a/src/core/cpu/kernels/CpuConcatenateHeightKernel.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_CONCATENATEHEIGHT_KERNEL_H
-#define ARM_COMPUTE_CPU_CONCATENATEHEIGHT_KERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/cpu/ICpuKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-/** Interface for the height concatenate kernel.
- * The source tensor will be concatenated into the destination tensor.
- */
-class CpuConcatenateHeightKernel : public ICpuKernel
-{
-public:
- CpuConcatenateHeightKernel();
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuConcatenateHeightKernel);
- /** Configure kernel for a given list of arguments
- *
- * @param[in] src Source tensor info. Data types supported: All
- * @param[in] height_offset The starting offset on the Y axis for the output tensor.
- * @param[in,out] dst Destination tensor info. Data types supported: Same as @p src.
- *
- */
- void configure(const ITensorInfo *src, unsigned int height_offset, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration of @ref CpuConcatenateHeightKernel
- *
- * @param[in] src Source tensor info. Data types supported: All
- * @param[in] height_offset The starting offset on the Y axis for the output tensor.
- * @param[in] dst Destination tensor info. Data types supported: Same as @p src.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, unsigned int height_offset, const ITensorInfo *dst);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
- const char *name() const override;
-
-private:
- unsigned int _height_offset;
-};
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_CONCATENATEHEIGHT_KERNEL_H */
diff --git a/src/core/cpu/kernels/CpuConcatenateWidthKernel.cpp b/src/core/cpu/kernels/CpuConcatenateWidthKernel.cpp
deleted file mode 100644
index effcbc336c..0000000000
--- a/src/core/cpu/kernels/CpuConcatenateWidthKernel.cpp
+++ /dev/null
@@ -1,180 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/cpu/kernels/CpuConcatenateWidthKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/NEON/NEAsymm.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *src, unsigned int width_offset, const ITensorInfo *dst)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
- // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions.
- ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) + width_offset > dst->dimension(0));
-
- for(size_t i = 1; i < Coordinates::num_max_dimensions; ++i)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(i) != dst->dimension(i));
- }
-
- return Status{};
-}
-} // namespace
-
-CpuConcatenateWidthKernel::CpuConcatenateWidthKernel()
- : _width_offset(0)
-{
-}
-
-void CpuConcatenateWidthKernel::configure(const ITensorInfo *src, unsigned int width_offset, ITensorInfo *dst)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, width_offset, dst));
- ARM_COMPUTE_UNUSED(dst);
-
- _width_offset = width_offset;
-
- // Configure kernel window
- Window win = calculate_max_window(*src, Steps());
-
- ICpuKernel::configure(win);
-}
-
-Status CpuConcatenateWidthKernel::validate(const ITensorInfo *src, unsigned int width_offset, const ITensorInfo *dst)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, width_offset, dst));
- return Status{};
-}
-
-void CpuConcatenateWidthKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
-
- const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
- auto dst = tensors.get_tensor(TensorType::ACL_DST);
-
- // Offset output pointer to the correct position
- uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() + _width_offset * dst->info()->strides_in_bytes()[0];
-
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end()) * static_cast<int>(dst->info()->element_size());
- constexpr int window_step_x = 16;
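- // The X dimension of the window is collapsed below, so x effectively walks each
- // row in bytes: 16 bytes per step fills one 128-bit NEON register regardless of
- // the element type, and the scalar tail loops handle the remainder.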
-
- Window win{ window };
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- // Create iterators
- Iterator src_it(src, win);
- Iterator dst_it(dst, win);
- const DataType dt = src->info()->data_type();
- const UniformQuantizationInfo &src_qinfo = src->info()->quantization_info().uniform();
- const UniformQuantizationInfo &dst_qinfo = dst->info()->quantization_info().uniform();
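- // When src and dst carry different quantization parameters, a raw byte copy would
- // change the represented values, so each element is requantized on the fly:
- // real = src_scale * (q - src_offset), then q' = round(real / dst_scale) + dst_offset.
- // Matching qinfo (or non-quantized types) falls through to the plain copy below.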
- if(dt == DataType::QASYMM8 && src_qinfo != dst_qinfo)
- {
- execute_window_loop(win, [&](const Coordinates &)
- {
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- vst1q_u8(dst_ptr + dst_it.offset() + x, vquantize(vdequantize(vld1q_u8(src_it.ptr() + x), src_qinfo), dst_qinfo));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + dst_it.offset() + x) = quantize_qasymm8(dequantize_qasymm8(*(src_it.ptr() + x), src_qinfo), dst_qinfo);
- }
- },
- src_it, dst_it);
- }
- else if(dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo)
- {
- execute_window_loop(win, [&](const Coordinates &)
- {
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- vst1q_s8(reinterpret_cast<int8_t *>(dst_ptr + dst_it.offset() + x),
- vquantize_signed(vdequantize(vld1q_s8(reinterpret_cast<int8_t *>(src_it.ptr() + x)), src_qinfo), dst_qinfo));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(dst_ptr + dst_it.offset() + x) = quantize_qasymm8_signed(dequantize_qasymm8_signed(*(src_it.ptr() + x), src_qinfo), dst_qinfo);
- }
- },
- src_it, dst_it);
- }
- else
- {
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto in_ptr = src_it.ptr();
- const auto out_ptr = dst_ptr + dst_it.offset();
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(out_ptr + x) = *(in_ptr + x);
- }
- },
- src_it, dst_it);
- }
-}
-
-const char *CpuConcatenateWidthKernel::name() const
-{
- return "CpuConcatenateWidthKernel";
-}
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/core/cpu/kernels/CpuConcatenateWidthKernel.h b/src/core/cpu/kernels/CpuConcatenateWidthKernel.h
deleted file mode 100644
index afdc3ccddd..0000000000
--- a/src/core/cpu/kernels/CpuConcatenateWidthKernel.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ARM_COMPUTE_CPU_CONCATENATEWIDTH_KERNEL_H
-#define ARM_COMPUTE_CPU_CONCATENATEWIDTH_KERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/cpu/ICpuKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-/** Interface for the width concatenate kernel.
- * The source tensor will be concatenated into the destination tensor.
- */
-class CpuConcatenateWidthKernel : public ICpuKernel
-{
-public:
- CpuConcatenateWidthKernel();
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuConcatenateWidthKernel);
- /** Configure kernel for a given list of arguments
- *
- * @param[in] src Source tensor info. Data types supported: All
- * @param[in]     width_offset The starting offset on the X axis, in elements, at which @p src is written into @p dst.
- * @param[in,out] dst Destination tensor info. Data types supported: Same as @p src.
- */
- void configure(const ITensorInfo *src, unsigned int width_offset, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration of @ref CpuConcatenateWidthKernel
- *
- * @param[in] src Source tensor info. Data types supported: All
- * @param[in] width_offset The starting offset on the X axis, in elements, at which @p src is written into @p dst.
- * @param[in] dst Destination tensor info. Data types supported: Same as @p src.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, unsigned int width_offset, const ITensorInfo *dst);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
- const char *name() const override;
-
-private:
- unsigned int _width_offset;
-};
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_CONCATENATEWIDTH_KERNEL_H */
diff --git a/src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp b/src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp
deleted file mode 100644
index d91ee64ecf..0000000000
--- a/src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp
+++ /dev/null
@@ -1,137 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h"
-
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Types.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-CpuConvertFullyConnectedWeightsKernel::CpuConvertFullyConnectedWeightsKernel()
- : _factor1(0), _factor2(0)
-{
-}
-
-void CpuConvertFullyConnectedWeightsKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_input_shape,
- DataLayout data_layout)
-
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
-
- // Output tensor auto initialisation if not yet initialized
- auto_init_if_empty(*dst, *src->clone());
-
- ARM_COMPUTE_ERROR_THROW_ON(CpuConvertFullyConnectedWeightsKernel::validate(src, dst, original_input_shape, data_layout));
-
- const DataLayout input_data_layout = (data_layout == DataLayout::NCHW) ? DataLayout::NHWC : DataLayout::NCHW;
-
- const int width_idx = get_data_layout_dimension_index(input_data_layout, DataLayoutDimension::WIDTH);
- const int height_idx = get_data_layout_dimension_index(input_data_layout, DataLayoutDimension::HEIGHT);
- const int channel_idx = get_data_layout_dimension_index(input_data_layout, DataLayoutDimension::CHANNEL);
-
- const unsigned int num_elems_per_input_plane = original_input_shape[width_idx] * original_input_shape[height_idx];
- const unsigned int num_channels = original_input_shape[channel_idx];
-
- _factor1 = (data_layout == DataLayout::NCHW) ? num_elems_per_input_plane : num_channels;
- _factor2 = (data_layout == DataLayout::NCHW) ? num_channels : num_elems_per_input_plane;
-
- // Configure kernel window
- Window win = calculate_max_window(*src, Steps());
- ICpuKernel::configure(win);
-}
-
-Status CpuConvertFullyConnectedWeightsKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_input_shape,
- DataLayout data_layout)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
- ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
- ARM_COMPUTE_RETURN_ERROR_ON(src->num_dimensions() != 2);
- ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(1) != original_input_shape.total_size_lower(3));
- ARM_COMPUTE_RETURN_ERROR_ON(data_layout == DataLayout::UNKNOWN);
-
- // Checks performed when dst is configured
- if((dst != nullptr) && (dst->total_size() != 0))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
- }
-
- return Status{};
-}
-
-template <typename T>
-void CpuConvertFullyConnectedWeightsKernel::run_convert_fc_weights(const ITensor *in, ITensor *out, const Window &window)
-{
- const unsigned int dst_stride_x = out->info()->strides_in_bytes().x();
- const unsigned int dst_stride_y = out->info()->strides_in_bytes().y();
-
- Iterator input(in, window);
- Iterator output(out, window);
-
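- // Each src row y is written to dst row (y % _factor1) * _factor2 + y / _factor1,
- // which swaps plane-major and channel-major ordering. E.g. with _factor1 = 4
- // elements per plane and _factor2 = 3 channels (NCHW-trained weights), src row 5
- // (channel 1, element 1) lands on dst row 1 * 3 + 1 = 4.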
- execute_window_loop(window, [&](const Coordinates & id)
- {
- *reinterpret_cast<T *>(output.ptr() + id.x() * dst_stride_x + (id.y() % _factor1 * _factor2 + id.y() / _factor1) * dst_stride_y) = *reinterpret_cast<T *>(input.ptr());
- },
- input);
-}
-
-void CpuConvertFullyConnectedWeightsKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
-
- const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
- auto dst = tensors.get_tensor(TensorType::ACL_DST);
-
- switch(src->info()->element_size())
- {
- case 1:
- run_convert_fc_weights<uint8_t>(src, dst, window);
- break;
- case 2:
- run_convert_fc_weights<uint16_t>(src, dst, window);
- break;
- case 4:
- run_convert_fc_weights<uint32_t>(src, dst, window);
- break;
- default:
- ARM_COMPUTE_ERROR("Data type not supported.");
- break;
- }
-}
-
-const char *CpuConvertFullyConnectedWeightsKernel::name() const
-{
- return "CpuConvertFullyConnectedWeightsKernel";
-}
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h b/src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h
deleted file mode 100644
index c867e3deeb..0000000000
--- a/src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_CONVERTFULLYCONNECTEDWEIGHTS_KERNEL_H
-#define ARM_COMPUTE_CPU_CONVERTFULLYCONNECTEDWEIGHTS_KERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/cpu/ICpuKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-/** Interface to convert the 2D Fully Connected weights from NCHW to NHWC or vice versa.
- *
- * @note This function can be applied to the 2D weights used by a Fully Connected layer if:
- * - It follows a Convolution layer
- * - The data layout used by the network does not match the one the model has been trained in.
- *
- * @note This function assumes the weights are already reshaped (transposed)
- */
-class CpuConvertFullyConnectedWeightsKernel : public ICpuKernel
-{
-public:
- /** Default constructor */
- CpuConvertFullyConnectedWeightsKernel();
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuConvertFullyConnectedWeightsKernel);
- /** Set the src and dst tensor.
- *
- * @param[in] src Source weights tensor info to convert. Must be 2 dimensional. Data types supported: All.
- * @param[in] dst The converted weights tensor info. Shape and Data Type: Same as @p src.
- * @param[in] original_input_shape Shape of the original src tensor (the one entering fully connected layer).
- * @param[in] data_layout The data layout the weights have been trained in.
- */
- void configure(const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_input_shape, DataLayout data_layout);
- /** Static function to check if given info will lead to a valid configuration of @ref CpuConvertFullyConnectedWeightsKernel
- *
- * @param[in] src Source weights tensor info to convert. Must be 2 dimensional. Data types supported: All.
- * @param[in] dst The converted weights tensor info. Shape and Data Type: Same as @p src.
- * @param[in] original_input_shape Shape of the original src tensor (the one entering fully connected layer).
- * @param[in] data_layout The data layout the weights have been trained in.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_input_shape, DataLayout data_layout);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
- const char *name() const override;
-
-private:
- unsigned int _factor1; /* equal to the number of elements per original src plane if @p data_layout == NCHW; its number of channels otherwise */
- unsigned int _factor2; /* equal to the number of elements per original src plane if @p data_layout == NHWC; its number of channels otherwise */
-
- /** Template function to run the permute
- *
- * @param[in]  in     Source weights tensor to convert. Must be 2 dimensional. Data types supported: All.
- * @param[out] out    The converted weights tensor. Shape and Data Type: Same as @p in.
- * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
- */
- template <typename T>
- void run_convert_fc_weights(const ITensor *in, ITensor *out, const Window &window);
-};
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_CONVERTFULLYCONNECTEDWEIGHTS_KERNEL_H */ \ No newline at end of file
diff --git a/src/core/cpu/kernels/CpuCopyKernel.cpp b/src/core/cpu/kernels/CpuCopyKernel.cpp
deleted file mode 100644
index 8ec354b2aa..0000000000
--- a/src/core/cpu/kernels/CpuCopyKernel.cpp
+++ /dev/null
@@ -1,166 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/cpu/kernels/CpuCopyKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const PaddingList &padding = PaddingList())
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
- ARM_COMPUTE_RETURN_ERROR_ON(padding.size() > 4);
-
- // Validate destination if initialized
- if(dst->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(misc::shape_calculator::compute_padded_shape(src->tensor_shape(), padding), dst->tensor_shape());
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src, ITensorInfo *dst)
-{
- // Destination auto initialization if not yet initialized
- auto_init_if_empty(*dst, *src);
- return std::make_pair(Status{}, calculate_max_window(*dst));
-}
-
-std::pair<Status, Window> validate_and_configure_window_with_padding(const ITensorInfo *src, ITensorInfo *dst, const PaddingList &padding)
-{
- const TensorShape src_shape = src->tensor_shape();
- const TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(src_shape, padding);
- auto_init_if_empty(*dst, src->clone()->set_tensor_shape(padded_shape));
- // Configure window
- const Window win = calculate_max_window(*dst, dst->dimension(0));
- return std::make_pair(Status{}, win);
-}
-
-} // namespace
-
-void CpuCopyKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const PaddingList &padding)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, padding));
-
- _padding = padding;
-
- std::pair<Status, Window> win_config;
- if(padding.empty())
- {
- win_config = validate_and_configure_window(src, dst);
- }
- else
- {
- win_config = validate_and_configure_window_with_padding(src, dst, padding);
- }
-
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICpuKernel::configure(win_config.second);
-}
-
-Status CpuCopyKernel::validate(const arm_compute::ITensorInfo *src, const arm_compute::ITensorInfo *dst, const PaddingList &padding)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, padding));
-
- if(padding.empty())
- {
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get()).first);
- }
- else
- {
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_with_padding(src->clone().get(), dst->clone().get(), padding).first);
- }
-
- return Status{};
-}
-
-void CpuCopyKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
-
- const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
- auto dst = tensors.get_tensor(TensorType::ACL_DST);
-
- if(_padding.empty())
- {
- Window dst_window{ window };
- dst_window.set(Window::DimX, Window::Dimension(dst_window.x().start(), dst_window.x().end(), src->info()->dimension(0)));
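- // Without padding, the X dimension collapses to one step per row, so every 1D
- // slice below is copied with a single memcpy spanning the whole row.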
- Window out_slice = dst_window.first_slice_window_1D();
- do
- {
- Iterator src_it(src, out_slice);
- Iterator dst_it(dst, out_slice);
-
- execute_window_loop(out_slice, [&](const Coordinates &)
- {
- memcpy(dst_it.ptr(), src_it.ptr(), dst->info()->dimension(0) * dst->info()->element_size());
- },
- src_it, dst_it);
- }
- while(dst_window.slide_window_slice_1D(out_slice));
- }
- else
- {
- Window src_window{ window };
- src_window.set(Window::DimX, Window::Dimension(0, window.x().end() - _padding[0].first, src->info()->dimension(0)));
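- // Padded path: only the leading pad of dimension 0 shifts data within a row (the
- // dst row pointer below starts _padding[0].first elements in); padding on the
- // remaining dimensions comes from the larger dst shape the window was built over.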
-
- Iterator src_it(src, src_window);
- Iterator dst_it(dst, window);
- const size_t row_size_in_bytes = src->info()->dimension(0) * src->info()->element_size();
- execute_window_loop(window, [&](const Coordinates &)
- {
- auto dst_ptr = dst_it.ptr() + _padding[0].first * dst->info()->element_size();
- std::memcpy(dst_ptr, src_it.ptr(), row_size_in_bytes);
- },
- src_it, dst_it);
- }
-}
-
-const char *CpuCopyKernel::name() const
-{
- return "CpuCopyKernel";
-}
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/core/cpu/kernels/CpuCopyKernel.h b/src/core/cpu/kernels/CpuCopyKernel.h
deleted file mode 100644
index 98b79a964c..0000000000
--- a/src/core/cpu/kernels/CpuCopyKernel.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_COPY_KERNEL_H
-#define ARM_COMPUTE_CPU_COPY_KERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/cpu/ICpuKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-/** Kernel to perform a copy between two tensors */
-class CpuCopyKernel : public ICpuKernel
-{
-public:
- CpuCopyKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuCopyKernel);
- /** Configure kernel for a given list of arguments
- *
- * @param[in] src Source tensor. Data types supported: All
- * @param[out] dst Destination tensor. Data types supported: same as @p src.
- * @param[in] padding (Optional) Padding to be applied to the input tensor
- */
- void configure(const ITensorInfo *src, ITensorInfo *dst, const PaddingList &padding = PaddingList());
- /** Static function to check if given info will lead to a valid configuration of @ref CpuCopyKernel
- *
- * @param[in] src Source tensor. Data types supported: All
- * @param[in] dst Destination tensor. Data types supported: same as @p src.
- * @param[in] padding (Optional) Padding to be applied to the input tensor
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PaddingList &padding = PaddingList());
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
- const char *name() const override;
-
-private:
- PaddingList _padding{};
-};
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_COPY_KERNEL_H */
diff --git a/src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp b/src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp
deleted file mode 100644
index 4ddb35f2d5..0000000000
--- a/src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp
+++ /dev/null
@@ -1,919 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h"
-
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/ITensorInfo.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp"
-#include "src/core/NEON/wrapper/traits.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/ToolchainSupport.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-namespace
-{
-constexpr auto data_layout = DataLayout::NHWC;
-const size_t width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-const size_t height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-const size_t channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
-
-constexpr auto dim_manual_loop = Window::Dimension(0, 0, 0);
-constexpr auto dim_single_unit_step = Window::Dimension(0, 1, 1);
-constexpr size_t vector_size = 8;
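-// vector_size is expressed in bytes: 8 bytes is one 64-bit NEON vector, e.g. 2 fp32
-// lanes, 4 fp16 lanes or 8 lanes of an 8-bit quantized type per iteration.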
-
-struct DepthwiseConvolutionRunInfo
-{
- const size_t num_read_elements_per_iteration;
- const uint32_t x_start;
- const uint32_t x_end;
- const uint32_t x_step;
- const uint32_t x_leftover_start;
- const size_t input_stride_y;
- const size_t input_stride_z;
- const size_t input_max_offset;
- const size_t weights_width;
- const size_t weights_height;
- const size_t weights_stride_y;
- const size_t weights_stride_z;
- const size_t conv_stride_x;
- const size_t conv_stride_y;
- const size_t conv_pad_left;
- const size_t conv_pad_top;
- const size_t input_height;
- const size_t input_width;
- const size_t input_depth;
-
- DepthwiseConvolutionRunInfo(const ITensorInfo &input, const ITensorInfo &weights, const PadStrideInfo &conv_info, const Window &w, uint32_t depth_multiplier = 1) // NOLINT
- : num_read_elements_per_iteration((depth_multiplier == 1 ? (vector_size / element_size_from_data_type(input.data_type())) : 1)),
- x_start(w.x().start()),
- x_end(w.x().end()),
- x_step(static_cast<uint32_t>(num_read_elements_per_iteration * depth_multiplier)),
- x_leftover_start(std::max(static_cast<int32_t>(w.x().end()) - static_cast<int32_t>(x_step) + 1, int32_t(0))),
- input_stride_y(input.strides_in_bytes().y()),
- input_stride_z(input.strides_in_bytes().z()),
- input_max_offset(input.strides_in_bytes().z() * input.dimension(height_idx) - (input.padding().bottom + input.padding().top) * input.strides_in_bytes().y()),
- weights_width(weights.dimension(width_idx)),
- weights_height(weights.dimension(height_idx)),
- weights_stride_y(weights.strides_in_bytes().y()),
- weights_stride_z(weights.strides_in_bytes().z()),
- conv_stride_x(conv_info.stride().first),
- conv_stride_y(conv_info.stride().second),
- conv_pad_left(conv_info.pad_left()),
- conv_pad_top(conv_info.pad_top()),
- input_height(input.dimension(height_idx)),
- input_width(input.dimension(width_idx)),
- input_depth(input.dimension(channel_idx))
- {
- }
-};
-
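-// Helper implementing the implicit zero padding: a kernel tap (w, h), dilated and
-// anchored at (base_w, base_h), only contributes if it falls inside the input plane;
-// callers substitute zero (or the quantized zero-point) for out-of-range taps.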
-inline bool is_valid_input_region(int32_t base_w, uint32_t base_h, uint32_t w, uint32_t h, const DepthwiseConvolutionRunInfo &run_info, const Size2D &dilation)
-{
- const int32_t current_h = base_h + h * dilation.y();
- const bool is_valid_h = current_h >= 0 && current_h < static_cast<int32_t>(run_info.input_height);
-
- const int32_t current_w = base_w + w * dilation.x();
- const bool is_valid_w = current_w >= 0 && current_w < static_cast<int32_t>(run_info.input_width);
-
- return is_valid_h && is_valid_w;
-}
-
-template <typename T>
-void depthwise_loop_multiplier1_fp(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info,
- const Size2D &dilation, const Window &window, bool has_biases)
-{
- constexpr auto element_per_vector = vector_size / sizeof(T);
- using VectorType = typename wrapper::traits::neon_vector<T, element_per_vector>::type;
- using TagType = typename wrapper::traits::neon_vector<T, element_per_vector>::tag_type;
-
- const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window);
-
- const VectorType zero_vector = wrapper::vdup_n(static_cast<T>(0), TagType{});
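- // NHWC keeps channels contiguous in memory, so each NEON vector accumulates
- // element_per_vector neighbouring channels of one output pixel; the x loops below
- // walk the channel dimension while the window provides the spatial iteration.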
-
- Window execution_window = window;
- execution_window.set(Window::DimX, dim_single_unit_step);
-
- Window win_input = window;
- win_input.set(Window::DimX, dim_manual_loop);
- win_input.set(Window::DimY, dim_manual_loop);
- win_input.set(Window::DimZ, dim_manual_loop);
-
- Window win_weights = win_input;
- win_weights.set(Window::DimW, dim_manual_loop);
-
- Window win_output = window;
- win_output.set(Window::DimX, dim_manual_loop);
-
- Iterator input_it(src, win_input);
- Iterator weights_it(weights, win_weights);
- Iterator output_it(dst, win_output);
- Iterator biases_it{};
-
- if(has_biases)
- {
- biases_it = Iterator(biases, win_weights);
- }
-
- execute_window_loop(execution_window, [&](const Coordinates & id)
- {
- const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
- const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
- const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
-
- auto const base_weights_ptr = weights_it.ptr();
- uint32_t x = run_info.x_start;
-
- for(; x < run_info.x_leftover_start; x += run_info.x_step)
- {
- VectorType acc = zero_vector;
- auto weights_ptr = base_weights_ptr;
- int64_t input_offset = base_input_offset;
-
- for(uint32_t h = 0; h < run_info.weights_height; ++h)
- {
- int64_t offs = input_offset + x * sizeof(T);
- for(uint32_t w = 0; w < run_info.weights_width; ++w)
- {
- const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
- const auto input_vals = is_valid_region ?
- wrapper::vload(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) :
- zero_vector;
- const auto weights_vals = wrapper::vload(reinterpret_cast<T *>(weights_ptr + w * run_info.weights_stride_y) + x);
- acc = wrapper::vmla(acc, weights_vals, input_vals);
-
- offs += dilation.x() * run_info.input_stride_y;
- }
-
- weights_ptr += run_info.weights_stride_z;
- input_offset += dilation.y() * run_info.input_stride_z;
- }
-
- if(has_biases)
- {
- const auto biases_vals = wrapper::vload(reinterpret_cast<T *>(biases_it.ptr()) + x);
- acc = wrapper::vadd(acc, biases_vals);
- }
-
- wrapper::vstore(reinterpret_cast<T *>(output_it.ptr()) + x, acc);
- }
-
- for(; x < run_info.x_end; ++x)
- {
- auto acc_scalar = T{ 0 };
- auto weights_ptr = base_weights_ptr;
- int64_t input_offset = base_input_offset;
-
- for(size_t h = 0; h < run_info.weights_height; ++h)
- {
- int64_t offs = input_offset + x * sizeof(T);
- for(size_t w = 0; w < run_info.weights_width; ++w)
- {
- const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
- const auto input_vals = is_valid_region ? *reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset)) : 0;
- const auto weights_vals = *(reinterpret_cast<T *>(weights_ptr + w * run_info.weights_stride_y) + x);
-
- acc_scalar += (input_vals * weights_vals);
-
- offs += dilation.x() * run_info.input_stride_y;
- }
-
- weights_ptr += run_info.weights_stride_z;
- input_offset += dilation.y() * run_info.input_stride_z;
- }
-
- if(has_biases)
- {
- const auto biases_vals = *(reinterpret_cast<T *>(biases_it.ptr()) + x);
- acc_scalar += biases_vals;
- }
- *(reinterpret_cast<T *>(output_it.ptr()) + x) = acc_scalar;
- }
- },
- input_it, weights_it, biases_it, output_it);
-}
-
-template <typename T>
-void depthwise_loop_generic_fp(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info,
- const Size2D &dilation, unsigned int depth_multiplier, const Window &window, bool has_biases)
-{
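- // Generic path for depth_multiplier > 1: each input channel fans out to
- // depth_multiplier outputs, which breaks the channel-contiguous vectorisation of
- // the multiplier-1 path, so accumulation is done scalar-wise with one accumulator
- // per multiplier output.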
- const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier);
-
- Window execution_window = window;
- execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1));
-
- Window win_input = execution_window;
- win_input.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1));
- win_input.set(Window::DimY, dim_manual_loop);
- win_input.set(Window::DimZ, dim_manual_loop);
-
- Window win_weights = window;
- win_weights.set_dimension_step(Window::DimX, run_info.x_step);
- win_weights.set(Window::DimY, dim_manual_loop);
- win_weights.set(Window::DimZ, dim_manual_loop);
- win_weights.set(Window::DimW, dim_manual_loop);
-
- Window win_output = window;
- win_output.set_dimension_step(Window::DimX, run_info.x_step);
-
- Iterator input_it(src, win_input);
- Iterator weights_it(weights, win_weights);
- Iterator output_it(dst, win_output);
- Iterator biases_it{};
-
- if(has_biases)
- {
- biases_it = Iterator(biases, win_weights);
- }
-
- execute_window_loop(execution_window, [&](const Coordinates & id)
- {
- std::vector<T> acc(depth_multiplier, static_cast<T>(0));
-
- const int input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
- const int input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
- int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
-
- auto weights_ptr = weights_it.ptr();
- for(size_t h = 0; h < run_info.weights_height; ++h)
- {
- int64_t offs = input_offset;
- for(size_t w = 0; w < run_info.weights_width; ++w)
- {
- const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
- const auto input_val = is_valid_region ? *(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) : T(0);
-
- for(size_t m = 0; m < depth_multiplier; ++m)
- {
- const auto weights_val = *(reinterpret_cast<T *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y));
- acc.at(m) = support::cpp11::fma(weights_val, input_val, acc.at(m));
- }
-
- offs += dilation.x() * run_info.input_stride_y;
- }
-
- weights_ptr += run_info.weights_stride_z;
- input_offset += dilation.y() * run_info.input_stride_z;
- }
-
- if(has_biases)
- {
- for(size_t m = 0; m < depth_multiplier; ++m)
- {
- const auto biases_val = *(reinterpret_cast<T *>(biases_it.ptr() + m * sizeof(T)));
- *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = acc.at(m) + biases_val;
- }
- }
- else
- {
- for(size_t m = 0; m < depth_multiplier; ++m)
- {
- *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = acc.at(m);
- }
- }
- },
- input_it, weights_it, biases_it, output_it);
-}
-
-template <typename T, typename TW>
-void depthwise_loop_multiplier1_quantized(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info,
- const Size2D &dilation, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases) // NOLINT
-{
- ARM_COMPUTE_UNUSED(output_multiplier, output_shift);
- constexpr auto element_per_vector = vector_size / sizeof(T);
- using VectorType = typename wrapper::traits::neon_vector<T, element_per_vector>::type;
- using TagType = typename wrapper::traits::neon_vector<T, element_per_vector>::tag_type;
- using AccType = int32_t;
- using AccArrayType = std::array<AccType, element_per_vector>;
-
- const auto out_of_bound_value = PixelValue(static_cast<uint64_t>(0), src->info()->data_type(), src->info()->quantization_info()).get<T>();
- const auto out_of_bound_vector = wrapper::vdup_n(static_cast<T>(out_of_bound_value), TagType{});
-
- const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window);
-
- const int32_t input_qoffset = src->info()->quantization_info().uniform().offset;
- const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset;
- const int32_t output_qoffset = dst->info()->quantization_info().uniform().offset;
- const int32_t k_offset = run_info.weights_width * run_info.weights_height * input_qoffset * weights_qoffset;
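- // Offset-corrected integer dot product: the real accumulation
- // sum((q_in - z_in) * (q_w - z_w)) expands to
- // sum(q_in * q_w) - z_w * sum(q_in) - z_in * sum(q_w) + N * z_in * z_w,
- // so the loops below track the raw products plus both operand sums and apply the
- // correction at the end; k_offset is the constant N * z_in * z_w term, with N the
- // number of kernel taps.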
-
- Window execution_window = window;
- execution_window.set(Window::DimX, dim_single_unit_step);
-
- Window win_input = window;
- win_input.set(Window::DimX, dim_manual_loop);
- win_input.set(Window::DimY, dim_manual_loop);
- win_input.set(Window::DimZ, dim_manual_loop);
-
- Window win_weights = win_input;
- win_weights.set(Window::DimW, dim_manual_loop);
-
- Window win_output = window;
- win_output.set(Window::DimX, dim_manual_loop);
-
- Iterator input_it(src, win_input);
- Iterator weights_it(weights, win_weights);
- Iterator output_it(dst, win_output);
- Iterator biases_it{};
-
- if(has_biases)
- {
- biases_it = Iterator(biases, win_weights);
- }
-
- execute_window_loop(execution_window, [&](const Coordinates & id)
- {
- const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
- const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
- const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
- auto const base_weights_ptr = weights_it.ptr();
- size_t x = run_info.x_start;
-
- for(; x < run_info.x_leftover_start; x += run_info.x_step)
- {
- AccArrayType acc{};
- AccArrayType in_sum{};
- AccArrayType we_sum{};
-
- auto weights_ptr = base_weights_ptr;
- auto input_offset = base_input_offset;
-
- for(size_t h = 0; h < run_info.weights_height; ++h)
- {
- int64_t offs = input_offset + x * sizeof(T);
- for(size_t w = 0; w < run_info.weights_width; ++w)
- {
- const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
- const auto input_vals = is_valid_region ?
- wrapper::vload(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) :
- out_of_bound_vector;
- const auto weights_vals = wrapper::vload(reinterpret_cast<TW *>(weights_ptr + w * run_info.weights_stride_y) + x);
-
- for(size_t i = 0; i < element_per_vector; ++i)
- {
- acc.at(i) += input_vals[i] * weights_vals[i];
- in_sum.at(i) += input_vals[i];
- we_sum.at(i) += weights_vals[i];
- }
-
- offs += dilation.x() * run_info.input_stride_y;
- }
-
- weights_ptr += run_info.weights_stride_z;
- input_offset += dilation.y() * run_info.input_stride_z;
- }
-
- VectorType out_vals = wrapper::vdup_n(static_cast<T>(0), TagType{});
- for(size_t i = 0; i < element_per_vector; ++i)
- {
- acc.at(i) -= in_sum.at(i) * weights_qoffset;
- acc.at(i) -= we_sum.at(i) * input_qoffset;
- acc.at(i) += k_offset;
-
- if(has_biases)
- {
- acc.at(i) += *(reinterpret_cast<int32_t *>(biases_it.ptr() + i * sizeof(int32_t)) + x);
- }
-
- const int32_t out_mul = output_multiplier.at(x + i);
- const int32_t out_shift = output_shift.at(x + i);
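- // Fixed-point requantization (gemmlowp style): the float factor
- // in_scale * w_scale / out_scale was decomposed at configure time into a 31-bit
- // fixed-point multiplier and a shift, applied here as a saturating doubling
- // high multiply followed by a rounding right shift; a negative shift means the
- // accumulator is scaled up by 2^(-shift) first.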
- if(out_shift < 0)
- {
- acc.at(i) = saturating_doubling_high_mul(acc.at(i) * (1 << (-out_shift)), out_mul) + output_qoffset;
- }
- else
- {
- acc.at(i) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(i), out_mul), out_shift) + output_qoffset;
- }
- out_vals[i] = static_cast<T>(utility::clamp<AccType, T>(acc.at(i)));
- }
-
- wrapper::vstore(reinterpret_cast<T *>(output_it.ptr()) + x, out_vals);
- }
-
- // left-over
- for(; x < run_info.x_end; ++x)
- {
- AccType acc = 0;
- AccType in_sum = 0;
- AccType we_sum = 0;
-
- auto weights_ptr = base_weights_ptr;
- auto input_offset = base_input_offset;
-
- for(size_t h = 0; h < run_info.weights_height; ++h)
- {
- int64_t offs = input_offset + x * sizeof(T);
- for(size_t w = 0; w < run_info.weights_width; ++w)
- {
- const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
- const auto input_val = is_valid_region ?
- *reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset)) :
- out_of_bound_value;
- const auto weights_val = *(reinterpret_cast<TW *>(weights_ptr + w * run_info.weights_stride_y) + x);
-
- acc += input_val * weights_val;
- in_sum += input_val;
- we_sum += weights_val;
-
- offs += dilation.x() * run_info.input_stride_y;
- }
-
- weights_ptr += run_info.weights_stride_z;
- input_offset += dilation.y() * run_info.input_stride_z;
- }
-
- T out_vals{ 0 };
-
- acc -= in_sum * weights_qoffset;
- acc -= we_sum * input_qoffset;
- acc += k_offset;
-
- if(has_biases)
- {
- acc += *(reinterpret_cast<int32_t *>(biases_it.ptr()) + x);
- }
-
- const int32_t out_mul = output_multiplier.at(x);
- const int32_t out_shift = output_shift.at(x);
-
- if(out_shift < 0)
- {
- acc = saturating_doubling_high_mul(acc * (1 << (-out_shift)), out_mul) + output_qoffset;
- }
- else
- {
- acc = rounding_divide_by_exp2(saturating_doubling_high_mul(acc, out_mul), out_shift) + output_qoffset;
- }
-
- out_vals = static_cast<T>(utility::clamp<AccType, T>(acc));
- *(reinterpret_cast<T *>(output_it.ptr()) + x) = out_vals;
- }
- },
- input_it, weights_it, biases_it, output_it);
-}
-
-template <typename T, typename TW>
-void depthwise_loop_generic_quantized(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info,
- const Size2D &dilation, unsigned int depth_multiplier, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases) // NOLINT
-{
- using AccType = int32_t;
-
- const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier);
-
- const auto out_of_bound_value = PixelValue(static_cast<uint64_t>(0), src->info()->data_type(), src->info()->quantization_info()).get<T>();
-
- const int32_t input_qoffset = src->info()->quantization_info().uniform().offset;
- const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset;
- const int32_t output_qoffset = dst->info()->quantization_info().uniform().offset;
- const int32_t k_offset = run_info.weights_width * run_info.weights_height * input_qoffset * weights_qoffset;
-
- Window execution_window = window;
- execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1));
-
- Window win_input = execution_window;
- win_input.set(Window::DimY, dim_manual_loop);
- win_input.set(Window::DimZ, dim_manual_loop);
-
- Window win_weights = window;
- win_weights.set_dimension_step(Window::DimX, run_info.x_step);
- win_weights.set(Window::DimY, dim_manual_loop);
- win_weights.set(Window::DimZ, dim_manual_loop);
- win_weights.set(Window::DimW, dim_manual_loop);
-
- Window win_output = window;
- win_output.set_dimension_step(Window::DimX, run_info.x_step);
-
- Iterator input_it(src, win_input);
- Iterator weights_it(weights, win_weights);
- Iterator output_it(dst, win_output);
- Iterator biases_it{};
-
- if(has_biases)
- {
- biases_it = Iterator(biases, win_weights);
- }
-
- execute_window_loop(execution_window, [&](const Coordinates & id)
- {
- std::vector<AccType> acc(depth_multiplier, 0);
- std::vector<AccType> we_sum(depth_multiplier, 0);
- AccType in_sum = 0;
-
- const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
- const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
- int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
-
- auto weights_ptr = weights_it.ptr();
- for(size_t h = 0; h < run_info.weights_height; ++h)
- {
- int64_t offs = input_offset;
- for(size_t w = 0; w < run_info.weights_width; ++w)
- {
- const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
- const auto input_val = is_valid_region ? *(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) : out_of_bound_value;
-
- for(size_t m = 0; m < depth_multiplier; ++m)
- {
- const auto weights_val = *(reinterpret_cast<TW *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y));
- acc.at(m) += input_val * weights_val;
-
- we_sum.at(m) += weights_val;
- }
-
- offs += dilation.x() * run_info.input_stride_y;
- in_sum += input_val;
- }
-
- weights_ptr += run_info.weights_stride_z;
- input_offset += dilation.y() * run_info.input_stride_z;
- }
-
- for(size_t m = 0; m < depth_multiplier; ++m)
- {
- acc.at(m) -= in_sum * weights_qoffset;
- acc.at(m) -= we_sum.at(m) * input_qoffset;
- acc.at(m) += k_offset;
-
- if(has_biases)
- {
- acc.at(m) += *(reinterpret_cast<int32_t *>(biases_it.ptr() + m * sizeof(int32_t)));
- }
-
- const int32_t out_mul = output_multiplier.at(id.x() * depth_multiplier + m);
- const int32_t out_shift = output_shift.at(id.x() * depth_multiplier + m);
- if(out_shift < 0)
- {
- acc.at(m) = saturating_doubling_high_mul(acc.at(m) * (1 << (-out_shift)), out_mul) + output_qoffset;
- }
- else
- {
- acc.at(m) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(m), out_mul), out_shift) + output_qoffset;
- }
- *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = static_cast<T>(utility::clamp<AccType, T>(acc.at(m)));
- }
- },
- input_it, weights_it, biases_it, output_it);
-}
-
-template <typename T, typename TW>
-void depthwise_loop_pow2_quantized_per_tensor(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info,
- const Size2D &dilation, unsigned int depth_multiplier, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases) // NOLINT
-{
- constexpr int half_vec = vector_size / 2;
-
- using AccType = int32_t;
- using AccVectorType = typename wrapper::traits::neon_vector<AccType, half_vec>::type;
- using AccVectorTagType = typename wrapper::traits::neon_vector<AccType, half_vec>::tag_type;
- using TagType = typename wrapper::traits::neon_vector<T, vector_size>::tag_type;
-
- const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier);
-
- const auto input_qoffset_vec = wrapper::vreinterpret(wrapper::vmovl(wrapper::vdup_n(static_cast<T>(src->info()->quantization_info().uniform().offset), TagType{})));
- const auto weights_qoffset_vec = wrapper::vreinterpret(wrapper::vmovl(wrapper::vdup_n(static_cast<TW>(weights->info()->quantization_info().uniform().offset), TagType{})));
- const auto output_qoffset_vec = wrapper::vdup_n(dst->info()->quantization_info().uniform().offset, arm_compute::wrapper::traits::vector_128_tag{});
-
- const auto lower = wrapper::vdup_n(static_cast<AccType>(std::numeric_limits<T>::lowest()), AccVectorTagType{});
- const auto upper = wrapper::vdup_n(static_cast<AccType>(std::numeric_limits<T>::max()), AccVectorTagType{});
- const auto zero = wrapper::vdup_n(static_cast<AccType>(0), AccVectorTagType{});
-
- const auto out_mul = output_multiplier.at(0);
- const auto out_shift = output_shift.at(0);
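- // Per-tensor fast path: a single multiplier/shift pair covers every channel, and
- // depth_multiplier is assumed to be a power of two no smaller than vector_size,
- // so the accumulators can be processed in whole blocks of vector_size channels.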
-
- Window execution_window = window;
- execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1));
-
- Window win_input = execution_window;
- win_input.set(Window::DimY, dim_manual_loop);
- win_input.set(Window::DimZ, dim_manual_loop);
-
- Window win_weights = window;
- win_weights.set_dimension_step(Window::DimX, run_info.x_step);
- win_weights.set(Window::DimY, dim_manual_loop);
- win_weights.set(Window::DimZ, dim_manual_loop);
- win_weights.set(Window::DimW, dim_manual_loop);
-
- Window win_output = window;
- win_output.set_dimension_step(Window::DimX, run_info.x_step);
-
- Iterator input_it(src, win_input);
- Iterator weights_it(weights, win_weights);
- Iterator output_it(dst, win_output);
- Iterator biases_it{};
-
- if(has_biases)
- {
- biases_it = Iterator(biases, win_weights);
- }
-
- std::vector<AccVectorType> acc0(depth_multiplier / vector_size);
- std::vector<AccVectorType> acc1(depth_multiplier / vector_size);
-
- execute_window_loop(execution_window, [&](const Coordinates & id)
- {
- std::fill(begin(acc0), end(acc0), zero);
- std::fill(begin(acc1), end(acc1), zero);
-
- const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
- const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
- int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
-
- auto weights_ptr = weights_it.ptr();
- for(size_t h = 0; h < run_info.weights_height; ++h)
- {
- const int32_t current_h = input_z + h * dilation.y();
- if(current_h >= 0 && current_h < static_cast<int32_t>(run_info.input_height))
- {
- int64_t offs = input_offset;
- for(size_t w = 0; w < run_info.weights_width; ++w)
- {
- const int32_t current_w = input_y + w * dilation.x();
- if(current_w >= 0 && current_w < static_cast<int32_t>(run_info.input_width))
- {
- const auto input_8x8 = wrapper::vdup_n(*(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))), TagType{});
- const auto input_s16x8 = wrapper::vreinterpret(wrapper::vmovl(input_8x8));
- const auto input_no_offs = wrapper::vsub(input_s16x8, input_qoffset_vec);
-
- for(size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i)
- {
- const auto weights_8x8 = wrapper::vload(reinterpret_cast<TW *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y));
- const auto weights_s16x8 = wrapper::vreinterpret(wrapper::vmovl(weights_8x8));
- const auto weights_no_offs = wrapper::vsub(weights_s16x8, weights_qoffset_vec);
-
- acc0.at(i) = wrapper::vmlal(acc0.at(i), wrapper::vgetlow(input_no_offs), wrapper::vgetlow(weights_no_offs));
- acc1.at(i) = wrapper::vmlal(acc1.at(i), wrapper::vgethigh(input_no_offs), wrapper::vgethigh(weights_no_offs));
- }
- }
-
- offs += dilation.x() * run_info.input_stride_y;
- }
- }
-
- weights_ptr += run_info.weights_stride_z;
- input_offset += dilation.y() * run_info.input_stride_z;
- }
-
- for(size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i)
- {
- if(has_biases)
- {
- const auto bias_val0 = wrapper::vloadq(reinterpret_cast<int32_t *>(biases_it.ptr() + m * sizeof(int32_t)));
- const auto bias_val1 = wrapper::vloadq(reinterpret_cast<int32_t *>(biases_it.ptr() + (m + half_vec) * sizeof(int32_t)));
-
- acc0.at(i) = wrapper::vadd(acc0.at(i), bias_val0);
- acc1.at(i) = wrapper::vadd(acc1.at(i), bias_val1);
- }
-
- if(out_shift < 0)
- {
- acc0.at(i) = wrapper::vadd(saturating_doubling_high_mul(acc0.at(i) * (1 << (-out_shift)), out_mul), output_qoffset_vec);
- acc1.at(i) = wrapper::vadd(saturating_doubling_high_mul(acc1.at(i) * (1 << (-out_shift)), out_mul), output_qoffset_vec);
- }
- else
- {
- acc0.at(i) = wrapper::vadd(rounding_divide_by_exp2(saturating_doubling_high_mul(acc0.at(i), out_mul), out_shift), output_qoffset_vec);
- acc1.at(i) = wrapper::vadd(rounding_divide_by_exp2(saturating_doubling_high_mul(acc1.at(i), out_mul), out_shift), output_qoffset_vec);
- }
-
- acc0.at(i) = wrapper::vmin(wrapper::vmax(acc0.at(i), lower), upper);
- acc1.at(i) = wrapper::vmin(wrapper::vmax(acc1.at(i), lower), upper);
-
- const auto out_val = wrapper::vcombine(wrapper::vmovn(acc0.at(i)),
- wrapper::vmovn(acc1.at(i)));
-
- if(std::is_same<T, uint8_t>::value)
- {
- wrapper::vstore(reinterpret_cast<uint8_t *>(output_it.ptr() + m * sizeof(uint8_t)), wrapper::vqmovn(vreinterpretq_u16_s16(out_val)));
- }
- else
- {
- wrapper::vstore(reinterpret_cast<int8_t *>(output_it.ptr() + m * sizeof(int8_t)), wrapper::vqmovn(out_val));
- }
- }
- },
- input_it, weights_it, biases_it, output_it);
-}
-
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
- ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON(info.depth_multiplier == 0);
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(1) + (weights->dimension(1) - 1) * (info.dilation.x() - 1) > src->dimension(1) + info.pad_stride_info.pad_left() + info.pad_stride_info.pad_right());
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) + (weights->dimension(2) - 1) * (info.dilation.y() - 1) > src->dimension(2) + info.pad_stride_info.pad_top() + info.pad_stride_info.pad_bottom());
- ARM_COMPUTE_RETURN_ERROR_ON((src->dimension(0) * info.depth_multiplier) != weights->dimension(0));
- ARM_COMPUTE_RETURN_ERROR_ON((info.dilation.x() < 1) || (info.dilation.y() < 1));
- ARM_COMPUTE_RETURN_ERROR_ON((info.pad_stride_info.stride().first < 1) || (info.pad_stride_info.stride().second < 1));
-
- if(is_data_type_quantized_per_channel(weights->data_type()))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL);
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->quantization_info().scale().size());
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights);
- }
-
- if(biases != nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(0));
-
- if(is_data_type_quantized_asymmetric(src->data_type()))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
- }
- }
-
- if(dst->total_size() != 0)
- {
- const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), output_shape);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
- }
-
- return Status{};
-}
-} // namespace
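
The two dilation checks in validate_arguments() compare the dilated kernel extent against the padded input: a kernel of size k sampled with dilation d spans k + (k - 1) * (d - 1) input elements. A minimal sketch of that arithmetic (the helper name is ours):

constexpr int dilated_extent(int k, int d)
{
    return k + (k - 1) * (d - 1);
}
static_assert(dilated_extent(3, 2) == 5, "a 3x3 kernel at dilation 2 covers a 5x5 window");
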
-
-CpuDepthwiseConv2dNativeKernel::CpuDepthwiseConv2dNativeKernel()
- : _func(), _conv_info(), _depth_multiplier(1), _dilation(), _output_multiplier(), _output_shift(), _has_biases()
-{
-}
-
-void CpuDepthwiseConv2dNativeKernel::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, biases, dst, info));
-
- _conv_info = info.pad_stride_info;
- _depth_multiplier = info.depth_multiplier;
- _dilation = info.dilation;
- _has_biases = (biases != nullptr);
-
- if(is_data_type_quantized(src->data_type()))
- {
- const auto input_scale = src->quantization_info().uniform().scale;
- const auto output_scale = dst->quantization_info().uniform().scale;
-
- auto weights_scale = weights->quantization_info().scale();
- if(!is_data_type_quantized_per_channel(weights->data_type()))
- {
- for(size_t i = 1; i < weights->dimension(channel_idx); ++i)
- {
- weights_scale.push_back(weights_scale.front());
- }
- }
-
- for(const auto &s : weights_scale)
- {
- int32_t out_mult = 0;
- int32_t out_shift = 0;
- const float multiplier = input_scale * s / output_scale;
- arm_compute::quantization::calculate_quantized_multiplier(multiplier, &out_mult, &out_shift);
-
- _output_multiplier.push_back(out_mult);
- _output_shift.push_back(out_shift);
- }
- }
-
- switch(weights->data_type())
- {
- case DataType::QASYMM8:
- _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise<uint8_t, uint8_t>;
- break;
- case DataType::QASYMM8_SIGNED:
- _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise<int8_t, int8_t>;
- break;
- case DataType::QSYMM8_PER_CHANNEL:
- if(src->data_type() == DataType::QASYMM8)
- {
- _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise<uint8_t, int8_t>;
- }
- else
- {
- _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise<int8_t, int8_t>;
- }
- break;
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise<float16_t, float16_t>;
- break;
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F32:
- _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise<float, float>;
- break;
- default:
- ARM_COMPUTE_ERROR("Data type not supported");
- break;
- }
-
- const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info);
- auto_init_if_empty(*dst, src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_quantization_info(dst->quantization_info()));
-
- Window win = calculate_max_window(*dst, Steps());
- ICpuKernel::configure(win);
-}
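
configure() folds the input, weights and output scales into one real multiplier per channel and asks calculate_quantized_multiplier() to split it into a Q0.31 multiplier and a shift, so that multiplier ~= out_mult * 2^-31 * 2^-out_shift. A sketch of that decomposition (decompose_multiplier is a hypothetical name; the real helper also range-checks its inputs):

#include <cmath>
#include <cstdint>

void decompose_multiplier(float multiplier, int32_t *out_mult, int32_t *out_shift)
{
    int exp = 0;
    const float q = std::frexp(multiplier, &exp); // multiplier = q * 2^exp, q in [0.5, 1)
    *out_shift = -exp;                            // the kernel applies a right shift
    int64_t q_fixed = static_cast<int64_t>(std::llround(q * (1ll << 31)));
    if(q_fixed == (1ll << 31))                    // q rounded up to exactly 1.0
    {
        q_fixed /= 2;
        --*out_shift;
    }
    *out_mult = static_cast<int32_t>(q_fixed);    // Q0.31 value in [2^30, 2^31)
}
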
-
-Status CpuDepthwiseConv2dNativeKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, info));
- return Status{};
-}
-
-template <typename T, typename TW, CpuDepthwiseConv2dNativeKernel::FloatEnabler<T>>
-void CpuDepthwiseConv2dNativeKernel::run_depthwise(const ITensor *src, const ITensor *weights, const ITensor *biases,
- ITensor *dst, const Window &window, bool has_biases)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
-
- if(_depth_multiplier == 1)
- {
- depthwise_loop_multiplier1_fp<T>(src, weights, biases, dst, _conv_info, _dilation, window, has_biases);
- }
- else
- {
- depthwise_loop_generic_fp<T>(src, weights, biases, dst, _conv_info, _dilation, _depth_multiplier, window, has_biases);
- }
-}
-
-template <typename T, typename TW, CpuDepthwiseConv2dNativeKernel::Quantized8bitEnabler<T>>
-void CpuDepthwiseConv2dNativeKernel::run_depthwise(const ITensor *src, const ITensor *weights, const ITensor *biases,
- ITensor *dst, const Window &window, bool has_biases)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
-
- if(_depth_multiplier == 1)
- {
- depthwise_loop_multiplier1_quantized<T, TW>(src, weights, biases, dst, _conv_info, _dilation, _output_multiplier, _output_shift, window, has_biases);
- }
- else
- {
- const bool is_pow2 = ((_depth_multiplier & (_depth_multiplier - 1)) == 0);
- const bool is_quantized_per_tensor = !(is_data_type_quantized_per_channel(weights->info()->data_type()));
-
- if(is_pow2 && is_quantized_per_tensor && _depth_multiplier >= 8)
- {
- depthwise_loop_pow2_quantized_per_tensor<T, TW>(src, weights, biases, dst, _conv_info, _dilation, _depth_multiplier, _output_multiplier, _output_shift, window, has_biases);
- }
- else
- {
- depthwise_loop_generic_quantized<T, TW>(src, weights, biases, dst, _conv_info, _dilation, _depth_multiplier, _output_multiplier, _output_shift, window, has_biases);
- }
- }
-}
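
The specialised path is only taken for per-tensor quantization with a power-of-two depth multiplier of at least 8. The bit trick in the dispatch above is the standard power-of-two test; it is valid here because validate_arguments() rejects a zero depth multiplier:

constexpr bool is_pow2(unsigned int m)
{
    return (m & (m - 1)) == 0;
}
static_assert(is_pow2(8) && !is_pow2(12), "a depth multiplier of 12 takes the generic path");
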
-
-void CpuDepthwiseConv2dNativeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
- ARM_COMPUTE_ERROR_ON(_func == nullptr);
-
- const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
- const auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
- const auto biases = tensors.get_const_tensor(TensorType::ACL_SRC_2);
- auto dst = tensors.get_tensor(TensorType::ACL_DST);
- (this->*_func)(src, weights, biases, dst, window, _has_biases);
-}
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h b/src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h
deleted file mode 100644
index 559c46dc93..0000000000
--- a/src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_DEPTHWISECONV2DNATIVEKERNEL_H
-#define ARM_COMPUTE_CPU_DEPTHWISECONV2DNATIVEKERNEL_H
-
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/common/Macros.h"
-#include "src/core/cpu/ICpuKernel.h"
-#include "support/Requires.h"
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-#include <arm_neon.h>
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-/** Interface for the kernel to run a depthwise convolution native on a tensor. */
-class CpuDepthwiseConv2dNativeKernel : public ICpuKernel
-{
-public:
- const char *name() const override
- {
- return "CpuDepthwiseConv2dNativeKernel";
- }
- /** Default constructor */
- CpuDepthwiseConv2dNativeKernel();
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDepthwiseConv2dNativeKernel);
-
- /** Initialize the function's source, destination and parameters.
- *
- * @note Supported data layouts: NHWC
- *
- * @param[in] src Source tensor. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] weights Weights tensor. This is a 3D tensor with dimensions [IFM, W, H].
- * Data type supported: Same as @p src or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p src is QASYMM8/QASYMM8_SIGNED.
- * @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED.
- * @param[out] dst Destination tensor. Data type supported: Same as @p src.
- * @param[in] info Depthwise convolution meta-data.
- *
- */
- void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to CpuDepthwiseConv2dNativeKernel::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
-
-private:
- template <typename T>
- using FloatEnabler = typename std::enable_if<arm_compute::utils::traits::is_floating_point<T>::value, int>::type;
-
- template <typename T, typename TW, FloatEnabler<T> = 0>
- void run_depthwise(const ITensor *src, const ITensor *weights, const ITensor *bias, ITensor *dst, const Window &window, bool has_biases);
-
- template <typename T>
- using Quantized8bitEnabler = typename std::enable_if<std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value, int>::type;
-
- template <typename T, typename TW, Quantized8bitEnabler<T> = 0>
- void run_depthwise(const ITensor *src, const ITensor *weights, const ITensor *bias, ITensor *dst, const Window &window, bool has_biases);
-
- /** Common signature for all the specialised depthwise convolution native functions
- *
- * @param[in] window Region on which to execute the kernel.
- */
- using DepthwiseFunctionPtr = void (CpuDepthwiseConv2dNativeKernel::*)(const ITensor *src, const ITensor *weights, const ITensor *bias, ITensor *dst, const Window &window, bool has_biases);
-
- DepthwiseFunctionPtr _func;
- PadStrideInfo _conv_info;
- unsigned int _depth_multiplier;
- Size2D _dilation;
- std::vector<int> _output_multiplier;
- std::vector<int> _output_shift;
- bool _has_biases;
-};
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_DEPTHWISECONV2DNATIVEKERNEL_H */
diff --git a/src/core/cpu/kernels/CpuDequantizeKernel.cpp b/src/core/cpu/kernels/CpuDequantizeKernel.cpp
deleted file mode 100644
index 42b5439697..0000000000
--- a/src/core/cpu/kernels/CpuDequantizeKernel.cpp
+++ /dev/null
@@ -1,400 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/cpu/kernels/CpuDequantizeKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/NEAsymm.h"
-#include "src/core/NEON/NESymm.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::QSYMM8, DataType::QSYMM16);
-
- if(dst->tensor_shape().total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(dst);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
- }
-
- return Status{};
-}
-
-template <typename T>
-inline void store_result(T *ptr, const float32x4x4_t &v)
-{
- ARM_COMPUTE_UNUSED(ptr, v);
-}
-
-template <>
-inline void store_result<float>(float *ptr, const float32x4x4_t &v)
-{
- wrapper::vstore(ptr, v.val[0]);
- wrapper::vstore(ptr + 4, v.val[1]);
- wrapper::vstore(ptr + 8, v.val[2]);
- wrapper::vstore(ptr + 12, v.val[3]);
-}
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template <>
-inline void store_result<float16_t>(float16_t *ptr, const float32x4x4_t &v)
-{
- wrapper::vstore(ptr, vcombine_f16(vcvt_f16_f32(v.val[0]), vcvt_f16_f32(v.val[1])));
- wrapper::vstore(ptr + 8, vcombine_f16(vcvt_f16_f32(v.val[2]), vcvt_f16_f32(v.val[3])));
-}
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
-template <typename T>
-inline void store_result(T *ptr, const float32x4x2_t &v)
-{
- ARM_COMPUTE_UNUSED(ptr, v);
-}
-
-template <>
-inline void store_result<float>(float *ptr, const float32x4x2_t &v)
-{
- wrapper::vstore(ptr, v.val[0]);
- wrapper::vstore(ptr + 4, v.val[1]);
-}
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template <>
-inline void store_result<float16_t>(float16_t *ptr, const float32x4x2_t &v)
-{
- wrapper::vstore(ptr, vcombine_f16(vcvt_f16_f32(v.val[0]), vcvt_f16_f32(v.val[1])));
-}
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
-template <typename TOut, typename TIn>
-void run_dequantization_qasymm8(const ITensor *input, ITensor *output, const Window &window)
-{
- const UniformQuantizationInfo &qinfo = input->info()->quantization_info().uniform();
- const float scale = qinfo.scale;
- const int32_t offset = qinfo.offset;
-
- const int window_step_x = 16;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- // Collapse window and reset first dimension to handle tail calculations manually
- Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
- win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- // Create iterators
- Iterator in(input, win_collapsed);
- Iterator out(output, win_collapsed);
-
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto in_ptr = reinterpret_cast<const TIn *>(in.ptr());
- const auto out_ptr = reinterpret_cast<TOut *>(out.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto vin = wrapper::vloadq(in_ptr + x);
- const auto vdeq = vdequantize(vin, scale, offset);
-
- store_result(reinterpret_cast<TOut *>(out_ptr + x), vdeq);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- auto val = *(in_ptr + x);
- *(out_ptr + x) = static_cast<TOut>(Qasymm8QuantizationHelper<TIn>::dequantize(val, qinfo));
- }
- },
- in, out);
-}
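
The tail loop above falls back to scalar dequantization, which for asymmetric 8-bit data is real = scale * (q - offset). A minimal reference (the name is ours, not the library helper):

#include <cstdint>

inline float dequantize_qasymm8_ref(uint8_t q, float scale, int32_t offset)
{
    return scale * static_cast<float>(static_cast<int32_t>(q) - offset);
}
// e.g. with scale = 0.5f and offset = 128: q = 130 -> 1.0f, q = 128 -> 0.0f
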
-
-template <typename T>
-void run_dequantization_qsymm8_per_channel_nchw(const ITensor *input, ITensor *output, const Window &window)
-{
- const auto scale = input->info()->quantization_info().scale();
-
- const int window_step_x = 16;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- // Reset first dimension to handle tail calculations manually
- Window win(window);
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- // Create iterators
- Iterator in(input, win);
- Iterator out(output, win);
-
- execute_window_loop(win, [&](const Coordinates & id)
- {
- const auto in_ptr = reinterpret_cast<const int8_t *>(in.ptr());
- const auto out_ptr = reinterpret_cast<T *>(out.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto vin = wrapper::vloadq(in_ptr + x);
- const auto vdeq = vdequantize(vin, scale[id.z()]);
-
- store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- int8_t val = *(in_ptr + x);
- *(out_ptr + x) = static_cast<T>(dequantize(val, scale[id.z()]));
- }
- },
- in, out);
-}
-
-template <typename T>
-void run_dequantization_qsymm8_per_channel_nhwc(const ITensor *input, ITensor *output, const Window &window)
-{
- const auto scale = input->info()->quantization_info().scale();
-
- const int window_step_x = 16;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- // Reset first dimension to handle tail calculations manually
- Window win(window);
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- // Create iterators
- Iterator in(input, win);
- Iterator out(output, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto in_ptr = reinterpret_cast<const int8_t *>(in.ptr());
- const auto out_ptr = reinterpret_cast<T *>(out.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const float32x4x4_t vscale =
- {
- {
- scale[x + 0], scale[x + 1], scale[x + 2], scale[x + 3],
- scale[x + 4], scale[x + 5], scale[x + 6], scale[x + 7],
- scale[x + 8], scale[x + 9], scale[x + 10], scale[x + 11],
- scale[x + 12], scale[x + 13], scale[x + 14], scale[x + 15]
- }
- };
- const auto vin = wrapper::vloadq(in_ptr + x);
- const auto vdeq = vdequantize(vin, vscale);
-
- store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- int8_t val = *(in_ptr + x);
- *(out_ptr + x) = static_cast<T>(dequantize(val, scale[x]));
- }
- },
- in, out);
-}
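
The difference between the two per-channel variants is only where the channel dimension lives: NCHW keeps channels along Z, so one scale covers a whole plane (scale[id.z()]), while NHWC keeps channels innermost, so the scale varies with x. A scalar sketch of the two lookups over one hypothetical row (helper names are ours):

#include <cstdint>
#include <vector>

void dequantize_row_nchw_ref(const int8_t *in, float *out, float channel_scale, int len)
{
    for(int x = 0; x < len; ++x)
    {
        out[x] = channel_scale * in[x]; // NCHW: one scale per plane, picked via scale[id.z()]
    }
}

void dequantize_row_nhwc_ref(const int8_t *in, float *out, const std::vector<float> &scale, int len)
{
    for(int x = 0; x < len; ++x)
    {
        out[x] = scale[x] * in[x]; // NHWC: channels are innermost, so the scale varies with x
    }
}
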
-
-template <typename T>
-void run_dequantization_qsymm8(const ITensor *input, ITensor *output, const Window &window)
-{
- const UniformQuantizationInfo &qinfo = input->info()->quantization_info().uniform();
- const float scale = qinfo.scale;
-
- const int window_step_x = 16;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- // Collapse window and reset first dimension to handle tail calculations manually
- Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
- win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- // Create iterators
- Iterator in(input, win_collapsed);
- Iterator out(output, win_collapsed);
-
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto in_ptr = reinterpret_cast<const int8_t *>(in.ptr());
- const auto out_ptr = reinterpret_cast<T *>(out.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto vin = wrapper::vloadq(in_ptr + x);
- const auto vdeq = vdequantize(vin, scale);
-
- store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- int8_t val = *(in_ptr + x);
- *(out_ptr + x) = static_cast<T>(dequantize(val, scale));
- }
- },
- in, out);
-}
-
-template <typename T>
-void run_dequantization_qsymm16(const ITensor *input, ITensor *output, const Window &window)
-{
- const UniformQuantizationInfo &qinfo = input->info()->quantization_info().uniform();
- const float scale = qinfo.scale;
-
- const int window_step_x = 8;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- // Collapse window and reset first dimension to handle tail calculations manually
- Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
- win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- // Create iterators
- Iterator in(input, win_collapsed);
- Iterator out(output, win_collapsed);
-
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto in_ptr = reinterpret_cast<const int16_t *>(in.ptr());
- const auto out_ptr = reinterpret_cast<T *>(out.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto vin = wrapper::vloadq(in_ptr + x);
- const auto vdeq = vdequantize_int16(vin, scale);
-
- store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- int16_t val = *(in_ptr + x);
- *(out_ptr + x) = static_cast<T>(dequantize_qsymm16(val, scale));
- }
- },
- in, out);
-}
-
-template <typename T>
-void run_dequantization_core(const ITensor *input, ITensor *output, const Window &window)
-{
- switch(input->info()->data_type())
- {
- case DataType::QASYMM8:
- run_dequantization_qasymm8<T, uint8_t>(input, output, window);
- break;
- case DataType::QASYMM8_SIGNED:
- run_dequantization_qasymm8<T, int8_t>(input, output, window);
- break;
- case DataType::QSYMM8_PER_CHANNEL:
- input->info()->data_layout() == DataLayout::NHWC ? run_dequantization_qsymm8_per_channel_nhwc<T>(input, output, window) : run_dequantization_qsymm8_per_channel_nchw<T>(input, output, window);
- break;
- case DataType::QSYMM8:
- run_dequantization_qsymm8<T>(input, output, window);
- break;
- case DataType::QSYMM16:
- run_dequantization_qsymm16<T>(input, output, window);
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported data type.");
- }
-}
-} // namespace
-
-void CpuDequantizeKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
-{
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst));
-
- // Configure kernel window
- Window win = calculate_max_window(*src, Steps());
-
- // Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*dst, src->tensor_shape(), 1, DataType::F32);
-
- ICpuKernel::configure(win);
-}
-
-Status CpuDequantizeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst));
- return Status{};
-}
-
-void CpuDequantizeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
-
- const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
- auto dst = tensors.get_tensor(TensorType::ACL_DST);
-
- switch(dst->info()->data_type())
- {
- case DataType::F32:
- run_dequantization_core<float>(src, dst, window);
- break;
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- run_dequantization_core<float16_t>(src, dst, window);
- break;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- default:
- ARM_COMPUTE_ERROR("Unsupported data type.");
- }
-}
-const char *CpuDequantizeKernel::name() const
-{
- return "CpuDequantizeKernel";
-}
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/core/cpu/kernels/CpuDequantizeKernel.h b/src/core/cpu/kernels/CpuDequantizeKernel.h
deleted file mode 100644
index 798f32cec7..0000000000
--- a/src/core/cpu/kernels/CpuDequantizeKernel.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_DEQUANTIZE_KERNEL_H
-#define ARM_COMPUTE_CPU_DEQUANTIZE_KERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/cpu/ICpuKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-/** Interface for the dequantization layer kernel. */
-class CpuDequantizeKernel : public ICpuKernel
-{
-public:
- /** Default constructor */
- CpuDequantizeKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDequantizeKernel);
- /** Set input, output tensors.
- *
- * @param[in] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
- * @param[out] dst Destination tensor info with the same dimensions of input. Data type supported: F16/F32.
- */
- void configure(const ITensorInfo *src, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref CpuDequantizeKernel::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
- const char *name() const override;
-};
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_DEQUANTIZE_KERNEL_H */
diff --git a/src/core/cpu/kernels/CpuDirectConv2dKernel.cpp b/src/core/cpu/kernels/CpuDirectConv2dKernel.cpp
deleted file mode 100644
index c0fc41525e..0000000000
--- a/src/core/cpu/kernels/CpuDirectConv2dKernel.cpp
+++ /dev/null
@@ -1,1385 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/cpu/kernels/CpuDirectConv2dKernel.h"
-
-#include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/NEFixedPoint.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <algorithm>
-
-using namespace arm_compute::detail;
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-namespace
-{
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template <unsigned int stridex>
-float16x8_t internal_vld1q(const float16_t *in);
-
-template <>
-float16x8_t internal_vld1q<1>(const float16_t *in)
-{
- return vld1q_f16(in);
-}
-
-template <>
-float16x8_t internal_vld1q<2>(const float16_t *in)
-{
- const float16x8x2_t tmp = vld2q_f16(in);
- return tmp.val[0];
-}
-
-template <>
-float16x8_t internal_vld1q<3>(const float16_t *in)
-{
- const float16x8x3_t tmp = vld3q_f16(in);
- return tmp.val[0];
-}
-
-inline float16x8_t internal_vdupq_n(float16_t v)
-{
- return vdupq_n_f16(v);
-}
-
-inline void internal_vst1q(float16_t *p, const float16x8_t &v)
-{
- vst1q_f16(p, v);
-}
-
-float16x8_t internal_vmull(const float16x8_t &x, const float16x8_t &y)
-{
- return vmulq_f16(x, y);
-}
-
-inline float16x8_t internal_vmlal(const float16x8_t &x, const float16x8_t &y, const float16x8_t &z)
-{
- return vaddq_f16(x, vmulq_f16(y, z));
-}
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
-template <unsigned int stridex>
-float32x4_t internal_vld1q(const float *in);
-
-template <>
-float32x4_t internal_vld1q<1>(const float *in)
-{
- return vld1q_f32(in);
-}
-
-template <>
-float32x4_t internal_vld1q<2>(const float *in)
-{
- const float32x4x2_t tmp = vld2q_f32(in);
- return tmp.val[0];
-}
-
-template <>
-float32x4_t internal_vld1q<3>(const float *in)
-{
- const float32x4x3_t tmp = vld3q_f32(in);
- return tmp.val[0];
-}
-
-inline float32x4_t internal_vdupq_n(float v)
-{
- return vdupq_n_f32(v);
-}
-
-inline void internal_vst1q(float *p, const float32x4_t &v)
-{
- vst1q_f32(p, v);
-}
-
-float32x4_t internal_vmull(const float32x4_t &x, const float32x4_t &y)
-{
- return vmulq_f32(x, y);
-}
-
-inline float32x4_t internal_vmlal(const float32x4_t &x, const float32x4_t &y, const float32x4_t &z)
-{
- return vmlaq_f32(x, y, z);
-}
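
The internal_vld1q<stridex> specialisations use de-interleaving loads (vld2q/vld3q) so that .val[0] holds every stridex-th element, letting one output-loop body serve strides 1 to 3. In scalar terms (a sketch, not the NEON code):

#include <array>

template <unsigned int stridex>
std::array<float, 4> strided_load_ref(const float *in)
{
    std::array<float, 4> out{};
    for(unsigned int i = 0; i < 4; ++i)
    {
        out[i] = in[i * stridex]; // lanes 0, stridex, 2*stridex, 3*stridex
    }
    return out;
}
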
-
-constexpr int small_tensor_size_optim = 8;
-inline bool run_optim_small_tensor_info(const ITensorInfo *t)
-{
- return t->dimension(Window::DimX) <= small_tensor_size_optim && t->dimension(Window::DimY) <= small_tensor_size_optim;
-}
-
-inline bool run_optim_small_tensor(const ITensor *t)
-{
- return run_optim_small_tensor_info(t->info());
-}
-
-// Optimized convolver for 1x1 kernels used only where input width and height are both <= 8
-// For big Z as in Input=7x7x832, this implementation is faster than the general code because it doesn't need to
-// store intermediate results in memory. Temporary results are stored in SIMD registers directly and then written to the output buffer.
-template <unsigned int stridex>
-class convolver_w1x1_i8x8_f32
-{
-public:
- static void convolve(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
- {
- ARM_COMPUTE_ERROR_ON(src->info()->dimension(Window::DimX) > small_tensor_size_optim);
- ARM_COMPUTE_ERROR_ON(src->info()->dimension(Window::DimY) > small_tensor_size_optim);
-
- const int input_stride_x = src->info()->strides_in_bytes().x();
- const int input_stride_y = src->info()->strides_in_bytes().y();
- const int input_stride_z = src->info()->strides_in_bytes().z();
- const int output_stride_y = dst->info()->strides_in_bytes().y();
- const int output_stride_z = dst->info()->strides_in_bytes().z();
- const int kernel_stride_z = weights->info()->strides_in_bytes().z();
- const int kernel_stride_w = weights->info()->strides_in_bytes()[3];
- const int output_h = dst->info()->dimension(1);
- const int range_z = window.z().end() - window.z().start();
- const int kernel_depth = weights->info()->dimension(Window::DimZ);
- const unsigned int conv_stride_y = std::get<1>(conv_info.stride());
- const unsigned int conv_pad_left = conv_info.pad_left();
- const unsigned int conv_pad_top = conv_info.pad_top();
-
- // setup output window for the iterator
- Window window_out = window;
- window_out.set(Window::DimX, Window::Dimension(0, dst->info()->dimension(Window::DimX), dst->info()->dimension(Window::DimX)));
- window_out.set(Window::DimY, Window::Dimension(0, dst->info()->dimension(Window::DimY), dst->info()->dimension(Window::DimY)));
- window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), range_z));
-
- // setup input window for the iterator
- Window window_in = window;
- // we just want execute_window_loop to iterate over the higher dimensions (>3), so we set the first 3 dimensions to 0
- window_in.set(Window::DimX, Window::Dimension(0, 0, 0));
- window_in.set(Window::DimY, Window::Dimension(0, 0, 0));
- window_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
- Window window_k = calculate_max_window(*weights->info(), Steps(1u));
- Iterator out(dst, window_out);
- Iterator in(src, window_in);
- Iterator k(weights, window_k);
-
- const uint8_t *k_ptr = k.ptr();
-
- execute_window_loop(window_out, [&](const Coordinates & id)
- {
- const uint8_t *input_ptr = in.ptr() - conv_pad_left * input_stride_x - conv_pad_top * input_stride_y;
- uint8_t *out_ptr = out.ptr();
- int ih = 0;
- int oh = 0;
- std::array<float32x4_t, 8> accum0 = { vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0) };
- std::array<float32x4_t, 8> accum1 = { vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0) };
- for(int oz = 0; oz < range_z; ++oz)
- {
- accum0[0] = accum0[1] = accum0[2] = accum0[3] = accum0[4] = accum0[5] = accum0[6] = accum0[7] = vdupq_n_f32(0.f);
- accum1[0] = accum1[1] = accum1[2] = accum1[3] = accum1[4] = accum1[5] = accum1[6] = accum1[7] = vdupq_n_f32(0.f);
- auto p_out_base = out_ptr + oz * output_stride_z;
- for(int p = 0; p < kernel_depth; ++p)
- {
- const auto k_val = reinterpret_cast<const float *>(k_ptr + p * kernel_stride_z + (id.z() + oz) * kernel_stride_w);
- const auto vk0 = internal_vdupq_n(*k_val);
- for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
- {
- const int offset_xy = ih * input_stride_y;
- auto in_val = reinterpret_cast<const float *>(input_ptr + p * input_stride_z + offset_xy);
- auto v_in0 = internal_vld1q<stridex>(in_val);
- auto v_in1 = internal_vld1q<stridex>(in_val + 4);
- accum0[oh] = vmlaq_f32(accum0[oh], vk0, v_in0);
- accum1[oh] = vmlaq_f32(accum1[oh], vk0, v_in1);
- }
- }
- for(oh = 0; oh < output_h; ++oh)
- {
- auto p_out = reinterpret_cast<float *>(p_out_base + oh * output_stride_y);
- vst1q_f32(p_out, accum0[oh]);
- vst1q_f32(p_out + 4, accum1[oh]);
- }
- }
- },
- in, out);
- }
-};
-
-template <typename T1, typename T2, unsigned int stridex>
-class convolver_1x1
-{
-public:
- static void convolve(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
- const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
- {
- const int input_stride_x = src->info()->strides_in_bytes().x();
- const int input_stride_y = src->info()->strides_in_bytes().y();
- const int input_stride_z = src->info()->strides_in_bytes().z();
- const int output_stride_y = dst->info()->strides_in_bytes().y();
- const int output_stride_z = dst->info()->strides_in_bytes().z();
- const int kernel_stride_z = weights->info()->strides_in_bytes().z();
- const int kernel_stride_w = weights->info()->strides_in_bytes()[3];
- const int output_w = dst->info()->dimension(0);
- const int output_h = dst->info()->dimension(1);
- const int range_z = window.z().end() - window.z().start();
- const int kernel_depth = weights->info()->dimension(Window::DimZ);
- const unsigned int conv_stride_y = std::get<1>(conv_info.stride());
- const unsigned int conv_pad_left = conv_info.pad_left();
- const unsigned int conv_pad_top = conv_info.pad_top();
-
- // setup output window for the iterator
- Window window_out = window;
- window_out.set(Window::DimX, Window::Dimension(0, dst->info()->dimension(Window::DimX), dst->info()->dimension(Window::DimX)));
- window_out.set(Window::DimY, Window::Dimension(0, dst->info()->dimension(Window::DimY), dst->info()->dimension(Window::DimY)));
- window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), range_z));
-
- // setup input window for the iterator
- Window window_in = window;
- // we just want execute_window_loop to iterate over the higher dimensions (>3), so we set the first 3 dimensions to 0
- window_in.set(Window::DimX, Window::Dimension(0, 0, 0));
- window_in.set(Window::DimY, Window::Dimension(0, 0, 0));
- window_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
- Window window_k = calculate_max_window(*weights->info(), Steps(1u));
- Iterator out(dst, window_out);
- Iterator in(src, window_in);
- Iterator k(weights, window_k);
-
- const uint8_t *k_ptr = k.ptr();
-
- execute_window_loop(window_out, [&](const Coordinates & id)
- {
- /*
- For a detailed explanation of how the algorithm works, refer to the convolver_3x3 class.
- */
- const uint8_t *input_ptr = in.ptr() - conv_pad_left * input_stride_x - conv_pad_top * input_stride_y;
- uint8_t *out_ptr = out.ptr();
- int ih = 0;
- int oh = 0;
- for(int oz = 0; oz < range_z; ++oz)
- {
- auto p_out_base = out_ptr + oz * output_stride_z;
- // Step 1
- {
- const auto k_val = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + (id.z() + oz) * kernel_stride_w);
- const auto vk = internal_vdupq_n(*k_val);
- for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
- {
- const int offset_xy = ih * input_stride_y;
- auto in_val = reinterpret_cast<const T1 *>(input_ptr + (0 * input_stride_z + offset_xy));
- auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
- for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, in_val += num_elems_read_per_iteration, p_out += num_elems_written_per_iteration)
- {
- internal_vst1q(p_out, internal_vmull(vk, internal_vld1q<stridex>(in_val)));
- }
- }
- }
-
- // Step 2
- for(int p = 1; p < kernel_depth; ++p)
- {
- const auto k_val = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + (id.z() + oz) * kernel_stride_w);
- const auto vk = internal_vdupq_n(*k_val);
- for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
- {
- const int offset_xy = ih * input_stride_y;
- auto in_val = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + offset_xy);
- auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
- for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, in_val += num_elems_read_per_iteration, p_out += num_elems_written_per_iteration)
- {
- internal_vst1q(p_out, internal_vmlal(internal_vld1q<1>(p_out), vk, internal_vld1q<stridex>(in_val)));
- }
- }
- }
- }
- },
- in, out);
- }
-};
-
-template <unsigned int stridex>
-float32x4x2_t convolve_5x5(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4,
- const float *m0, const float *m1, const float *m2, const float *m3, const float *m4);
-
-inline float32x4x3_t load_matrix_hi(const float *const m0, const float *const m1, const float *const m2)
-{
- const float32x4x3_t m00 =
- {
- {
- vld1q_dup_f32(m0),
- vld1q_dup_f32(m1),
- vld1q_dup_f32(m2)
- }
- };
- return m00;
-}
-
-inline float32x4x2_t load_matrix_lo(const float *const m3, const float *const m4)
-{
- const float32x4x2_t m00 =
- {
- {
- vld1q_dup_f32(m3),
- vld1q_dup_f32(m4)
- }
- };
- return m00;
-}
-
-inline float32x4x3_t load_input(const float *const in)
-{
- const float32x4x3_t vin =
- {
- {
- vld1q_f32(in),
- vld1q_f32(in + 4),
- vld1q_f32(in + 8)
- }
- };
- return vin;
-}
-
-template <>
-inline float32x4x2_t convolve_5x5<1>(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4,
- const float *m0, const float *m1, const float *m2, const float *m3, const float *m4)
-{
- const float32x4x3_t vin0 = load_input(in_0);
- const float32x4x3_t vin1 = load_input(in_1);
- const float32x4x3_t vin2 = load_input(in_2);
- const float32x4x3_t vin3 = load_input(in_3);
- const float32x4x3_t vin4 = load_input(in_4);
- const float32x4x3_t m00 = load_matrix_hi(m0, 1 + m0, 2 + m0);
- const float32x4x2_t m01 = load_matrix_lo(3 + m0, 4 + m0);
- const float32x4x3_t m10 = load_matrix_hi(m1, 1 + m1, 2 + m1);
- const float32x4x2_t m11 = load_matrix_lo(3 + m1, 4 + m1);
- const float32x4x3_t m20 = load_matrix_hi(m2, 1 + m2, 2 + m2);
- const float32x4x2_t m21 = load_matrix_lo(3 + m2, 4 + m2);
- const float32x4x3_t m30 = load_matrix_hi(m3, 1 + m3, 2 + m3);
- const float32x4x2_t m31 = load_matrix_lo(3 + m3, 4 + m3);
- const float32x4x3_t m40 = load_matrix_hi(m4, 1 + m4, 2 + m4);
- const float32x4x2_t m41 = load_matrix_lo(3 + m4, 4 + m4);
-
- float32x4x2_t out =
- {
- {
- vmulq_f32(vin0.val[0], m00.val[0]),
- vmulq_f32(vin0.val[1], m00.val[0])
- }
- };
-
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin0.val[0], vin0.val[1], 1), m00.val[1]);
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin0.val[0], vin0.val[1], 2), m00.val[2]);
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin0.val[0], vin0.val[1], 3), m01.val[0]);
- out.val[0] = vmlaq_f32(out.val[0], vin0.val[1], m01.val[1]);
-
- out.val[0] = vmlaq_f32(out.val[0], vin1.val[0], m10.val[0]);
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin1.val[0], vin1.val[1], 1), m10.val[1]);
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin1.val[0], vin1.val[1], 2), m10.val[2]);
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin1.val[0], vin1.val[1], 3), m11.val[0]);
- out.val[0] = vmlaq_f32(out.val[0], vin1.val[1], m11.val[1]);
-
- out.val[0] = vmlaq_f32(out.val[0], vin2.val[0], m20.val[0]);
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin2.val[0], vin2.val[1], 1), m20.val[1]);
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin2.val[0], vin2.val[1], 2), m20.val[2]);
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin2.val[0], vin2.val[1], 3), m21.val[0]);
- out.val[0] = vmlaq_f32(out.val[0], vin2.val[1], m21.val[1]);
-
- out.val[0] = vmlaq_f32(out.val[0], vin3.val[0], m30.val[0]);
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin3.val[0], vin3.val[1], 1), m30.val[1]);
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin3.val[0], vin3.val[1], 2), m30.val[2]);
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin3.val[0], vin3.val[1], 3), m31.val[0]);
- out.val[0] = vmlaq_f32(out.val[0], vin3.val[1], m31.val[1]);
-
- out.val[0] = vmlaq_f32(out.val[0], vin4.val[0], m40.val[0]);
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin4.val[0], vin4.val[1], 1), m40.val[1]);
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin4.val[0], vin4.val[1], 2), m40.val[2]);
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin4.val[0], vin4.val[1], 3), m41.val[0]);
- out.val[0] = vmlaq_f32(out.val[0], vin4.val[1], m41.val[1]);
-
- out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin0.val[1], vin0.val[2], 1), m00.val[1]);
- out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin0.val[1], vin0.val[2], 2), m00.val[2]);
- out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin0.val[1], vin0.val[2], 3), m01.val[0]);
- out.val[1] = vmlaq_f32(out.val[1], vin0.val[2], m01.val[1]);
-
- out.val[1] = vmlaq_f32(out.val[1], vin1.val[1], m10.val[0]);
- out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin1.val[1], vin1.val[2], 1), m10.val[1]);
- out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin1.val[1], vin1.val[2], 2), m10.val[2]);
- out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin1.val[1], vin1.val[2], 3), m11.val[0]);
- out.val[1] = vmlaq_f32(out.val[1], vin1.val[2], m11.val[1]);
-
- out.val[1] = vmlaq_f32(out.val[1], vin2.val[1], m20.val[0]);
- out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin2.val[1], vin2.val[2], 1), m20.val[1]);
- out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin2.val[1], vin2.val[2], 2), m20.val[2]);
- out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin2.val[1], vin2.val[2], 3), m21.val[0]);
- out.val[1] = vmlaq_f32(out.val[1], vin2.val[2], m21.val[1]);
-
- out.val[1] = vmlaq_f32(out.val[1], vin3.val[1], m30.val[0]);
- out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin3.val[1], vin3.val[2], 1), m30.val[1]);
- out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin3.val[1], vin3.val[2], 2), m30.val[2]);
- out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin3.val[1], vin3.val[2], 3), m31.val[0]);
- out.val[1] = vmlaq_f32(out.val[1], vin3.val[2], m31.val[1]);
-
- out.val[1] = vmlaq_f32(out.val[1], vin4.val[1], m40.val[0]);
- out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin4.val[1], vin4.val[2], 1), m40.val[1]);
- out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin4.val[1], vin4.val[2], 2), m40.val[2]);
- out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin4.val[1], vin4.val[2], 3), m41.val[0]);
- out.val[1] = vmlaq_f32(out.val[1], vin4.val[2], m41.val[1]);
-
- return out;
-}
-
-template <>
-inline float32x4x2_t convolve_5x5<2>(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4,
- const float *m0, const float *m1, const float *m2, const float *m3, const float *m4)
-{
- float32x4x2_t out = convolve_5x5<1>(in_0, in_1, in_2, in_3, in_4, m0, m1, m2, m3, m4);
- out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1);
- out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2);
- out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3);
- return out;
-}
-
-template <>
-inline float32x4x2_t convolve_5x5<3>(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4,
- const float *m0, const float *m1, const float *m2, const float *m3, const float *m4)
-{
- float32x4x2_t out = convolve_5x5<1>(in_0, in_1, in_2, in_3, in_4, m0, m1, m2, m3, m4);
- out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
- return out;
-}
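
The stride-2 and stride-3 specialisations above reuse the stride-1 result and compact it with lane moves: for stride 2 the valid outputs are elements 0, 2, 4 and 6 of the stride-1 vector pair, and for stride 3 elements 0 and 3 of the first vector. A scalar reading of the stride-2 case (the helper name is ours):

#include <array>

std::array<float, 4> compact_stride2_ref(const std::array<float, 8> &stride1_out)
{
    return { stride1_out[0], stride1_out[2], stride1_out[4], stride1_out[6] };
}
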
-
-template <typename T1, typename T2, unsigned int stridex>
-class convolver_3x3
-{
-public:
- static void convolve(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
- const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
- {
- ARM_COMPUTE_UNUSED(num_elems_read_per_iteration);
- const int input_stride_x = src->info()->strides_in_bytes().x();
- const int input_stride_y = src->info()->strides_in_bytes().y();
- const int input_stride_z = src->info()->strides_in_bytes().z();
- const int output_stride_y = dst->info()->strides_in_bytes().y();
- const int output_stride_z = dst->info()->strides_in_bytes().z();
- const int kernel_stride_x = weights->info()->strides_in_bytes().x();
- const int kernel_stride_y = weights->info()->strides_in_bytes().y();
- const int kernel_stride_z = weights->info()->strides_in_bytes().z();
- const int kernel_stride_w = weights->info()->strides_in_bytes()[3];
- const int output_w = dst->info()->dimension(0);
- const int output_h = dst->info()->dimension(1);
- const int num_planes_z = window.z().end() - window.z().start();
- const int delta_input = get_input_num_elems_processed(num_elems_written_per_iteration, stridex);
- const int kernel_depth = weights->info()->dimension(Window::DimZ);
- const unsigned int conv_stride_y = std::get<1>(conv_info.stride());
- const unsigned int conv_pad_left = conv_info.pad_left();
- const unsigned int conv_pad_top = conv_info.pad_top();
-
- // setup output window for the iterator
- Window window_out = window;
- window_out.set(Window::DimX, Window::Dimension(0, dst->info()->dimension(Window::DimX), dst->info()->dimension(Window::DimX)));
- window_out.set(Window::DimY, Window::Dimension(0, dst->info()->dimension(Window::DimY), dst->info()->dimension(Window::DimY)));
- window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), num_planes_z));
-
- // setup input window for the iterator
- Window window_in = window;
- // we just want execute_window_loop to iterate over the higher dimensions (>3), so we set the first 3 dimensions to 0
- window_in.set(Window::DimX, Window::Dimension(0, 0, 0));
- window_in.set(Window::DimY, Window::Dimension(0, 0, 0));
- window_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
- Window window_k = calculate_max_window(*weights->info(), Steps(1u));
-
- Iterator out(dst, window_out);
- Iterator in(src, window_in);
- Iterator k(weights, window_k);
-
- const uint8_t *k_ptr = k.ptr();
-
- execute_window_loop(window_out, [&](const Coordinates & id)
- {
- const uint8_t *input_ptr = in.ptr() - conv_pad_left * input_stride_x - conv_pad_top * input_stride_y;
- uint8_t *out_ptr = out.ptr();
- int ih = 0;
- int oh = 0;
- /*
- Each thread executing this kernel computes one or more planes of the output volume.
-
- Say the 3rd dimension of the output volume is 32: the first thread computes the output for Z = [0,7], the second thread for Z = [8,15],
- the third thread for Z = [16,23] and the fourth thread for Z = [24,31].
-
- The algorithm's outer loop iterates over Z, P, Y, X, where P is the depth/3rd dimension of each kernel. This order is not arbitrary: its main
- benefit is that the NEON registers holding the kernel's values are set up only once, and every XY is then computed with the preloaded
- registers rather than reloading them for each XY value.
-
- The algorithm does not require allocating any additional memory and computes the results directly in-place in two stages:
- 1) Convolve plane 0 with kernel 0 and initialize the corresponding output plane with these values.
- 2) Convolve the remaining planes and accumulate the results in the output plane initialized in step 1.
- */
- for(int oz = 0; oz < num_planes_z; ++oz)
- {
- const int zoffset = id.z() + oz;
- uint8_t *p_out_base = out_ptr + oz * output_stride_z;
- // Step 1
- {
- const auto ptr_k_r0 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x);
- const auto ptr_k_r1 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x);
- const auto ptr_k_r2 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x);
- const auto vk_r0 = load_matrix_row(ptr_k_r0);
- const auto vk_r1 = load_matrix_row(ptr_k_r1);
- const auto vk_r2 = load_matrix_row(ptr_k_r2);
- for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
- {
- auto in_top = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 0) * input_stride_y);
- auto in_mid = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 1) * input_stride_y);
- auto in_low = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 2) * input_stride_y);
- auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
- for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
- in_top += delta_input, in_mid += delta_input, in_low += delta_input, p_out += num_elems_written_per_iteration)
- {
- convolve_3x3<false>(in_top, in_mid, in_low, p_out, vk_r0, vk_r1, vk_r2, stridex);
- }
- }
- }
- // Step 2
- for(int p = 1; p < kernel_depth; ++p)
- {
- const uint8_t *ptr_k_base = k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w;
- const uint8_t *input_base = input_ptr + p * input_stride_z;
- const auto ptr_k_r0 = reinterpret_cast<const T1 *>(ptr_k_base);
- const auto ptr_k_r1 = reinterpret_cast<const T1 *>(ptr_k_base + kernel_stride_y);
- const auto ptr_k_r2 = reinterpret_cast<const T1 *>(ptr_k_base + kernel_stride_y * 2);
- const auto vk_r0 = load_matrix_row(ptr_k_r0);
- const auto vk_r1 = load_matrix_row(ptr_k_r1);
- const auto vk_r2 = load_matrix_row(ptr_k_r2);
- for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
- {
- auto in_top = reinterpret_cast<const T1 *>(input_base + (ih + 0) * input_stride_y);
- auto in_mid = reinterpret_cast<const T1 *>(input_base + (ih + 1) * input_stride_y);
- auto in_low = reinterpret_cast<const T1 *>(input_base + (ih + 2) * input_stride_y);
- auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
- for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
- in_top += delta_input, in_mid += delta_input, in_low += delta_input, p_out += num_elems_written_per_iteration)
- {
- convolve_3x3<true>(in_top, in_mid, in_low, p_out, vk_r0, vk_r1, vk_r2, stridex);
- }
- }
- }
- }
- },
- in, out);
- }
-};
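
The two-stage scheme the comment above describes avoids any scratch buffer: the first kernel plane writes the output, the remaining planes accumulate into it. A scalar sketch, reduced to a 1x1 kernel per input plane for brevity (hypothetical helper):

void convolve_accumulate_ref(const float *in, const float *k, float *out,
                             int kernel_depth, int plane_elems)
{
    for(int p = 0; p < kernel_depth; ++p)
    {
        for(int i = 0; i < plane_elems; ++i)
        {
            const float v = k[p] * in[p * plane_elems + i];
            out[i] = (p == 0) ? v : out[i] + v; // Step 1 : Step 2
        }
    }
}
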
-
-template <typename T1, typename T2, unsigned int stridex>
-class convolver_5x5
-{
-public:
- static void convolve(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
- const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
- {
- ARM_COMPUTE_UNUSED(num_elems_read_per_iteration);
- const int input_stride_x = src->info()->strides_in_bytes().x();
- const int input_stride_y = src->info()->strides_in_bytes().y();
- const int input_stride_z = src->info()->strides_in_bytes().z();
- const int output_stride_y = dst->info()->strides_in_bytes().y();
- const int output_stride_z = dst->info()->strides_in_bytes().z();
- const int kernel_stride_x = weights->info()->strides_in_bytes().x();
- const int kernel_stride_y = weights->info()->strides_in_bytes().y();
- const int kernel_stride_z = weights->info()->strides_in_bytes().z();
- const int kernel_stride_w = weights->info()->strides_in_bytes()[3];
- const int output_w = dst->info()->dimension(0);
- const int output_h = dst->info()->dimension(1);
- const int num_planes_z = window.z().end() - window.z().start();
- const int delta_input = get_input_num_elems_processed(num_elems_written_per_iteration, stridex);
- const int kernel_depth = weights->info()->dimension(Window::DimZ);
- const unsigned int conv_stride_y = std::get<1>(conv_info.stride());
- const unsigned int conv_pad_left = conv_info.pad_left();
- const unsigned int conv_pad_top = conv_info.pad_top();
-
- // setup output window for the iterator
- Window window_out = window;
- window_out.set(Window::DimX, Window::Dimension(0, dst->info()->dimension(Window::DimX), dst->info()->dimension(Window::DimX)));
- window_out.set(Window::DimY, Window::Dimension(0, dst->info()->dimension(Window::DimY), dst->info()->dimension(Window::DimY)));
- window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), num_planes_z));
-
- // setup input window for the iterator
- Window window_in = window;
- // we just want execute_window_loop to iterate over the higher dimensions (>3), so we set the first 3 dimensions to 0
- window_in.set(Window::DimX, Window::Dimension(0, 0, 0));
- window_in.set(Window::DimY, Window::Dimension(0, 0, 0));
- window_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
- Window window_k = calculate_max_window(*weights->info(), Steps(1u));
-
- Iterator out(dst, window_out);
- Iterator in(src, window_in);
- Iterator k(weights, window_k);
-
- const uint8_t *k_ptr = k.ptr();
-
- execute_window_loop(window_out, [&](const Coordinates & id)
- {
- const uint8_t *input_ptr = in.ptr() - conv_pad_left * input_stride_x - conv_pad_top * input_stride_y;
- uint8_t *out_ptr = out.ptr();
- int ih = 0;
- int oh = 0;
- for(int oz = 0; oz < num_planes_z; ++oz)
- {
- const int zoffset = id.z() + oz;
- uint8_t *p_out_base = out_ptr + oz * output_stride_z;
- // Step 1: convolve the first kernel plane (input channel 0) and store the partial results
- {
- const auto ptr_k_r0 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x);
- const auto ptr_k_r1 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x);
- const auto ptr_k_r2 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x);
- const auto ptr_k_r3 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 3 * kernel_stride_y + 0 * kernel_stride_x);
- const auto ptr_k_r4 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 4 * kernel_stride_y + 0 * kernel_stride_x);
- for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
- {
- auto in_0 = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 0) * input_stride_y);
- auto in_1 = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 1) * input_stride_y);
- auto in_2 = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 2) * input_stride_y);
- auto in_3 = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 3) * input_stride_y);
- auto in_4 = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 4) * input_stride_y);
- auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
- for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
- in_0 += delta_input, in_1 += delta_input, in_2 += delta_input, in_3 += delta_input, in_4 += delta_input, p_out += num_elems_written_per_iteration)
- {
- auto vres = convolve_5x5<stridex>(in_0, in_1, in_2, in_3, in_4, ptr_k_r0, ptr_k_r1, ptr_k_r2, ptr_k_r3, ptr_k_r4);
- store_results<stridex>(p_out, vres);
- }
- }
- }
- // Step 2: convolve the remaining kernel planes and accumulate onto the stored partial results
- for(int p = 1; p < kernel_depth; ++p)
- {
- const auto ptr_k_r0 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x);
- const auto ptr_k_r1 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x);
- const auto ptr_k_r2 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x);
- const auto ptr_k_r3 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 3 * kernel_stride_y + 0 * kernel_stride_x);
- const auto ptr_k_r4 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 4 * kernel_stride_y + 0 * kernel_stride_x);
-
- for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
- {
- auto in_0 = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 0) * input_stride_y);
- auto in_1 = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 1) * input_stride_y);
- auto in_2 = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 2) * input_stride_y);
- auto in_3 = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 3) * input_stride_y);
- auto in_4 = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 4) * input_stride_y);
- auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
- for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
- in_0 += delta_input, in_1 += delta_input, in_2 += delta_input, in_3 += delta_input, in_4 += delta_input, p_out += num_elems_written_per_iteration)
- {
- auto vres = convolve_5x5<stridex>(in_0, in_1, in_2, in_3, in_4, ptr_k_r0, ptr_k_r1, ptr_k_r2, ptr_k_r3, ptr_k_r4);
- accumulate_results<stridex>(p_out, vres);
- }
- }
- }
- }
- },
- in, out);
- }
-};
-
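-// Horizontal reduction of a 4-lane float vector to its scalar sum. As an
-// illustrative example (values assumed): for v = {a, b, c, d}, vgethigh(v)
-// yields {c, d} and vgetlow(v) yields {a, b}; their sum is {a + c, b + d},
-// and adding lane 0 to lane 1 gives a + b + c + d.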
-float vreduce(const float32x4_t &v)
-{
- auto v0 = wrapper::vgethigh(v);
- auto v1 = wrapper::vgetlow(v);
- auto v_out = wrapper::vadd(v0, v1);
-
- float a = wrapper::vgetlane(v_out, 0);
- float b = wrapper::vgetlane(v_out, 1);
- return a + b;
-}
-
-template <typename T1, typename T2>
-inline void convolve_1x1(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
- const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
-{
- const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
- switch(conv_stride_x)
- {
- case 1:
- convolver_1x1<T1, T2, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info);
- break;
- case 2:
- convolver_1x1<T1, T2, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info);
- break;
- case 3:
- convolver_1x1<T1, T2, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info);
- break;
- default:
- ARM_COMPUTE_ERROR("Not implemented");
- }
-}
-
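-// F32 specialization: when run_optim_small_tensor(src) holds, a dedicated
-// small-tensor convolver (convolver_w1x1_i8x8_f32, defined earlier in this
-// file) is used; per validate_and_configure_window() it writes 8 elements per
-// iteration instead of the 4 of the generic F32 1x1 path.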
-template <>
-inline void convolve_1x1<float, float>(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
- const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
-{
- const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
- if(run_optim_small_tensor(src))
- {
- switch(conv_stride_x)
- {
- case 1:
- convolver_w1x1_i8x8_f32<1>::convolve(window, src, weights, dst, conv_info);
- break;
- case 2:
- convolver_w1x1_i8x8_f32<2>::convolve(window, src, weights, dst, conv_info);
- break;
- case 3:
- convolver_w1x1_i8x8_f32<3>::convolve(window, src, weights, dst, conv_info);
- break;
- default:
- ARM_COMPUTE_ERROR("Not implemented");
- }
- }
- else
- {
- switch(conv_stride_x)
- {
- case 1:
- convolver_1x1<float, float, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info);
- break;
- case 2:
- convolver_1x1<float, float, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info);
- break;
- case 3:
- convolver_1x1<float, float, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info);
- break;
- default:
- ARM_COMPUTE_ERROR("Not implemented");
- }
- }
-}
-
-template <typename T1, typename T2>
-inline void convolve_3x3(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
- const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
-{
- const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
- switch(conv_stride_x)
- {
- case 1:
- convolver_3x3<T1, T2, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info);
- break;
- case 2:
- convolver_3x3<T1, T2, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info);
- break;
- case 3:
- convolver_3x3<T1, T2, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info);
- break;
- default:
- ARM_COMPUTE_ERROR("Not implemented");
- }
-}
-
-template <typename T1, typename T2>
-inline void convolve_5x5(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
- const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
-{
- const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
- switch(conv_stride_x)
- {
- case 1:
- convolver_5x5<T1, T2, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info);
- break;
- case 2:
- convolver_5x5<T1, T2, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info);
- break;
- case 3:
- convolver_5x5<T1, T2, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info);
- break;
- default:
- ARM_COMPUTE_ERROR("Not implemented");
- }
-}
-
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
- ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN);
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights);
-
- const DataLayout data_layout = src->data_layout();
- const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(std::get<0>(conv_info.stride()) > 3, "Strides larger than 3 not supported.");
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(channel_idx) != src->dimension(channel_idx));
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) != weights->dimension(height_idx));
- ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
- ARM_COMPUTE_RETURN_ERROR_ON(data_layout == DataLayout::NHWC && src->data_type() != DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(width_idx) > 3) && (src->data_type() == DataType::F16));
-
- // Checks performed when output is configured
- if(dst->total_size() != 0)
- {
- TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info);
-
- DataType data_type = src->data_type();
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), output_shape);
- ARM_COMPUTE_RETURN_ERROR_ON(dst->data_type() != data_type);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, const PadStrideInfo &conv_info, unsigned int &num_weight_elems_read_per_row,
- unsigned int &num_elems_read_per_iteration, unsigned int &num_elems_written_per_iteration, BorderSize &border_size)
-{
- ARM_COMPUTE_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN);
-
- const DataLayout data_layout = src->data_layout();
- const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-
- // Calculate right and bottom border
- unsigned int kernel_size = weights->dimension(width_idx);
- const int conv_stride_x = std::get<0>(conv_info.stride());
- const int conv_stride_y = std::get<1>(conv_info.stride());
- const int input_width = src->dimension(width_idx);
-
- Window win{};
- bool window_changed = false;
-
- if(data_layout == DataLayout::NCHW)
- {
- switch(kernel_size)
- {
- case 1:
- {
- switch(src->data_type())
- {
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- num_elems_written_per_iteration = 8;
- break;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- case DataType::F32:
- if(run_optim_small_tensor_info(src))
- {
- num_elems_written_per_iteration = 8;
- }
- else
- {
- num_elems_written_per_iteration = 4;
- }
- break;
- default:
- ARM_COMPUTE_ERROR("Data type not supported.");
- break;
- }
- num_weight_elems_read_per_row = kernel_size;
- num_elems_read_per_iteration = conv_stride_x * num_elems_written_per_iteration;
- break;
- }
- case 3:
- switch(src->data_type())
- {
- case DataType::F32:
- num_weight_elems_read_per_row = 4 + kernel_size - 1;
- num_elems_read_per_iteration = 12;
- num_elems_written_per_iteration = 16 >> conv_stride_x;
- break;
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- num_weight_elems_read_per_row = 8 + kernel_size - 1;
- num_elems_read_per_iteration = 24;
- num_elems_written_per_iteration = 32 >> conv_stride_x;
- break;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- default:
- ARM_COMPUTE_ERROR("Data type not supported.");
- break;
- }
- break;
- case 5:
- {
- switch(src->data_type())
- {
- case DataType::F32:
- num_weight_elems_read_per_row = 4 + kernel_size - 1;
- num_elems_read_per_iteration = 12;
- num_elems_written_per_iteration = 16 >> conv_stride_x;
- break;
- default:
- ARM_COMPUTE_ERROR("Data type not supported.");
- break;
- }
- }
- break;
- default:
- {
- ARM_COMPUTE_ERROR("Not implemented");
- break;
- }
- }
-
- // Calculate right pad
- int start_x = kernel_size / 2 - static_cast<int>(conv_info.pad_left());
- int end_x = ceil_to_multiple(static_cast<int>(dst->dimension(0)), num_elems_written_per_iteration) * conv_stride_x;
- int upper_bound_w = ceil_to_multiple(start_x + end_x, num_elems_read_per_iteration) - input_width;
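- // Illustrative example (hypothetical values): for a 3x3 F32 kernel with
- // stride_x = 1, pad_left = 1, input and dst width 10, the values above are
- // num_elems_written_per_iteration = 8 and num_elems_read_per_iteration = 12,
- // so start_x = 3 / 2 - 1 = 0, end_x = ceil_to_multiple(10, 8) * 1 = 16 and
- // upper_bound_w = ceil_to_multiple(16, 12) - 10 = 14 elements of right padding.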
-
- // Calculate border
- const unsigned int conv_pad_left = conv_info.pad_left();
- const unsigned int conv_pad_top = conv_info.pad_top();
- const unsigned int conv_pad_right = std::max(upper_bound_w, 0);
- const unsigned int conv_pad_bottom = conv_info.pad_bottom();
-
- border_size.left = conv_pad_left;
- border_size.top = conv_pad_top;
- border_size.right = conv_pad_right;
- border_size.bottom = conv_pad_bottom;
-
- // Configure window
- win = calculate_max_window(*dst, Steps(num_elems_written_per_iteration));
-
- AccessWindowRectangle input_access(src, -conv_pad_left, -conv_pad_top,
- num_elems_read_per_iteration, kernel_size,
- conv_stride_x, conv_stride_y);
- AccessWindowStatic weights_access(weights, 0, 0, num_weight_elems_read_per_row, kernel_size);
- AccessWindowHorizontal output_access(dst, 0, num_elems_written_per_iteration);
- window_changed = update_window_and_padding(win, input_access, weights_access, output_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(), dst->tensor_shape()));
- }
- else
- {
- // Configure window NHWC without any padding
- win = calculate_max_window(*dst, Steps());
- }
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-
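-// Zero internal padding along x (the channel dimension in NHWC) means the
-// channel elements of each row are contiguous in memory for both src and
-// weights, so the W and C dimensions can be flattened into one linear run of
-// elements. run_op() uses this check to pick convolve_nhwc_optimized(); with
-// any x padding it falls back to the generic convolve_nhwc().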
-bool have_zero_x_internal_padding(ITensorInfo *src, ITensorInfo *weights)
-{
- return (src->padding().left == 0 && weights->padding().left == 0 && src->padding().right == 0 && weights->padding().right == 0);
-}
-
-} // namespace
-
-template <typename T>
-void CpuDirectConv2dKernel::convolve_nhwc_optimized(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst)
-{
- // This function assumes that input and weights have no padding in the channel dimension
-
- // Declare useful types
- using vtype = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>;
- using vector_type = typename vtype::type;
- using tag_type = typename vtype::tag_type;
-
- // Scalar quantities
- const int element_size = src->info()->element_size();
- const int input_stride_w = src->info()->strides_in_bytes().y() / element_size;
- const int input_stride_h = src->info()->strides_in_bytes().z() / element_size;
- const int input_stride_n = src->info()->strides_in_bytes()[3] / element_size;
- const int input_dim_w = src->info()->dimension(1);
- const int input_dim_h = src->info()->dimension(2);
-
- const int output_stride_c = dst->info()->strides_in_bytes().x();
-
- const unsigned int kernel_stride_w = weights->info()->strides_in_bytes().y() / element_size;
- const unsigned int kernel_stride_h = weights->info()->strides_in_bytes().z() / element_size;
- const int kernel_dim_w = weights->info()->dimension(1);
- const int kernel_dim_h = weights->info()->dimension(2);
-
- const int conv_pad_top = _conv_info.pad_top();
- const int conv_pad_left = _conv_info.pad_left();
- const int conv_stride_w = std::get<0>(_conv_info.stride());
- const int conv_stride_h = std::get<1>(_conv_info.stride());
-
- // Setup the window for the output iterator
- Window window_out = window;
- window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- // Setup the window for the weights iterator
- Window window_w = calculate_max_window(*weights->info(), Steps());
- window_w.set(Window::DimX, Window::Dimension(0, 1, 1));
- window_w.set(Window::DimY, Window::Dimension(0, 1, 1));
- window_w.set(Window::DimZ, Window::Dimension(0, 1, 1));
-
- Iterator out(dst, window_out);
- Iterator wei(weights, window_w);
-
- constexpr int num_elems_read_per_iteration = 16 / sizeof(T);
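- // One 128-bit vector load per iteration: 16 / sizeof(T) lanes, e.g. 4 for float.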
- /*
- * This implementation parallelizes the full WC plane of input and weights by
- * treating them as a linear series of elements. For example, with 3x3 weights,
- * 3 input channels and floating-point vector operations of 4 elements at a
- * time, the first load takes the 3 channel elements of the first weight
- * position plus the first channel element of the second one. The 9 elements
- * of each kernel row (3 positions x 3 channels) are then covered by two
- * 4-element vector operations plus one final single-element operation.
- *
- * This works because, when the input vector to multiply with the weights is
- * created, exactly the required elements are loaded in the same order, so the
- * multiplication pairs the correct input and weight elements.
- */
- execute_window_loop(window_out, [&](const Coordinates & id)
- {
- /*
- * Here we create theoretical indexes which are then validated for both
- * the inputs and the weights.
- * As a reminder, this loop takes each output point in NHW; C is handled
- * in the weights loop.
- */
- // We are computing the theoretical starting input points
- const int in_w_start_t = static_cast<int>(id.y()) * conv_stride_w - conv_pad_left;
- const int in_h_start_t = static_cast<int>(id.z()) * conv_stride_h - conv_pad_top;
- const int in_w_end_t = in_w_start_t + kernel_dim_w;
- const int in_h_end_t = in_h_start_t + kernel_dim_h;
-
- // We are computing the valid initial and ending input points by checking the borders
- const int in_w_start = std::max(in_w_start_t, 0);
- const int in_h_start = std::max(in_h_start_t, 0);
- const int in_w_end = std::min(in_w_end_t, input_dim_w);
- const int in_h_end = std::min(in_h_end_t, input_dim_h);
-
- // We use the input points to select the valid weight points to use
- const int index_wc_start = (in_w_start - in_w_start_t) * kernel_stride_w;
- const int index_h_start = in_h_start - in_h_start_t;
- const int index_wc_end = (kernel_dim_w - (in_w_end_t - in_w_end)) * kernel_stride_w;
- const int index_h_end = kernel_dim_h - (in_h_end_t - in_h_end);
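- // Illustrative example (hypothetical values): with conv_pad_left = 1, a
- // stride of 1 and a 3x3 kernel, the first output column gives
- // in_w_start_t = -1, clipped to in_w_start = 0, so index_wc_start skips one
- // weight column ((0 - (-1)) * kernel_stride_w); likewise index_h_start = 1
- // skips the first weight row when in_h_start_t = -1. Only in-bounds
- // input/weight pairs are visited, which replaces explicit border padding.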
-
- execute_window_loop(window_w, [&](const Coordinates & id_w)
- {
- /*
- * This is the loop over the weights, and it goes along N (the batches).
- * As a reminder, the batches of the weights are translated into the
- * channels of the output.
- */
- const T *in_ptr_row = reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes())
- + id[3] * input_stride_n + in_w_start * input_stride_w + in_h_start * input_stride_h;
- const T *weights_ptr_row = reinterpret_cast<const T *>(wei.ptr()) + index_h_start * kernel_stride_h;
- uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c;
-
- T out_temp = static_cast<T>(0);
- for(int index_h = index_h_start; index_h < index_h_end; ++index_h, in_ptr_row += input_stride_h, weights_ptr_row += kernel_stride_h)
- {
- const T *in_ptr_mover = in_ptr_row;
- int index_wc = index_wc_start;
- vector_type out_temp_vec = wrapper::vdup_n(static_cast<T>(0), tag_type());
- for(; index_wc <= index_wc_end - num_elems_read_per_iteration; index_wc += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration)
- {
- const auto src_vec = wrapper::vloadq(in_ptr_mover);
- const auto w_vec = wrapper::vloadq(weights_ptr_row + index_wc);
- out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec);
- }
- out_temp += vreduce(out_temp_vec);
- for(; index_wc < index_wc_end; ++index_wc, ++in_ptr_mover)
- {
- const auto src_val = *(in_ptr_mover);
- const auto w_val = *(weights_ptr_row + index_wc);
- out_temp += src_val * w_val;
- }
- }
- *(reinterpret_cast<T *>(out_ptr)) = out_temp;
- },
- wei);
- },
- out);
-}
-
-template <typename T>
-void CpuDirectConv2dKernel::convolve_nhwc(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst)
-{
- // Declare useful types
- using vtype = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>;
- using vector_type = typename vtype::type;
- using tag_type = typename vtype::tag_type;
-
- // Scalar quantities
- const int element_size = src->info()->element_size();
- const int input_stride_w = src->info()->strides_in_bytes().y() / element_size;
- const int input_stride_h = src->info()->strides_in_bytes().z() / element_size;
- const int input_stride_n = src->info()->strides_in_bytes()[3] / element_size;
- const int input_dim_w = src->info()->dimension(1);
- const int input_dim_h = src->info()->dimension(2);
-
- const int output_stride_c = dst->info()->strides_in_bytes().x();
-
- const unsigned int kernel_stride_w = weights->info()->strides_in_bytes().y() / element_size;
- const unsigned int kernel_stride_h = weights->info()->strides_in_bytes().z() / element_size;
- const int kernel_dim_w = weights->info()->dimension(1);
- const int kernel_dim_h = weights->info()->dimension(2);
-
- const int conv_pad_top = _conv_info.pad_top();
- const int conv_pad_left = _conv_info.pad_left();
- const int conv_stride_w = std::get<0>(_conv_info.stride());
- const int conv_stride_h = std::get<1>(_conv_info.stride());
-
- // Setup the window for the output iterator
- Window window_out = window;
- window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- // Setup the window for the weights iterator
- Window window_w = calculate_max_window(*weights->info(), Steps());
- window_w.set(Window::DimX, Window::Dimension(0, 1, 1));
- window_w.set(Window::DimY, Window::Dimension(0, 1, 1));
- window_w.set(Window::DimZ, Window::Dimension(0, 1, 1));
-
- Iterator out(dst, window_out);
- Iterator wei(weights, window_w);
-
- constexpr int num_elems_read_per_iteration = 16 / sizeof(T);
-
- execute_window_loop(window_out, [&](const Coordinates & id)
- {
- // We are computing the theoretical starting input points
- const int in_w_start_t = static_cast<int>(id.y()) * conv_stride_w - conv_pad_left;
- const int in_h_start_t = static_cast<int>(id.z()) * conv_stride_h - conv_pad_top;
- const int in_w_end_t = in_w_start_t + kernel_dim_w;
- const int in_h_end_t = in_h_start_t + kernel_dim_h;
-
- // We are computing the valid initial and ending input points by checking the borders
- const int in_w_start = std::max(in_w_start_t, 0);
- const int in_h_start = std::max(in_h_start_t, 0);
- const int in_w_end = std::min(in_w_end_t, input_dim_w);
- const int in_h_end = std::min(in_h_end_t, input_dim_h);
-
- // We use the input points to select the valid weight points to use
- const int wei_w_start = in_w_start - in_w_start_t;
- const int wei_h_start = in_h_start - in_h_start_t;
- const int wei_w_end = kernel_dim_w - (in_w_end_t - in_w_end);
- const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end);
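- // Same border clipping as the optimized path, expressed as 2D weight
- // coordinates: only rows wei_h_start..wei_h_end - 1 and columns
- // wei_w_start..wei_w_end - 1 of the kernel overlap valid input.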
-
- const int index_c_end = weights->info()->dimension(0);
- const T *const in_ptr_start = reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) + id[3] * input_stride_n;
-
- execute_window_loop(window_w, [&](const Coordinates & id_w)
- {
- const T *const weights_ptr_start = reinterpret_cast<const T *>(wei.ptr());
- uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c;
-
- T out_temp = static_cast<T>(0);
- for(int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; ++index_wei_h, ++index_in_h)
- {
- const T *const in_ptr_row = in_ptr_start + index_in_h * input_stride_h;
- const T *const weights_ptr_row = weights_ptr_start + index_wei_h * kernel_stride_h;
- for(int index_wei_w = wei_w_start, index_in_w = in_w_start; index_wei_w < wei_w_end; ++index_wei_w, ++index_in_w)
- {
- const T *in_ptr_mover = in_ptr_row + index_in_w * input_stride_w;
- const T *weights_ptr_mover = weights_ptr_row + index_wei_w * kernel_stride_w;
- int index_c = 0;
- vector_type out_temp_vec = wrapper::vdup_n(static_cast<T>(0), tag_type());
- for(; index_c <= index_c_end - num_elems_read_per_iteration; index_c += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration, weights_ptr_mover += num_elems_read_per_iteration)
- {
- const auto src_vec = wrapper::vloadq(in_ptr_mover);
- const auto w_vec = wrapper::vloadq(weights_ptr_mover);
- out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec);
- }
- out_temp += vreduce(out_temp_vec);
- for(; index_c < index_c_end; ++index_c, ++in_ptr_mover, ++weights_ptr_mover)
- {
- const auto src_val = *(in_ptr_mover);
- const auto w_val = *(weights_ptr_mover);
- out_temp += src_val * w_val;
- }
- }
- }
- *(reinterpret_cast<T *>(out_ptr)) = out_temp;
- },
- wei);
- },
- out);
-}
-
-BorderSize CpuDirectConv2dKernel::border_size() const
-{
- return _border_size;
-}
-
-void CpuDirectConv2dKernel::configure(ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, const PadStrideInfo &conv_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
-
- _conv_info = conv_info;
- _data_layout = src->data_layout();
- _kernel_size = weights->dimension(get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH));
-
- const unsigned int conv_pad_left = conv_info.pad_left();
- const unsigned int conv_pad_top = conv_info.pad_top();
- const unsigned int conv_pad_right = conv_info.pad_right();
- const unsigned int conv_pad_bottom = conv_info.pad_bottom();
- if(_data_layout == DataLayout::NCHW)
- {
- _border_size = BorderSize(conv_pad_top, conv_pad_right, conv_pad_bottom, conv_pad_left);
- }
- else
- {
- _border_size = BorderSize(0);
- }
-
- // Get convolved dimensions
- TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info);
-
- DataType data_type = src->data_type();
-
- // Output auto-initialization if not yet initialized
- auto_init_if_empty(*dst, output_shape, 1, data_type);
-
- // Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, dst, conv_info));
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(src, weights, dst, conv_info, _num_weight_elems_read_per_row,
- _num_elems_read_per_iteration, _num_elems_written_per_iteration, _border_size);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICpuKernel::configure(win_config.second);
-}
-
-Status CpuDirectConv2dKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info)
-{
- unsigned int num_weight_elems_read_per_row = 0;
- unsigned int num_elems_read_per_iteration = 0;
- unsigned int num_elems_written_per_iteration = 0;
- BorderSize border_size = {};
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, dst, conv_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(),
- weights->clone().get(),
- dst->clone().get(),
- conv_info,
- num_weight_elems_read_per_row,
- num_elems_read_per_iteration,
- num_elems_written_per_iteration,
- border_size)
- .first);
-
- return Status{};
-}
-
-void CpuDirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
-
- auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
- auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
- auto dst = tensors.get_tensor(TensorType::ACL_DST);
- const int kernel_size = weights->info()->dimension(get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH));
-
- if(_data_layout == DataLayout::NCHW)
- {
- switch(kernel_size)
- {
- case 1:
- {
- switch(src->info()->data_type())
- {
- case DataType::F32:
- convolve_1x1<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, src, weights, dst, _conv_info);
- break;
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- convolve_1x1<float16_t, float16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, src, weights, dst, _conv_info);
- break;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- default:
- ARM_COMPUTE_ERROR("Data type not supported");
- break;
- }
- break;
- }
- case 3:
- {
- switch(src->info()->data_type())
- {
- case DataType::F32:
- convolve_3x3<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, src, weights, dst, _conv_info);
- break;
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- convolve_3x3<float16_t, float16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, src, weights, dst, _conv_info);
- break;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- default:
- ARM_COMPUTE_ERROR("Data type not supported");
- break;
- }
- break;
- }
- case 5:
- {
- switch(src->info()->data_type())
- {
- case DataType::F32:
- convolve_5x5<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, src, weights, dst, _conv_info);
- break;
- default:
- ARM_COMPUTE_ERROR("Data type not supported");
- break;
- }
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR("Only kernel sizes 1x1, 3x3 and 5x5 are supported.");
- break;
- }
- }
- }
- else
- {
- switch(src->info()->data_type())
- {
- case DataType::F32:
- {
- if(have_zero_x_internal_padding(src->info(), weights->info()))
- {
- convolve_nhwc_optimized<float>(window, src, weights, dst);
- }
- else
- {
- convolve_nhwc<float>(window, src, weights, dst);
- }
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Data type not supported");
- break;
- }
- }
-}
-
-const char *CpuDirectConv2dKernel::name() const
-{
- return "CpuDirectConvolutionLayerKernel";
-}
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/core/cpu/kernels/CpuDirectConv2dKernel.h b/src/core/cpu/kernels/CpuDirectConv2dKernel.h
deleted file mode 100644
index 62ed96f255..0000000000
--- a/src/core/cpu/kernels/CpuDirectConv2dKernel.h
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_DIRECTCONV2D_KERNEL_H
-#define ARM_COMPUTE_CPU_DIRECTCONV2D_KERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/cpu/ICpuKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-namespace cpu
-{
-namespace kernels
-{
-/** Interface for the kernel to perform Direct Convolution Layer. */
-class CpuDirectConv2dKernel : public ICpuKernel
-{
-public:
- /** Default constructor */
- CpuDirectConv2dKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDirectConv2dKernel);
- /** Set the src, weights, and dst tensors.
- *
- * @note: DirectConvolution only works in the following configurations:
- * 1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3
- * 3x3 convolution with stride_x = 1/2/3, stride_y = 1/2/3
- * 5x5 convolution with stride_x = 1/2/3, stride_y = 1/2/3 (F32 only)
- *
- * @param[in] src The input tensor to convolve. The 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represents a batch of inputs. Data types supported: F16/F32.
- * @param[in] weights Weights tensor. Weights are a 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
- * The 3rd dimension must be the same as the input's volume 3rd dimension.
- * Data type supported: Same as @p src.
- * @param[out] dst Output tensor.
- * The 3rd dimension must be equal to the 4th dimension of the @p weights tensor. Data types supported: F16/F32
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- */
- void configure(ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, const PadStrideInfo &conv_info);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to CpuDirectConv2dKernel::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
- const char *name() const override;
- BorderSize border_size() const override;
-
-private:
- /* Template function for optimized convolution NHWC */
- template <typename T>
- void convolve_nhwc_optimized(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst);
-
- /* Template function for convolution NHWC */
- template <typename T>
- void convolve_nhwc(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst);
-
- PadStrideInfo _conv_info{};
- BorderSize _border_size{};
- unsigned int _kernel_size{ 0 };
- unsigned int _num_weight_elems_read_per_row{ 0 };
- unsigned int _num_elems_read_per_iteration{ 0 };
- unsigned int _num_elems_written_per_iteration{ 0 };
- DataLayout _data_layout{ DataLayout::UNKNOWN };
-};
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CPU_DIRECTCONV2D_KERNEL_H */
diff --git a/src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp b/src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp
deleted file mode 100644
index 662d052941..0000000000
--- a/src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp
+++ /dev/null
@@ -1,513 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/NEAsymm.h"
-#include "src/core/NEON/NEFixedPoint.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <arm_neon.h>
-#include <cstddef>
-#include <cstdint>
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst,
- const DirectConvolutionLayerOutputStageKernelInfo &info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
- ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::S32, DataType::F32);
-
- if(bias != nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias);
- ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != src->dimension(get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL)));
- ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
- }
-
- if(src->data_type() == DataType::S32)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst == nullptr, "In-place computation not allowed for quantized output");
- }
-
- // Checks performed when output is configured
- if((dst != nullptr) && (dst->total_size() != 0))
- {
- if(is_data_type_float(src->data_type()))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
- }
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
- }
- else if(src->data_type() == DataType::S32)
- {
- // In case of quantized computation and unconfigured output, the output data type must be provided through DirectConvolutionLayerOutputStageKernelInfo
- ARM_COMPUTE_RETURN_ERROR_ON((info.output_data_type != DataType::QASYMM8) && (info.output_data_type != DataType::QASYMM8_SIGNED));
- }
-
- return Status{};
-}
-
-template <typename T>
-typename std::enable_if<arm_compute::utils::traits::is_floating_point<T>::value, void>::type
-output_stage_nchw(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst,
- int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
-{
- const bool has_bias = bias != nullptr;
- /** SIMD vector tag type. */
- using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
-
- ARM_COMPUTE_ERROR_ON(src->info()->data_layout() == DataLayout::UNKNOWN);
- ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier);
- ARM_COMPUTE_UNUSED(result_shift);
- ARM_COMPUTE_UNUSED(result_offset_after_shift);
-
- const int window_start_x = window.x().start();
- const int window_end_x = window.x().end();
- const int window_step_x = 16 / src->info()->element_size();
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator in(src, win);
- Iterator out(dst, win);
- execute_window_loop(win, [&](const Coordinates & id)
- {
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- // Get bias and pointer to input
- const auto in_ptr = reinterpret_cast<const T *>(in.ptr()) + x;
- auto v_in = wrapper::vloadq(in_ptr);
-
- // Accumulate bias
- if(has_bias)
- {
- const auto vb = wrapper::vdup_n(*reinterpret_cast<const T *>(bias->ptr_to_element(Coordinates(id.z()))), ExactTagType{});
- v_in = wrapper::vadd(v_in, vb);
- }
-
- const auto out_ptr = reinterpret_cast<T *>(out.ptr()) + x;
- wrapper::vstore(out_ptr, v_in);
- }
-
- // Left-overs loop
- for(; x < window_end_x; ++x)
- {
- // Get bias and pointer to input
- auto s_in = *(reinterpret_cast<const T *>(in.ptr()) + x);
-
- // Accumulate bias
- if(has_bias)
- {
- const auto b = *reinterpret_cast<const T *>(bias->ptr_to_element(Coordinates(id.z())));
- s_in += b;
- }
-
- *(reinterpret_cast<T *>(out.ptr()) + x) = s_in;
- }
-
- },
- in, out);
-}
-
-template <typename T>
-typename std::enable_if<arm_compute::utils::traits::is_floating_point<T>::value, void>::type
-output_stage_nhwc(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst,
- int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
-{
- const bool has_bias = bias != nullptr;
- ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier);
- ARM_COMPUTE_UNUSED(result_shift);
- ARM_COMPUTE_UNUSED(result_offset_after_shift);
-
- Window window_bias = window;
- window_bias.set(Window::DimX, Window::Dimension(0, 1, 1));
- window_bias.set(Window::DimY, Window::Dimension(0, 0, 0));
- window_bias.set(Window::DimZ, Window::Dimension(0, 0, 0));
- window_bias.set(3, Window::Dimension(0, 0, 0));
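- // Setting DimY, DimZ and dimension 3 to (0, 0, 0) keeps the bias iterator
- // from advancing with the spatial and batch coordinates: the same 1D bias
- // row is re-read for every output position, and the channel offset is
- // applied manually below via bias_ptr + x.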
-
- const int window_start_x = window.x().start();
- const int window_end_x = window.x().end();
- const int window_step_x = 16 / src->info()->element_size();
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator in(src, win);
- Iterator bi(bias, window_bias);
- Iterator out(dst, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- // Get bias and pointer to input
- const auto in_ptr = reinterpret_cast<const T *>(in.ptr());
- auto v_in = wrapper::vloadq(in_ptr + x);
-
- // Accumulate bias
- if(has_bias)
- {
- const auto bias_ptr = reinterpret_cast<T *>(bi.ptr()) + x;
- v_in = wrapper::vadd(v_in, wrapper::vloadq(bias_ptr));
- }
-
- const auto out_ptr = reinterpret_cast<T *>(out.ptr());
- wrapper::vstore(out_ptr + x, v_in);
- }
-
- // Left-overs loop
- for(; x < window_end_x; ++x)
- {
- // Get bias and pointer to input
- auto s_in = *(reinterpret_cast<const T *>(in.ptr()) + x);
-
- // Accumulate bias
- if(has_bias)
- {
- const auto bias_ptr = reinterpret_cast<T *>(bi.ptr()) + x;
- s_in += *bias_ptr;
- }
-
- const auto out_ptr = reinterpret_cast<T *>(out.ptr());
- *(out_ptr + x) = s_in;
- }
- },
- in, bi, out);
-}
-
-// Quantized case
-template < typename TOut, typename std::enable_if < std::is_same<TOut, uint8_t>::value || std::is_same<TOut, int8_t>::value, int >::type = 0 >
-void output_stage_nchw(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst,
- int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
-{
- const bool has_bias = bias != nullptr;
- using VectorType = typename wrapper::traits::neon_bitvector_t<TOut, wrapper::traits::BitWidth::W128>;
- using TagType = typename wrapper::traits::neon_bitvector_tag_t<TOut, wrapper::traits::BitWidth::W128>;
-
- const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift);
-
- const VectorType min = wrapper::vdup_n(std::numeric_limits<TOut>::lowest(), TagType{});
- const VectorType max = wrapper::vdup_n(std::numeric_limits<TOut>::max(), TagType{});
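-
- // Requantization sketch (see finalize_quantization in NEAsymm.h): each S32
- // accumulator acc is mapped to roughly
- // clamp(round(acc * result_fixedpoint_multiplier / 2^(31 + result_shift)) + result_offset_after_shift, min, max)
- // and then narrowed to the 8-bit output type TOut.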
-
- const int window_start_x = window.x().start();
- const int window_end_x = window.x().end();
- const int window_step_x = 16 / src->info()->element_size();
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator in(src, win);
- Iterator out(dst, win);
-
- execute_window_loop(win, [&](const Coordinates & id)
- {
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- // Get bias and pointer to input
- const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr()) + x;
- int32x4x4_t v_in =
- {
- {
- wrapper::vloadq(in_ptr),
- wrapper::vloadq(in_ptr + 4),
- wrapper::vloadq(in_ptr + 8),
- wrapper::vloadq(in_ptr + 12)
- }
- };
-
- // Accumulate bias
- if(has_bias)
- {
- const auto vb = wrapper::vdup_n(*reinterpret_cast<const int32_t *>(bias->ptr_to_element(Coordinates(id.z()))), TagType{});
- v_in =
- {
- {
- wrapper::vadd(v_in.val[0], vb),
- wrapper::vadd(v_in.val[1], vb),
- wrapper::vadd(v_in.val[2], vb),
- wrapper::vadd(v_in.val[3], vb)
- }
- };
- }
-
- const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x;
- wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32,
- min, max, false));
- }
-
- // Left-overs loop
- for(; x < window_end_x; ++x)
- {
- // Get bias and pointer to input
- int32_t s_in = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
-
- // Accumulate bias
- if(has_bias)
- {
- const auto b = *reinterpret_cast<const int32_t *>(bias->ptr_to_element(Coordinates(id.z())));
- s_in += b;
- }
-
- const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x;
- *out_ptr = finalize_quantization(s_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift,
- std::numeric_limits<TOut>::lowest(), std::numeric_limits<TOut>::max(), false);
- }
- },
- in, out);
-}
-template < typename TOut, typename std::enable_if < std::is_same<TOut, uint8_t>::value || std::is_same<TOut, int8_t>::value, int >::type = 0 >
-void output_stage_nhwc(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst,
- int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
-{
- const bool has_bias = bias != nullptr;
- using VectorType = typename wrapper::traits::neon_bitvector_t<TOut, wrapper::traits::BitWidth::W128>;
- using TagType = typename wrapper::traits::neon_bitvector_tag_t<TOut, wrapper::traits::BitWidth::W128>;
-
- const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift);
-
- const VectorType min = wrapper::vdup_n(std::numeric_limits<TOut>::lowest(), TagType{});
- const VectorType max = wrapper::vdup_n(std::numeric_limits<TOut>::max(), TagType{});
-
- Window window_bias = window;
- window_bias.set(Window::DimX, Window::Dimension(0, 1, 1));
- window_bias.set(Window::DimY, Window::Dimension(0, 0, 0));
- window_bias.set(Window::DimZ, Window::Dimension(0, 0, 0));
- window_bias.set(3, Window::Dimension(0, 0, 0));
-
- const int window_start_x = window.x().start();
- const int window_end_x = window.x().end();
- const int window_step_x = 16 / src->info()->element_size();
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator in(src, win);
- Iterator bi(bias, window_bias);
- Iterator out(dst, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- // Get bias and pointer to input
- const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr()) + x;
- int32x4x4_t v_in =
- {
- {
- wrapper::vloadq(in_ptr),
- wrapper::vloadq(in_ptr + 4),
- wrapper::vloadq(in_ptr + 8),
- wrapper::vloadq(in_ptr + 12),
- }
- };
-
- // Accumulate bias
- if(has_bias)
- {
- const auto bias_ptr = reinterpret_cast<int32_t *>(bi.ptr()) + x;
-
- v_in.val[0] = wrapper::vadd(v_in.val[0], wrapper::vloadq(bias_ptr));
- v_in.val[1] = wrapper::vadd(v_in.val[1], wrapper::vloadq(bias_ptr + 4));
- v_in.val[2] = wrapper::vadd(v_in.val[2], wrapper::vloadq(bias_ptr + 8));
- v_in.val[3] = wrapper::vadd(v_in.val[3], wrapper::vloadq(bias_ptr + 12));
- }
-
- const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x;
- wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max, false));
- }
-
- // Left-overs loop
- for(; x < window_end_x; ++x)
- {
- // Get bias and pointer to input
- const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr()) + x;
- int32_t s_in = *in_ptr;
-
- // Accumulate bias
- if(has_bias)
- {
- const auto bias_ptr = reinterpret_cast<int32_t *>(bi.ptr()) + x;
- s_in += *bias_ptr;
- }
-
- const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x;
- *out_ptr = finalize_quantization(s_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift,
- std::numeric_limits<TOut>::lowest(), std::numeric_limits<TOut>::max(), false);
- }
- },
- in, bi, out);
-}
-} // namespace
-
-void CpuDirectConv2dOutputStageKernel::configure(ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst,
- const DirectConvolutionLayerOutputStageKernelInfo &info)
-{
- ARM_COMPUTE_UNUSED(bias);
- // Perform validation step
- ARM_COMPUTE_ERROR_ON_NULLPTR(src);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, info));
-
- _func = nullptr;
- _result_fixedpoint_multiplier = info.result_fixedpoint_multiplier;
- _result_shift = info.result_shift;
- _result_offset_after_shift = info.result_offset_after_shift;
-
- // Auto-initialize the output if required
- if(dst != nullptr)
- {
- // Work out expected output data type
- const DataType output_dt = (src->data_type() == DataType::S32) ? info.output_data_type : DataType::S32;
- // Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*dst, src->clone()->set_data_type(output_dt));
- }
-
- Window win = calculate_max_window(*src, Steps());
-
- ICpuKernel::configure(win);
-
- const bool is_qasymm8_signed = (dst != nullptr) ? is_data_type_quantized_asymmetric_signed(dst->data_type()) : false;
-
- // Set appropriate function
- if(src->data_layout() == DataLayout::NCHW)
- {
- switch(src->data_type())
- {
- case DataType::S32:
- {
- if(is_qasymm8_signed)
- {
- _func = &output_stage_nchw<int8_t>;
- }
- else
- {
- _func = &output_stage_nchw<uint8_t>;
- }
- break;
- }
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- {
- _func = &output_stage_nchw<float16_t>;
- break;
- }
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- case DataType::F32:
- {
- _func = &output_stage_nchw<float>;
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR("Unsupported combination of types among the inputs.");
- }
- }
- }
- else
- {
- switch(src->data_type())
- {
- case DataType::S32:
- {
- if(is_qasymm8_signed)
- {
- _func = &output_stage_nhwc<int8_t>;
- }
- else
- {
- _func = &output_stage_nhwc<uint8_t>;
- }
- break;
- }
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- {
- _func = &output_stage_nhwc<float16_t>;
- break;
- }
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- case DataType::F32:
- {
- _func = &output_stage_nhwc<float>;
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR("Unsupported combination of types among the inputs.");
- }
- }
- }
-}
-
-Status CpuDirectConv2dOutputStageKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst,
- const DirectConvolutionLayerOutputStageKernelInfo &info)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, info));
- return Status{};
-}
-
-void CpuDirectConv2dOutputStageKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
- ARM_COMPUTE_ERROR_ON(_func == nullptr);
-
- auto src = tensors.get_tensor(TensorType::ACL_SRC_0);
- auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_1);
- auto dst = tensors.get_tensor(TensorType::ACL_DST);
-
- (*_func)(src, bias, window, dst, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift);
-}
-
-const char *CpuDirectConv2dOutputStageKernel::name() const
-{
- return "CpuDirectConv2dOutputStageKernel";
-}
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.h b/src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.h
deleted file mode 100644
index 62bc5d41c9..0000000000
--- a/src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.h
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_DIRECTCONV2D_OUTPUTSTAGE_KERNEL_H
-#define ARM_COMPUTE_CPU_DIRECTCONV2D_OUTPUTSTAGE_KERNEL_H
-
-#include "arm_compute/core/KernelDescriptors.h"
-#include "src/core/common/Macros.h"
-#include "src/core/cpu/ICpuKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-namespace cpu
-{
-namespace kernels
-{
-/** Kernel to accumulate the biases, if provided, or downscale in case of quantized input.
- *
- * @note We assume bias to be shared
- * @note For quantized computations (i.e. @p src of S32 type) the output data type for auto-initialization must be passed as part
- * of the @ref DirectConvolutionLayerOutputStageKernelInfo.
- */
-class CpuDirectConv2dOutputStageKernel : public ICpuKernel
-{
-public:
- /** Default constructor */
- CpuDirectConv2dOutputStageKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDirectConv2dOutputStageKernel);
- /** Set the accumulate buffer and the biases of the kernel.
- *
- * @param[in, out] src Input to add the bias to. If @p dst is not specified then accumulation is done in-place.
- * Data type supported: F16/F32/S32
- * @param[in] bias (Optional) The shared bias tensor to add. It must be 1D Tensor. Data type supported: Same as @p src
- * @param[out] dst (Optional) If the dst tensor is specified, the accumulation is done out-of-place. (Defaults to nullptr)
- * Note that in-place computation is only supported for F16/F32. For S32 this must not be nullptr.
- * Data type supported: F16/F32 or QASYMM8/QASYMM8_SIGNED if @p src is S32
- * @param[in] info (Optional) DirectConvolutionLayerOutputStageKernel descriptor metadata
- */
- void configure(ITensorInfo *src, const ITensorInfo *bias = nullptr, ITensorInfo *dst = nullptr,
- const DirectConvolutionLayerOutputStageKernelInfo &info = DirectConvolutionLayerOutputStageKernelInfo());
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to CpuDirectConv2dOutputStageKernel::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *bias = nullptr, const ITensorInfo *dst = nullptr,
- const DirectConvolutionLayerOutputStageKernelInfo &info = DirectConvolutionLayerOutputStageKernelInfo());
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
- const char *name() const override;
-
-private:
- using OutputStageKernel = void(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst,
- int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift);
-
- OutputStageKernel *_func{ nullptr };
- int _result_fixedpoint_multiplier{ 0 };
- int _result_shift{ 0 };
- int _result_offset_after_shift{ 0 };
-};
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CPU_DIRECTCONV2D_OUTPUTSTAGE_KERNEL_H */
diff --git a/src/core/cpu/kernels/CpuElementwiseKernel.cpp b/src/core/cpu/kernels/CpuElementwiseKernel.cpp
deleted file mode 100644
index 643a870540..0000000000
--- a/src/core/cpu/kernels/CpuElementwiseKernel.cpp
+++ /dev/null
@@ -1,354 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/cpu/kernels/CpuElementwiseKernel.h"
-
-#include "arm_compute/core/Helpers.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/common/Registrars.h"
-#include "src/core/cpu/kernels/elementwise/neon/elementwise_list.h"
-#include "src/core/cpu/kernels/elementwise/neon/elementwise_quantized_list.h"
-#include "src/core/cpu/kernels/elementwise/sve/elementwise_list.h"
-#include "src/core/cpu/kernels/elementwise/sve/elementwise_quantized_list.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-namespace
-{
-using ElementwiseSelector = std::add_pointer<bool(DataType)>::type;
-using UKernelType = CpuElementwiseKernel::ElementwiseFunction;
-struct ElementwiseKernel
-{
- const char *name;
- const ElementwiseSelector is_selected;
- UKernelType *ukernel;
-};
-
-template <DataType dt>
-inline bool is_selected(DataType data_type)
-{
- return dt == data_type;
-}
-
-template <DataType input_data_type, DataType output_data_type = input_data_type>
-static ElementwiseKernel generate_kernel(UKernelType *ukernel)
-{
-    // Build the name once per template instantiation: the returned struct keeps
-    // a raw const char *, so the string must outlive this call. Both inputs
-    // share a data type, hence the name pattern "op_<in>_<in>_<out>".
-    static const std::string kernel_name = std::string("op_") + string_from_data_type(input_data_type) + "_"
-                                           + string_from_data_type(input_data_type) + "_"
-                                           + string_from_data_type(output_data_type);
-
- return { kernel_name.c_str(), is_selected<input_data_type>, ukernel };
-}
-
-template <ArithmeticOperation op>
-std::function<void(const ITensor *, const ITensor *, ITensor *, const Window &)>
-configure_arithm_func(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
-{
- ARM_COMPUTE_UNUSED(src1, dst);
- static ElementwiseKernel kernels[] =
- {
-#if defined(ENABLE_SVE)
- generate_kernel<DataType::F32>(REGISTER_FP32_SVE((arm_compute::cpu::elementwise_arithmetic_op<op, float32_t>))),
- generate_kernel<DataType::S32>(REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_arithmetic_op<op, int32_t>))),
- generate_kernel<DataType::S16>(REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_arithmetic_op<op, int16_t>))),
-#endif /* defined(ENABLE_SVE) */
-#if defined(ENABLE_NEON)
- generate_kernel<DataType::F32>(REGISTER_FP32_NEON((arm_compute::cpu::elementwise_arithm_op<op, typename wrapper::traits::neon_vector<float, 4>>))),
- generate_kernel<DataType::S32>(REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_arithm_op<op, typename wrapper::traits::neon_vector<int32_t, 4>>))),
-#endif /* defined(ENABLE_NEON) */
-#if defined(__ARM_FEATURE_SVE2)
- generate_kernel<DataType::QASYMM8>(REGISTER_QASYMM8_SVE((arm_compute::cpu::elementwise_arithmetic_quantized_op<op, uint8_t>))),
- generate_kernel<DataType::QASYMM8_SIGNED>(REGISTER_QASYMM8_SIGNED_SVE((arm_compute::cpu::elementwise_arithmetic_quantized_op<op, int8_t>))),
-#else /* !defined(__ARM_FEATURE_SVE2) */
- generate_kernel<DataType::QASYMM8>(REGISTER_QASYMM8_NEON((arm_compute::cpu::elementwise_arithm_op_quantized<op>))),
- generate_kernel<DataType::QASYMM8_SIGNED>(REGISTER_QASYMM8_SIGNED_NEON((arm_compute::cpu::elementwise_arithm_op_quantized_signed<op>))),
-#endif /* defined(__ARM_FEATURE_SVE2) */
-#if defined(ENABLE_SVE)
- generate_kernel<DataType::F16>(REGISTER_FP16_SVE((arm_compute::cpu::elementwise_arithmetic_op<op, float16_t>))),
-#endif /* defined(ENABLE_SVE) */
-#if defined(ENABLE_NEON)
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
- generate_kernel<DataType::F16>(REGISTER_FP16_NEON((arm_compute::cpu::elementwise_arithm_op<op, typename wrapper::traits::neon_vector<float16_t, 8>>))),
-#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
- generate_kernel<DataType::S16>(REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_arithm_op<op, typename wrapper::traits::neon_vector<int16_t, 8>>))),
-#endif /* defined(ENABLE_NEON) */
- };
-
- for(const auto &uk : kernels)
- {
- if(uk.is_selected(src0->data_type()))
- {
- return uk.ukernel;
- }
- }
-
- return nullptr;
-}
-
-template <ComparisonOperation op>
-std::function<void(const ITensor *, const ITensor *, ITensor *, const Window &)>
-configure_comp_func(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
-{
- ARM_COMPUTE_UNUSED(src1, dst);
- static ElementwiseKernel kernels[] =
- {
-#if defined(ENABLE_SVE)
- generate_kernel<DataType::U8, DataType::U8>(REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_comparison_op<op, uint8_t>))),
- generate_kernel<DataType::F32, DataType::U8>(REGISTER_FP32_SVE((arm_compute::cpu::elementwise_comparison_op<op, float>))),
- generate_kernel<DataType::S16, DataType::U8>(REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_comparison_op<op, int16_t>))),
- generate_kernel<DataType::S32, DataType::U8>(REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_comparison_op<op, int32_t>))),
-#endif /* defined(ENABLE_SVE) */
-#if defined(ENABLE_NEON)
- generate_kernel<DataType::U8, DataType::U8>(REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_comp_op_8<op, uint8_t, uint8x16_t>))),
- generate_kernel<DataType::F32, DataType::U8>(REGISTER_FP32_NEON((arm_compute::cpu::elementwise_comp_op_32<op, float, float32x4_t>))),
- generate_kernel<DataType::S16, DataType::U8>(REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_comp_op_16<op, int16_t, int16x8_t>))),
- generate_kernel<DataType::S32, DataType::U8>(REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_comp_op_32<op, int32_t, int32x4_t>))),
-#endif /* defined(ENABLE_NEON) */
-#if defined(__ARM_FEATURE_SVE2)
- generate_kernel<DataType::QASYMM8_SIGNED, DataType::U8>(REGISTER_QASYMM8_SIGNED_SVE((arm_compute::cpu::elementwise_comparison_quantized_op<op, int8_t>))),
- generate_kernel<DataType::QASYMM8, DataType::U8>(REGISTER_QASYMM8_SVE((arm_compute::cpu::elementwise_comparison_quantized_op<op, uint8_t>))),
-#else /* !defined(__ARM_FEATURE_SVE2) */
- generate_kernel<DataType::QASYMM8_SIGNED, DataType::U8>(REGISTER_QASYMM8_SIGNED_NEON((arm_compute::cpu::elementwise_comp_op_quantized_signed<op>))),
- generate_kernel<DataType::QASYMM8, DataType::U8>(REGISTER_QASYMM8_NEON((arm_compute::cpu::elementwise_comp_op_quantized<op>))),
-#endif /* defined(__ARM_FEATURE_SVE2) */
-#if defined(ENABLE_SVE)
- generate_kernel<DataType::F16, DataType::U8>(REGISTER_FP16_SVE((arm_compute::cpu::elementwise_comparison_op<op, float16_t>))),
-#endif /* defined(ENABLE_SVE) */
-#if defined(ENABLE_NEON) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
- generate_kernel<DataType::F16, DataType::U8>(REGISTER_FP16_NEON((arm_compute::cpu::elementwise_comp_op_16<op, float16_t, float16x8_t>))),
-#endif /* defined(ENABLE_NEON) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
- };
-
- for(const auto &uk : kernels)
- {
- if(uk.is_selected(src0->data_type()))
- {
- return uk.ukernel;
- }
- }
-
- return nullptr;
-}
-} // namespace
-
-Status CpuElementwiseKernel::validate_arguments_common(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src0);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &src1);
-
- const TensorShape out_shape = TensorShape::broadcast_shape(src0.tensor_shape(), src1.tensor_shape());
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
-
- // Validate in case of configured dst
- if(dst.total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0),
- "Wrong shape for output");
- }
-
- return Status{};
-}
-
-void CpuElementwiseKernel::configure_common(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
-
- // If any of shapes is dynamic, expect a configured window and dst at run-time.
- if(src0->is_dynamic() || src1->is_dynamic())
- {
- return;
- }
-
- auto shape_and_window = compute_output_shape_and_window(src0->tensor_shape(), src1->tensor_shape());
- auto_init_if_empty(*dst, shape_and_window.first, 1, src0->data_type());
- ICpuKernel::configure(shape_and_window.second);
-}
-
-void CpuElementwiseKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
-
- auto src0 = tensors.get_const_tensor(TensorType::ACL_SRC_0);
- auto src1 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
- auto dst = tensors.get_tensor(TensorType::ACL_DST);
-
- auto function = get_implementation(src0->info(), src1->info(), dst->info());
- ARM_COMPUTE_ERROR_ON(function == nullptr);
- function(src0, src1, dst, window);
-}
-
-/** Arithmetic operators (min, max, squared_diff, prelu, div, power) */
-void CpuArithmeticKernel::configure(ArithmeticOperation op, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
-{
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst));
- configure_common(src0, src1, dst);
- _op = op;
-}
-
-Status CpuArithmeticKernel::validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::F16, DataType::S32, DataType::F32);
- // Validate in case of configured dst
- if(dst.total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &dst);
- }
- return validate_arguments_common(src0, src1, dst);
-}
-
-Status CpuArithmeticKernel::validate(ArithmeticOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
-{
- ARM_COMPUTE_UNUSED(op);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*src0, *src1, *dst));
- return Status{};
-}
-
-std::function<CpuElementwiseKernel::ElementwiseFunction>
-CpuArithmeticKernel::get_implementation(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
-{
- switch(_op)
- {
- case ArithmeticOperation::MAX:
- return configure_arithm_func<ArithmeticOperation::MAX>(src0, src1, dst);
- case ArithmeticOperation::MIN:
- return configure_arithm_func<ArithmeticOperation::MIN>(src0, src1, dst);
- case ArithmeticOperation::SQUARED_DIFF:
- return configure_arithm_func<ArithmeticOperation::SQUARED_DIFF>(src0, src1, dst);
- case ArithmeticOperation::PRELU:
- return configure_arithm_func<ArithmeticOperation::PRELU>(src0, src1, dst);
- case ArithmeticOperation::DIV:
- return configure_arithm_func<ArithmeticOperation::DIV>(src0, src1, dst);
- case ArithmeticOperation::POWER:
- return configure_arithm_func<ArithmeticOperation::POWER>(src0, src1, dst);
- default:
- ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
- }
- return nullptr;
-}
-
-/** The division operator */
-
-void CpuDivisionKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
-{
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst));
- configure_common(src0, src1, dst);
- _op = ArithmeticOperation::DIV;
-}
-
-Status CpuDivisionKernel::validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::S32, DataType::F16, DataType::F32);
- return CpuArithmeticKernel::validate_arguments(src0, src1, dst);
-}
-
-Status CpuDivisionKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*src0, *src1, *dst));
- return Status{};
-}
-
-/** The power operator */
-void CpuPowerKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
-{
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst));
- configure_common(src0, src1, dst);
- _op = ArithmeticOperation::POWER;
-}
-
-Status CpuPowerKernel::validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::F16, DataType::F32);
- return CpuArithmeticKernel::validate_arguments(src0, src1, dst);
-}
-
-Status CpuPowerKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*src0, *src1, *dst));
- return Status{};
-}
-
-/** Comparison operators (equal, not equal, less than, greater than, less than or equal, greater than or equal) */
-void CpuComparisonKernel::configure(ComparisonOperation op, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
-{
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst));
- configure_common(src0, src1, dst);
- _op = op;
-}
-
-Status CpuComparisonKernel::validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::F16, DataType::S32, DataType::F32);
- // Validate in case of configured dst
- if(dst.total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&dst, 1, DataType::U8);
- }
- return validate_arguments_common(src0, src1, dst);
-}
-
-Status CpuComparisonKernel::validate(ComparisonOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
-{
- ARM_COMPUTE_UNUSED(op);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*src0, *src1, *dst));
- return Status{};
-}
-
-std::function<CpuElementwiseKernel::ElementwiseFunction>
-CpuComparisonKernel::get_implementation(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
-{
- switch(_op)
- {
- case ComparisonOperation::Equal:
- return configure_comp_func<ComparisonOperation::Equal>(src0, src1, dst);
- case ComparisonOperation::NotEqual:
- return configure_comp_func<ComparisonOperation::NotEqual>(src0, src1, dst);
- case ComparisonOperation::Greater:
- return configure_comp_func<ComparisonOperation::Greater>(src0, src1, dst);
- case ComparisonOperation::GreaterEqual:
- return configure_comp_func<ComparisonOperation::GreaterEqual>(src0, src1, dst);
- case ComparisonOperation::Less:
- return configure_comp_func<ComparisonOperation::Less>(src0, src1, dst);
- case ComparisonOperation::LessEqual:
- return configure_comp_func<ComparisonOperation::LessEqual>(src0, src1, dst);
- default:
- ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
- }
- return nullptr;
-}
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
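
Both configure_arithm_func() and configure_comp_func() above resolve a micro-kernel by scanning a static table of {name, predicate, function pointer} entries, built per data type by generate_kernel(). A self-contained sketch of that dispatch pattern, with illustrative names only (Entry, lookup and max_f32 are not ACL symbols):

    enum class DataType { F32, S32 };

    using UKernel = void(const float *, const float *, float *, int);

    struct Entry
    {
        const char *name;
        bool (*is_selected)(DataType);
        UKernel *ukernel;
    };

    // One candidate implementation: element-wise max over a contiguous run.
    void max_f32(const float *a, const float *b, float *d, int n)
    {
        for(int i = 0; i < n; ++i)
        {
            d[i] = (a[i] > b[i]) ? a[i] : b[i];
        }
    }

    template <DataType dt>
    bool is_selected(DataType query)
    {
        return query == dt;
    }

    static const Entry table[] = { { "op_f32_f32_f32", is_selected<DataType::F32>, max_f32 } };

    // Linear scan, first match wins; nullptr plays the role of a failed validate().
    UKernel *lookup(DataType dt)
    {
        for(const auto &e : table)
        {
            if(e.is_selected(dt))
            {
                return e.ukernel;
            }
        }
        return nullptr;
    }

Table order doubles as priority: SVE entries precede Neon ones, so the widest available implementation is picked first on capable hardware.
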
diff --git a/src/core/cpu/kernels/CpuElementwiseKernel.h b/src/core/cpu/kernels/CpuElementwiseKernel.h
deleted file mode 100644
index 952c6e3e25..0000000000
--- a/src/core/cpu/kernels/CpuElementwiseKernel.h
+++ /dev/null
@@ -1,239 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_ELEMENTWISE_KERNEL_H
-#define ARM_COMPUTE_CPU_ELEMENTWISE_KERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/cpu/ICpuKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-namespace cpu
-{
-namespace kernels
-{
-/** Interface for an element-wise operation kernel
- *
- * Element-wise operation is computed by:
- * @f[ dst(x,y) = OP(src0(x,y), src1(x,y))@f]
- *
- */
-class CpuElementwiseKernel : public ICpuKernel
-{
-public:
- const char *name() const override
- {
- return "CpuElementwiseKernel";
- }
-
- CpuElementwiseKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuElementwiseKernel);
-
- /** Common signature for all the specialised arithmetic functions
- *
-     * @param[in] src0   First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
- * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0.
- * @param[out] dst Output tensor info. Data types supported: Dependent on subclass.
- * @param[in] window Region on which to execute the kernel.
- */
- using ElementwiseFunction = void(const ITensor *, const ITensor *, ITensor *, const Window &);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
-
-protected:
- /** Validate the argument passed to the kernel
- *
-     * @param[in] src0 First tensor input. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
- * @param[in] src1 Second tensor input. Data types supported: Same as @p src0.
- * @param[in] dst Output tensor. Data types supported: Dependent on subclass.
- */
- static Status validate_arguments_common(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst);
-
-    /** Common configure function for element-wise operators with no additional options (e.g. Min, Max, SquaredDiff)
- *
- */
- void configure_common(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst);
-
- /** Function to get the micro kernel implementation
- *
- * @param[in] src0 First input tensor information
- * @param[in] src1 Second input tensor information
- * @param[in] dst Output tensor information
- *
- * @return the function instance for the micro kernel
- */
- virtual std::function<ElementwiseFunction> get_implementation(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) = 0;
-};
-
-class CpuArithmeticKernel : public CpuElementwiseKernel
-{
-public:
- /** Default constructor */
- CpuArithmeticKernel() = default;
-
- /** Configure kernel
- *
- * @param[in] op Arithmetic operation to be executed.
-     * @param[in]  src0 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
- * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0.
- * @param[out] dst Output tensor info. Data types supported: Same as @p src0.
- */
- void configure(ArithmeticOperation op, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst);
-
- /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuArithmeticKernel
- *
- * @param[in] op Arithmetic operation to be executed.
-     * @param[in] src0 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
- * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0.
- * @param[in] dst Output tensor info. Data types supported: Same as @p src0.
- *
- * @return a Status
- */
- static Status validate(ArithmeticOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
-
-protected:
- // Inherited methods overridden:
- static Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst);
-
- ArithmeticOperation _op{};
-
-private:
- /** Function to get the micro kernel implementation
- *
- * @param[in] src0 First input tensor information
- * @param[in] src1 Second input tensor information
- * @param[in] dst Output tensor information
- *
- * @return the function instance for the micro kernel
- */
- std::function<ElementwiseFunction> get_implementation(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) override;
-};
-
-class CpuDivisionKernel : public CpuArithmeticKernel
-{
-public:
- /** Default constructor */
- CpuDivisionKernel() = default;
-
- /** Configure kernel
- *
- * @param[in] src0 First tensor input info. Data types supported: S32/F16/F32.
- * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0.
- * @param[out] dst Output tensor info. Data types supported: Same as @p src0.
- */
- void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst);
-
- /** Static function to check if given info will lead to a valid configuration of @ref CpuDivisionKernel
- *
- * @param[in] src0 First tensor input info. Data types supported: S32/F16/F32.
- * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0.
- * @param[in] dst Output tensor info. Data types supported: Same as @p src0.
- *
- * @return a Status
- */
- static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
-
-protected:
- // Inherited methods overridden:
- static Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst);
-};
-
-class CpuPowerKernel : public CpuArithmeticKernel
-{
-public:
- /** Default constructor */
- CpuPowerKernel() = default;
-
- /** Configure kernel
- *
- * @param[in] src0 First tensor input info. Data types supported: F16/F32.
- * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0.
- * @param[out] dst Output tensor info. Data types supported: Same as @p src0.
- */
- void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst);
-
- /** Static function to check if given info will lead to a valid configuration of @ref CpuPowerKernel
- *
- * @param[in] src0 First tensor input info. Data types supported: F16/F32.
- * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0.
- * @param[in] dst Output tensor info. Data types supported: Same as @p src0.
- *
- * @return a Status
- */
- static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
-
-protected:
- // Inherited methods overridden:
- static Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst);
-};
-
-class CpuComparisonKernel : public CpuElementwiseKernel
-{
-public:
- /** Default constructor */
- CpuComparisonKernel() = default;
-
- /** Configure kernel
- *
-     * @param[in]  op   Comparison operation to be executed.
-     * @param[in]  src0 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
- * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0.
- * @param[out] dst Output tensor info. Data types supported: U8.
- */
- void configure(ComparisonOperation op, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst);
-
- /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuComparisonKernel
- *
- * @param[in] op Comparison operation to be executed.
-     * @param[in] op   Comparison operation to be executed.
-     * @param[in] src0 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
- * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0.
- * @param[in] dst Output tensor info. Data types supported: U8.
- *
- * @return a Status
- */
- static Status validate(ComparisonOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
-
-protected:
- // Inherited methods overridden:
- static Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst);
-
-private:
- /** Function to get the micro kernel implementation
- *
- * @param[in] src0 First input tensor information
- * @param[in] src1 Second input tensor information
- * @param[in] dst Output tensor information
- *
- * @return the function instance for the micro kernel
- */
- std::function<ElementwiseFunction> get_implementation(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) override;
-
- ComparisonOperation _op{};
-};
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_KERNEL_H */
\ No newline at end of file
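
Taken together with the .cpp above, the intended call sequence is validate, configure, then run_op with an ITensorPack binding the ACL_SRC_0/ACL_SRC_1/ACL_DST slots. A sketch assuming a hypothetical single-threaded caller and already-allocated ITensor objects (src0_tensor, src1_tensor, dst_tensor are placeholders):

    TensorInfo a(TensorShape(32U, 4U), 1, DataType::F32);
    TensorInfo b(TensorShape(32U, 4U), 1, DataType::F32);
    TensorInfo out; // empty: configure() auto-initialises it

    CpuArithmeticKernel k;
    ARM_COMPUTE_ERROR_THROW_ON(CpuArithmeticKernel::validate(ArithmeticOperation::MAX, &a, &b, &out));
    k.configure(ArithmeticOperation::MAX, &a, &b, &out);

    // At run time the operator layer binds concrete tensors to the same slots:
    ITensorPack pack;
    pack.add_const_tensor(TensorType::ACL_SRC_0, src0_tensor); // ITensor *, placeholder
    pack.add_const_tensor(TensorType::ACL_SRC_1, src1_tensor);
    pack.add_tensor(TensorType::ACL_DST, dst_tensor);
    k.run_op(pack, k.window(), ThreadInfo{}); // single-threaded; a scheduler would split the window
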
diff --git a/src/core/cpu/kernels/CpuElementwiseUnaryKernel.cpp b/src/core/cpu/kernels/CpuElementwiseUnaryKernel.cpp
deleted file mode 100644
index 2600a49b70..0000000000
--- a/src/core/cpu/kernels/CpuElementwiseUnaryKernel.cpp
+++ /dev/null
@@ -1,174 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/cpu/kernels/CpuElementwiseUnaryKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/common/Registrars.h"
-#include "src/core/cpu/kernels/elementwise/neon/elementwise_unary_list.h"
-#include "src/core/cpu/kernels/elementwise/sve/elementwise_unary_list.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/ToolchainSupport.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-namespace
-{
-using ElementwiseUnarySelector = std::add_pointer<bool(DataType)>::type;
-
-struct ElementwiseUnaryKernel
-{
- const char *name;
- const ElementwiseUnarySelector is_selected;
- CpuElementwiseUnaryKernel::ElementwiseUnaryUkernelPtr ukernel;
-};
-
-static const ElementwiseUnaryKernel available_kernels[] =
-{
-#if defined(ENABLE_SVE)
- {
- "fp32_sve_elementwise_unary",
- [](DataType dt) { return dt == DataType::F32; },
- REGISTER_FP32_SVE(arm_compute::cpu::elementwise_sve_op<float>),
- },
- {
- "fp16_sve_elementwise_unary",
- [](DataType dt) { return dt == DataType::F16; },
- REGISTER_FP16_SVE(arm_compute::cpu::elementwise_sve_op<__fp16>),
- },
- {
- "s32_sve_elementwise_unary",
- [](DataType dt) { return dt == DataType::S32; },
- REGISTER_INTEGER_SVE(arm_compute::cpu::elementwise_sve_op<int32_t>),
- },
-#endif // defined(ENABLE_SVE)
-#if defined(ENABLE_NEON)
- {
- "fp32_neon_elementwise_unary",
- [](DataType dt) { return dt == DataType::F32; },
- REGISTER_FP32_NEON(arm_compute::cpu::elementwise_op<float>),
- },
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
- {
- "fp16_neon_elementwise_unary",
- [](DataType dt) { return dt == DataType::F16; },
-        REGISTER_FP16_NEON(arm_compute::cpu::elementwise_op<__fp16>),
- },
-#endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
- {
- "s32_neon_elementwise_unary",
- [](DataType dt) { return dt == DataType::S32; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::elementwise_op<int32_t>),
- },
-#endif // defined(ENABLE_NEON)
-};
-
-const ElementwiseUnaryKernel *get_implementation(DataType dt)
-{
- for(const auto &uk : available_kernels)
- {
- if(uk.is_selected(dt))
- {
- return &uk;
- }
- }
- return nullptr;
-}
-} // namespace
-
-CpuElementwiseUnaryKernel::CpuElementwiseUnaryKernel()
- : _op()
-{
-}
-
-void CpuElementwiseUnaryKernel::configure(ElementWiseUnary op, const ITensorInfo &src, ITensorInfo &dst)
-{
- ARM_COMPUTE_ERROR_THROW_ON(validate(op, src, dst));
-
- _op = op;
-
- // If input shape is dynamic, expect a configured window and dst at run-time.
- if(src.is_dynamic())
- {
- return;
- }
-
- auto shape_and_window = compute_output_shape_and_window(src.tensor_shape());
- auto_init_if_empty(dst, shape_and_window.first, 1, src.data_type());
- ICpuKernel::configure(shape_and_window.second);
-}
-
-Status CpuElementwiseUnaryKernel::validate(ElementWiseUnary op, const ITensorInfo &src, const ITensorInfo &dst)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src);
-
- const auto *uk = get_implementation(src.data_type());
- ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
-
- switch(op)
- {
- case ElementWiseUnary::EXP:
- case ElementWiseUnary::RSQRT:
- case ElementWiseUnary::LOG:
- case ElementWiseUnary::ROUND:
- case ElementWiseUnary::SIN:
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32);
- break;
- case ElementWiseUnary::NEG:
- case ElementWiseUnary::ABS:
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32, DataType::S32);
- break;
- default:
- ARM_COMPUTE_ERROR("ElementWiseUnary operation not supported");
- }
- // Validate in case of configured dst
- if(dst.total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst);
- }
-
- return Status{};
-}
-
-void CpuElementwiseUnaryKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
-
- auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
- auto dst = tensors.get_tensor(TensorType::ACL_DST);
- auto func = get_implementation(src->info()->data_type())->ukernel;
- ARM_COMPUTE_ERROR_ON(func == nullptr);
- func(src, dst, window, _op);
-}
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
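
The dispatch above only selects a function pointer; the chosen ukernel then applies one unary operation across each window row. A scalar reference of the assumed per-element semantics (the real kernels vectorise with Neon/SVE; the op meanings here are inferred from the enum names, e.g. ROUND as round-to-nearest):

    #include <cmath>
    #include <cstddef>

    enum class ElementWiseUnary { RSQRT, EXP, NEG, LOG, ABS, SIN, ROUND };

    // Scalar stand-in for a unary micro-kernel body: one op per element.
    void unary_ref(const float *src, float *dst, std::size_t n, ElementWiseUnary op)
    {
        for(std::size_t i = 0; i < n; ++i)
        {
            const float x = src[i];
            switch(op)
            {
                case ElementWiseUnary::RSQRT: dst[i] = 1.f / std::sqrt(x); break;
                case ElementWiseUnary::EXP:   dst[i] = std::exp(x);        break;
                case ElementWiseUnary::NEG:   dst[i] = -x;                 break;
                case ElementWiseUnary::LOG:   dst[i] = std::log(x);        break;
                case ElementWiseUnary::ABS:   dst[i] = std::fabs(x);       break;
                case ElementWiseUnary::SIN:   dst[i] = std::sin(x);        break;
                case ElementWiseUnary::ROUND: dst[i] = std::nearbyint(x);  break;
            }
        }
    }
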
diff --git a/src/core/cpu/kernels/CpuElementwiseUnaryKernel.h b/src/core/cpu/kernels/CpuElementwiseUnaryKernel.h
deleted file mode 100644
index ceb90dcf70..0000000000
--- a/src/core/cpu/kernels/CpuElementwiseUnaryKernel.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_KERNEL_H
-#define ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_KERNEL_H
-
-#include "arm_compute/core/Types.h"
-#include "src/core/common/Macros.h"
-#include "src/core/cpu/ICpuKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-namespace cpu
-{
-namespace kernels
-{
-/** Interface for an element-wise unary operation kernel
- *
- * Element-wise operation is computed by:
- * @f[ dst(x) = OP(src(x))@f]
- *
- */
-class CpuElementwiseUnaryKernel : public ICpuKernel
-{
-public:
- const char *name() const override
- {
- return "CpuElementwiseUnaryKernel";
- }
- /** Default constructor */
- CpuElementwiseUnaryKernel();
- /** Default destructor */
- ~CpuElementwiseUnaryKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuElementwiseUnaryKernel);
-
- /** Function to configure the @ref CpuElementwiseUnaryKernel
- *
-     * @param[in]  op  Unary operation to be executed.
-     * @param[in]  src First tensor input. Data types supported: F16/F32; NEG/ABS also support S32.
- * @param[out] dst Output tensor. Data types supported: Same as @p src.
- */
- void configure(ElementWiseUnary op, const ITensorInfo &src, ITensorInfo &dst);
-
- /** Static function to check if given info will lead to a valid configuration of @ref CpuElementwiseUnaryKernel
- *
-     * @param[in] op  Unary operation to be executed.
-     * @param[in] src First tensor input info. Data types supported: F16/F32; NEG/ABS also support S32.
- * @param[in] dst Output tensor info. Data types supported: Same as @p src.
- *
- * @return a Status
- */
- static Status validate(ElementWiseUnary op, const ITensorInfo &src, const ITensorInfo &dst);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
-
- /** Common signature for all the specialised elementwise unary micro-kernels
- *
- * @param[in] window Region on which to execute the kernel.
- */
- using ElementwiseUnaryUkernelPtr = std::add_pointer<void(const ITensor *, ITensor *, const Window &, ElementWiseUnary)>::type;
-
-private:
- ElementWiseUnary _op;
-};
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_KERNEL_H */
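
A short usage sketch, under the same assumptions as the arithmetic example earlier; note that, unlike the binary kernels, this interface takes references rather than pointers:

    TensorInfo src(TensorShape(64U), 1, DataType::F32);
    TensorInfo dst; // empty: configure() auto-initialises it to match src

    CpuElementwiseUnaryKernel k;
    ARM_COMPUTE_ERROR_THROW_ON(CpuElementwiseUnaryKernel::validate(ElementWiseUnary::RSQRT, src, dst));
    k.configure(ElementWiseUnary::RSQRT, src, dst);
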
diff --git a/src/core/cpu/kernels/CpuFillKernel.cpp b/src/core/cpu/kernels/CpuFillKernel.cpp
deleted file mode 100644
index aab4d715ee..0000000000
--- a/src/core/cpu/kernels/CpuFillKernel.cpp
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/cpu/kernels/CpuFillKernel.h"
-
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-void CpuFillKernel::configure(const ITensorInfo *tensor, const PixelValue &constant_value)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
- _constant_value = constant_value;
-
- // Configure kernel window
- Window win = calculate_max_window(*tensor, Steps());
- ICpuKernel::configure(win);
-}
-
-void CpuFillKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
-
- auto inout = tensors.get_tensor(TensorType::ACL_SRC_DST);
-
- // Collapse all the batches on the third dimension
- bool has_collapsed = true;
- Window collapsed = window.collapse_if_possible(window, Window::DimZ, &has_collapsed);
- ARM_COMPUTE_ERROR_ON(!has_collapsed);
-
- uint8_t *const start_valid_region = inout->ptr_to_element(inout->info()->valid_region().anchor);
- const auto window_width = static_cast<int>(collapsed.x().end()) - static_cast<int>(collapsed.x().start());
- const size_t element_size = inout->info()->element_size();
-
- // Unroll X dimension
- collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator tensor_it(inout, collapsed);
- execute_window_loop(collapsed, [&](const Coordinates &)
- {
- uint8_t *base_addr = start_valid_region + tensor_it.offset();
- // Set memory
- for(int i = 0; i < window_width; ++i)
- {
- std::memcpy(base_addr + i * element_size, &_constant_value.value, element_size);
- }
-
- },
- tensor_it);
-}
-
-const char *CpuFillKernel::name() const
-{
- return "CpuFillKernel";
-}
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
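
The run_op() loop above is deliberately data-type agnostic: it copies element_size bytes of the PixelValue once per element along the unrolled X run, so one kernel serves all data types. The same idea as a free function (names assumed, not ACL symbols):

    #include <cstdint>
    #include <cstring>

    // Fill one contiguous row of 'width' elements with a constant value of
    // 'element_size' bytes, exactly as the window loop above does per row.
    void fill_row(uint8_t *base, const void *value, std::size_t element_size, int width)
    {
        for(int i = 0; i < width; ++i)
        {
            std::memcpy(base + static_cast<std::size_t>(i) * element_size, value, element_size);
        }
    }

A memset would only work for single-byte patterns; the per-element memcpy handles arbitrary element widths at the cost of some speed.
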
diff --git a/src/core/cpu/kernels/CpuFillKernel.h b/src/core/cpu/kernels/CpuFillKernel.h
deleted file mode 100644
index 9afdee4186..0000000000
--- a/src/core/cpu/kernels/CpuFillKernel.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_FILL_KERNEL_H
-#define ARM_COMPUTE_CPU_FILL_KERNEL_H
-
-#include "arm_compute/core/PixelValue.h"
-#include "src/core/common/Macros.h"
-#include "src/core/cpu/ICpuKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-/** Kernel for filling a tensor with a given constant value */
-class CpuFillKernel : public ICpuKernel
-{
-public:
- CpuFillKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuFillKernel);
- /** Configure kernel for a given list of arguments
- *
- * @param[in,out] tensor Tensor to fill. Supported data types: All
- * @param[in] constant_value The value used to fill the planes of the tensor
- */
- void configure(const ITensorInfo *tensor, const PixelValue &constant_value);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
- const char *name() const override;
-
-private:
- PixelValue _constant_value{};
-};
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_FILL_KERNEL_H */
diff --git a/src/core/cpu/kernels/CpuFloorKernel.cpp b/src/core/cpu/kernels/CpuFloorKernel.cpp
deleted file mode 100644
index c2e9d48ce9..0000000000
--- a/src/core/cpu/kernels/CpuFloorKernel.cpp
+++ /dev/null
@@ -1,175 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/cpu/kernels/CpuFloorKernel.h"
-
-#include "arm_compute/core/Coordinates.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include "src/core/common/Registrars.h"
-#include "src/core/cpu/kernels/floor/list.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-namespace
-{
-struct FloorSelectorData
-{
- DataType dt;
-};
-
-using FloorSelectorPtr = std::add_pointer<bool(const FloorSelectorData &data)>::type;
-using FloorUKernelPtr = std::add_pointer<void(const void *, void *, int)>::type;
-
-struct FloorUKernel
-{
- const char *name;
- const FloorSelectorPtr is_selected;
- FloorUKernelPtr func;
-};
-
-static const FloorUKernel available_kernels[] =
-{
- {
- "fp16_neon_floor",
- [](const FloorSelectorData & data) { return data.dt == DataType::F16; },
- REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_floor)
- },
- {
- "f32_neon_floor",
- [](const FloorSelectorData & data) { return data.dt == DataType::F32; },
- REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_floor)
- },
-};
-
-/** Micro-kernel selector
- *
- * @param[in] data Selection data passed to help pick the appropriate micro-kernel
- *
- * @return A matching micro-kernel else nullptr
- */
-const FloorUKernel *get_implementation(const FloorSelectorData &data)
-{
- for(const auto &uk : available_kernels)
- {
- if(uk.is_selected(data))
- {
- return &uk;
- }
- }
- return nullptr;
-}
-
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
-
- const auto *uk = get_implementation(FloorSelectorData{ src->data_type() });
- ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->func == nullptr);
-
- // Validate in case of configured output
- if(dst->total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
- }
-
- return Status{};
-}
-} // namespace
-
-void CpuFloorKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
-
- // Auto initialize output
- auto_init_if_empty(*dst, src->tensor_shape(), 1, src->data_type());
-
- // Validate
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst));
-
- // Configure kernel window
- const Window win = calculate_max_window(*src, Steps());
-
- ICPPKernel::configure(win);
-}
-
-Window CpuFloorKernel::infer_window(const ITensorInfo *src, const ITensorInfo *dst)
-{
- ARM_COMPUTE_UNUSED(dst);
- ARM_COMPUTE_ERROR_ON(!bool(validate_arguments(src, dst)));
-
- Window win;
- win.use_tensor_dimensions(src->tensor_shape());
- return win;
-}
-
-Status CpuFloorKernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst));
-    return Status{};
-}
-
-void CpuFloorKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- ARM_COMPUTE_ERROR_ON(tensors.empty());
-
- const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC);
- ITensor *dst = tensors.get_tensor(TensorType::ACL_DST);
-
- const auto len = static_cast<int>(window.x().end()) - static_cast<int>(window.x().start());
- const auto *ukernel = get_implementation(FloorSelectorData{ src->info()->data_type() });
-
- Window win{ window };
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator src_it(src, win);
- Iterator dst_it(dst, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- ukernel->func(src_it.ptr(), dst_it.ptr(), len);
- },
- src_it, dst_it);
-}
-
-const char *CpuFloorKernel::name() const
-{
- return "CpuFloorKernel";
-}
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
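
run_op() above collapses the X dimension and hands each row to a ukernel matching FloorUKernelPtr, i.e. void(const void *, void *, int). A scalar stand-in for the fp32 entry (the real one vectorises, e.g. with vrndmq_f32; this reference only mirrors the assumed semantics):

    #include <cmath>

    // Floors a contiguous run of 'len' floats; signature matches FloorUKernelPtr.
    void fp32_floor_ref(const void *src, void *dst, int len)
    {
        const auto *in  = static_cast<const float *>(src);
        auto       *out = static_cast<float *>(dst);
        for(int i = 0; i < len; ++i)
        {
            out[i] = std::floor(in[i]);
        }
    }
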
diff --git a/src/core/cpu/kernels/CpuFloorKernel.h b/src/core/cpu/kernels/CpuFloorKernel.h
deleted file mode 100644
index 2680871b45..0000000000
--- a/src/core/cpu/kernels/CpuFloorKernel.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_FLOOR_KERNEL_H
-#define ARM_COMPUTE_CPU_FLOOR_KERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/cpu/ICpuKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-/** CPU-accelerated kernel to perform a floor operation */
-class CpuFloorKernel : public ICpuKernel
-{
-public:
- CpuFloorKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuFloorKernel);
- /** Configure kernel for a given list of arguments
- *
- * @param[in] src Source tensor. Data type supported: F16/F32.
- * @param[out] dst Destination tensor. Same as @p src
- */
- void configure(const ITensorInfo *src, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration of @ref CpuFloorKernel
- *
- * @param[in] src Source tensor info. Data type supported: F16/F32.
- * @param[in] dst Destination tensor info. Same as @p src
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
- /** Infer execution window
- *
- * @param[in] src Source tensor info. Data type supported: F16/F32.
- * @param[in] dst Destination tensor info. Same as @p src
- *
- * @return an execution Window
- */
- Window infer_window(const ITensorInfo *src, const ITensorInfo *dst);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
- const char *name() const override;
-};
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_FLOOR_KERNEL_H */
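
infer_window() is the distinctive piece of this interface: with dynamic shapes, configure() cannot fix a window up front, so the operator asks for one at run time. A sketch assuming a hypothetical caller (only the signatures come from the header above):

    TensorInfo src(TensorShape(128U, 24U), 1, DataType::F32);
    TensorInfo dst(src);

    CpuFloorKernel k;
    k.configure(&src, &dst);
    const Window win = k.infer_window(&src, &dst); // spans src's shape, per the .cpp above
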
diff --git a/src/core/cpu/kernels/CpuMulKernel.cpp b/src/core/cpu/kernels/CpuMulKernel.cpp
deleted file mode 100644
index 82ec322875..0000000000
--- a/src/core/cpu/kernels/CpuMulKernel.cpp
+++ /dev/null
@@ -1,1729 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/cpu/kernels/CpuMulKernel.h"
-
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/NEAsymm.h"
-#include "src/core/NEON/NESymm.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-namespace
-{
-const float scale255_constant = 1.f / 255.f;
-const float32x4_t scale255_constant_f32q = vdupq_n_f32(scale255_constant);
-const float32x4_t positive_round_f32q = vdupq_n_f32(0.5f);
-
-inline Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
-{
- ARM_COMPUTE_UNUSED(overflow_policy);
- ARM_COMPUTE_UNUSED(rounding_policy);
-
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src1);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32, DataType::QSYMM16, DataType::F16,
- DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32, DataType::QSYMM16, DataType::F16,
- DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
- DataType::S16, DataType::QSYMM16,
- DataType::S32, DataType::F16, DataType::F32);
- if(is_data_type_quantized(src1->data_type()) || is_data_type_quantized(src2->data_type()))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(overflow_policy == ConvertPolicy::WRAP, "ConvertPolicy cannot be WRAP if datatype is quantized");
- }
-
- if(dst->total_size() > 0)
- {
- const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
- // clang-format off
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(
- !(src1->data_type() == src2->data_type() && src2->data_type() == dst->data_type()) &&
- !(src1->data_type() == DataType::U8 && src2->data_type() == DataType::U8 && dst->data_type() == DataType::S16) &&
- !(src1->data_type() == DataType::U8 && src2->data_type() == DataType::S16 && dst->data_type() == DataType::S16) &&
-        !(src1->data_type() == DataType::S16 && src2->data_type() == DataType::U8 && dst->data_type() == DataType::S16) &&
- !(src1->data_type() == DataType::QSYMM16 && src2->data_type() == DataType::QSYMM16 && dst->data_type() == DataType::S32)
- , "Invalid data type combination");
- // clang-format on
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->data_type() == DataType::QSYMM16 && dst->data_type() == DataType::S32 && scale != 1.f, "Unsupported scale for QSYMM16 inputs and S32 dst");
- }
-
- if(std::abs(scale - scale255_constant) < 0.00001f)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(rounding_policy != RoundingPolicy::TO_NEAREST_UP && rounding_policy != RoundingPolicy::TO_NEAREST_EVEN);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->data_type() == DataType::S32 && src2->data_type() == DataType::S32 && dst->data_type() == DataType::S32,
- "Scale == 1/255 is not supported if input and dst are of data type S32");
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON(rounding_policy != RoundingPolicy::TO_ZERO);
-
- int exponent = 0;
- const float normalized_mantissa = std::frexp(scale, &exponent);
-
- // Use int scaling if factor is equal to 1/2^n for 0 <= n <= 15
-        // frexp returns 0.5 as the mantissa, so for scale = 1/2^n with 0 <= n <= 15 the exponent lies in -14 <= e <= 1
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(!((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1)), "Scale value not supported (Should be 1/(2^n) or 1/255)");
- }
-
- return Status{};
-}
-
-/** Scales a given vector by 1/255.
- *
- * @note This does not work for all cases. e.g. for float of 0.49999999999999994 and large floats.
- *
- * @param in Input vector to scale.
- * @return Scaled dst rounded to nearest (round half up).
- */
-inline int32x4_t scale255_S32_S32(int32x4_t in)
-{
- // Scale
- const float32x4_t tmp = vmulq_f32(vcvtq_f32_s32(in), scale255_constant_f32q);
- // Round to nearest (round half up)
- // Add +0.5 for all values
- // Afterwards vcvt rounds toward zero
- return vcvtq_s32_f32(vaddq_f32(tmp, positive_round_f32q));
-}
-
-inline uint16x8_t scale255_U16_U16(uint16x8_t in)
-{
- const int32x4_t tmp_s1 = scale255_S32_S32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(in))));
- const int32x4_t tmp_s2 = scale255_S32_S32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(in))));
- return vreinterpretq_u16_s16(vcombine_s16(vmovn_s32(tmp_s2), vmovn_s32(tmp_s1)));
-}
-
-template <typename T>
-inline typename std::enable_if<std::is_same<T, int8_t>::value, int8x16_t>::type
-vquantize(float32x4x4_t val, const UniformQuantizationInfo &info)
-{
- return vquantize_signed(val, info);
-}
-
-template <typename T>
-inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint8x16_t>::type
-vquantize(float32x4x4_t val, const UniformQuantizationInfo &info)
-{
- return vquantize(val, info);
-}
-
-template <typename T>
-void mul_saturate_quantized_8(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale)
-{
- // Create input windows
- Window win = window;
- Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle manually
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- const int window_step_x = 16 / sizeof(T);
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();
-
- const UniformQuantizationInfo output_qua_info = out->info()->quantization_info().uniform();
- const UniformQuantizationInfo tmp_qua_info = { output_qua_info.scale / scale, output_qua_info.offset };
-
- if(is_broadcast_across_x)
- {
- const bool is_broadcast_input_2 = input2_win.x().step() == 0;
- Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
- Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
- const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1;
- const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;
- const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();
- const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
-
- // Clear X Dimension on execution window as we handle manually
- non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator broadcast_input(broadcast_tensor, broadcast_win);
- Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
- Iterator dst(out, win);
-
- using ExactTagType = typename wrapper::traits::neon_vector<T, window_step_x>::tag_type;
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const T *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<T *>(dst.ptr());
-
- const auto broadcast_value = *reinterpret_cast<const T *>(broadcast_input.ptr());
- const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
-
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
-
- // Dequantize inputs
- const float32x4x4_t in1_f32x4x4 = vdequantize(non_broadcast_v, non_broadcast_qinfo);
- const float32x4x4_t in2_f32x4x4 = vdequantize(broadcast_value_vec, broadcast_qinfo);
-
- const float32x4x4_t out_f32x4x4 =
- {
- vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
- vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
- vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
- vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
- };
-
- // Quantize dst
- const auto result = vquantize<T>(out_f32x4x4, tmp_qua_info);
- wrapper::vstore(output_ptr + x, result);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- // Dequantize inputs
- const T in1 = *(non_broadcast_input_ptr + x);
- const float tmp_in1 = Qasymm8QuantizationHelper<T>::dequantize(in1, non_broadcast_qinfo);
- const float tmp_in2 = Qasymm8QuantizationHelper<T>::dequantize(broadcast_value, broadcast_qinfo);
- const float tmp_f = tmp_in1 * tmp_in2;
-
- // Quantize dst
- const auto tmp_qua = Qasymm8QuantizationHelper<T>::quantize(tmp_f, tmp_qua_info);
- *(output_ptr + x) = tmp_qua;
- }
- },
- broadcast_input, non_broadcast_input, dst);
- }
- else
- {
- const UniformQuantizationInfo input1_qua_info = src1->info()->quantization_info().uniform();
- const UniformQuantizationInfo input2_qua_info = src2->info()->quantization_info().uniform();
-
- // Clear X Dimension on execution window as we handle manually
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(src1, input1_win);
- Iterator input2(src2, input2_win);
- Iterator dst(out, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const T *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const T *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<T *>(dst.ptr());
-
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto input1_q = wrapper::vloadq(input1_ptr + x);
- const auto input2_q = wrapper::vloadq(input2_ptr + x);
-
- // Dequantize inputs
- const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info);
- const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info);
-
- const float32x4x4_t out_f32x4x4 =
- {
- vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
- vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
- vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
- vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
- };
-
- // Quantize dst
- const auto result = vquantize<T>(out_f32x4x4, tmp_qua_info);
- wrapper::vstore(output_ptr + x, result);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- // Dequantize inputs
- const T in1 = *(input1_ptr + x);
- const T in2 = *(input2_ptr + x);
- const float tmp_in1 = Qasymm8QuantizationHelper<T>::dequantize(in1, input1_qua_info);
- const float tmp_in2 = Qasymm8QuantizationHelper<T>::dequantize(in2, input2_qua_info);
- const float tmp_f = tmp_in1 * tmp_in2;
-
- // Quantize dst
- const auto tmp_qua = Qasymm8QuantizationHelper<T>::quantize(tmp_f, tmp_qua_info);
- *(output_ptr + x) = tmp_qua;
- }
- },
- input1, input2, dst);
- }
-}
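-
-// Editorial sketch (not part of the original kernel): the per-element maths
-// of the quantized path above. Both operands are dequantized, multiplied in
-// float, then requantized with the output scale pre-divided by the kernel
-// scale, so the division by `scale` is folded into the final quantization.
-inline uint8_t mul_saturate_quantized_scalar(uint8_t a, uint8_t b,
- const UniformQuantizationInfo &qa, const UniformQuantizationInfo &qb,
- const UniformQuantizationInfo &qo, float scale)
-{
- const float fa = (static_cast<int32_t>(a) - qa.offset) * qa.scale;
- const float fb = (static_cast<int32_t>(b) - qb.offset) * qb.scale;
- const UniformQuantizationInfo tmp_qinfo{ qo.scale / scale, qo.offset };
- return Qasymm8QuantizationHelper<uint8_t>::quantize(fa * fb, tmp_qinfo);
-}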
-
-void mul_saturate_QSYMM16_QSYMM16_QSYMM16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale)
-{
- const UniformQuantizationInfo input1_qua_info = src1->info()->quantization_info().uniform();
- const UniformQuantizationInfo input2_qua_info = src2->info()->quantization_info().uniform();
- const UniformQuantizationInfo output_qua_info = out->info()->quantization_info().uniform();
-
- // Create input windows
- Window win = window;
- Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle manually
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(src1, input1_win);
- Iterator input2(src2, input2_win);
- Iterator dst(out, win);
-
- const int window_step_x = 16;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- const UniformQuantizationInfo tmp_qua_info = { output_qua_info.scale / scale, output_qua_info.offset };
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const qsymm16_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const qsymm16_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<qsymm16_t *>(dst.ptr());
-
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const qsymm16x8x2_t input1_q =
- {
- {
- vld1q_s16(input1_ptr + x),
- vld1q_s16(input1_ptr + x + 8),
- }
- };
- const qsymm16x8x2_t input2_q =
- {
- {
- vld1q_s16(input2_ptr + x),
- vld1q_s16(input2_ptr + x + 8),
- }
- };
-
- // Dequantize inputs
- const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info);
- const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info);
-
- const float32x4x4_t out_f32x4x4 =
- {
- vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
- vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
- vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
- vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
- };
-
- const qsymm16x8x2_t result = vquantize_qsymm16(out_f32x4x4, tmp_qua_info);
- vst1q_s16(output_ptr + x, result.val[0]);
- vst1q_s16(output_ptr + x + 8, result.val[1]);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- // Dequantize inputs
- float tmp_in1 = static_cast<float>(*(input1_ptr + x)) * input1_qua_info.scale;
- float tmp_in2 = static_cast<float>(*(input2_ptr + x)) * input2_qua_info.scale;
- float tmp_f = tmp_in1 * tmp_in2;
-
- // Quantize dst; lrintf() rounds to nearest, matching the rounding used by vquantize_qsymm16
- int32_t tmp = lrintf(tmp_f / tmp_qua_info.scale);
- qsymm16_t tmp_qua = static_cast<qsymm16_t>((tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp));
- *(output_ptr + x) = tmp_qua;
- }
- },
- input1, input2, dst);
-}
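-
-// Editorial note: per element the QSYMM16 path above computes
-// out_q = clamp(round((in1_q * s1) * (in2_q * s2) * scale / s_out), SHRT_MIN, SHRT_MAX)
-// where s1, s2 and s_out are the symmetric (zero-offset) QSYMM16 scales.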
-
-void mul_QSYMM16_QSYMM16_S32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int scale)
-{
- ARM_COMPUTE_UNUSED(scale);
-
- // Create input windows
- Window win = window;
- Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle manually
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(src1, input1_win);
- Iterator input2(src2, input2_win);
- Iterator dst(out, win);
-
- const int window_step_x = 16;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const qsymm16_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const qsymm16_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr());
-
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const qsymm16x8x2_t input1_q =
- {
- {
- vld1q_s16(input1_ptr + x),
- vld1q_s16(input1_ptr + x + 8),
- }
- };
- const qsymm16x8x2_t input2_q =
- {
- {
- vld1q_s16(input2_ptr + x),
- vld1q_s16(input2_ptr + x + 8),
- }
- };
-
- const int32x4x4_t in1_s32 =
- {
- {
- vmovl_s16(vget_low_s16(input1_q.val[0])),
- vmovl_s16(vget_high_s16(input1_q.val[0])),
- vmovl_s16(vget_low_s16(input1_q.val[1])),
- vmovl_s16(vget_high_s16(input1_q.val[1])),
- }
- };
- const int32x4x4_t in2_s32 =
- {
- {
- vmovl_s16(vget_low_s16(input2_q.val[0])),
- vmovl_s16(vget_high_s16(input2_q.val[0])),
- vmovl_s16(vget_low_s16(input2_q.val[1])),
- vmovl_s16(vget_high_s16(input2_q.val[1])),
- }
- };
-
- const int32x4x4_t result =
- {
- {
- vmulq_s32(in1_s32.val[0], in2_s32.val[0]),
- vmulq_s32(in1_s32.val[1], in2_s32.val[1]),
- vmulq_s32(in1_s32.val[2], in2_s32.val[2]),
- vmulq_s32(in1_s32.val[3], in2_s32.val[3]),
- }
- };
-
- vst1q_s32(output_ptr + x, result.val[0]);
- vst1q_s32(output_ptr + x + 4, result.val[1]);
- vst1q_s32(output_ptr + x + 8, result.val[2]);
- vst1q_s32(output_ptr + x + 12, result.val[3]);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
- *(output_ptr + x) = tmp;
- }
- },
- input1, input2, dst);
-}
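-
-// Editorial note: the S32-output variant above deliberately ignores `scale`
-// (validate_arguments() only accepts scale == 1 for QSYMM16 inputs with an
-// S32 dst) and stores the raw widened product in1_q * in2_q unscaled.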
-
-template <bool is_scale255, bool is_sat>
-void mul_U8_U8_U8(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
-{
- // Create input windows
- Window win = window;
- Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle manually
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(src1, input1_win);
- Iterator input2(src2, input2_win);
- Iterator dst(out, win);
-
- const int window_step_x = 16 / sizeof(uint8_t);
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
-
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const uint8x16_t ta1 = wrapper::vloadq(input1_ptr + x);
- const uint8x16_t ta2 = wrapper::vloadq(input2_ptr + x);
-
- uint16x8_t tmp1_high = vmovl_u8(vget_high_u8(ta1));
- const uint16x8_t tmp2_high = vmovl_u8(vget_high_u8(ta2));
- uint16x8_t tmp1_low = vmovl_u8(vget_low_u8(ta1));
- const uint16x8_t tmp2_low = vmovl_u8(vget_low_u8(ta2));
-
- tmp1_high = vmulq_u16(tmp1_high, tmp2_high);
- tmp1_low = vmulq_u16(tmp1_low, tmp2_low);
-
- if(is_scale255)
- {
- tmp1_high = scale255_U16_U16(tmp1_high);
- tmp1_low = scale255_U16_U16(tmp1_low);
- }
- else
- {
- const int16x8_t vn = vdupq_n_s16(-n);
-
- if(is_sat)
- {
- tmp1_high = vqshlq_u16(tmp1_high, vn);
- tmp1_low = vqshlq_u16(tmp1_low, vn);
- }
- else
- {
- tmp1_high = vshlq_u16(tmp1_high, vn);
- tmp1_low = vshlq_u16(tmp1_low, vn);
- }
- }
- if(is_sat)
- {
- vst1q_u8(output_ptr + x, vcombine_u8(vqmovn_u16(tmp1_low), vqmovn_u16(tmp1_high)));
- }
- else
- {
- vst1q_u8(output_ptr + x, vcombine_u8(vmovn_u16(tmp1_low), vmovn_u16(tmp1_high)));
- }
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- uint16_t tmp = static_cast<uint16_t>(*(input1_ptr + x)) * static_cast<uint16_t>(*(input2_ptr + x));
-
- if(is_scale255)
- {
- float tmp_f = static_cast<float>(tmp) * scale255_constant;
- tmp = static_cast<uint16_t>(tmp_f + 0.5f);
- }
- else
- {
- tmp >>= n;
- }
- if(is_sat && tmp > 255)
- {
- tmp = 255;
- }
- *(output_ptr + x) = static_cast<uint8_t>(tmp);
- }
- },
- input1, input2, dst);
-}
-
-template <bool is_scale255, bool is_sat>
-inline int16x8_t mul_S16_S16_S16_n_loop(const int16x8_t &src1, const int16x8_t &src2, int n)
-{
- int32x4_t tmp1_high = vmovl_s16(vget_high_s16(src1));
- const int32x4_t tmp2_high = vmovl_s16(vget_high_s16(src2));
- int32x4_t tmp1_low = vmovl_s16(vget_low_s16(src1));
- const int32x4_t tmp2_low = vmovl_s16(vget_low_s16(src2));
-
- tmp1_high = vmulq_s32(tmp1_high, tmp2_high);
- tmp1_low = vmulq_s32(tmp1_low, tmp2_low);
-
- if(is_scale255)
- {
- tmp1_high = scale255_S32_S32(tmp1_high);
- tmp1_low = scale255_S32_S32(tmp1_low);
- }
- else
- {
- // Right shift amount
- const int32x4_t vn = vdupq_n_s32(-n);
- // Left shift amount
- const int32x4_t vnl = vdupq_n_s32(n);
- // Calculate conversion bit
- const uint32x4_t tmp1_high_u = vreinterpretq_u32_s32(tmp1_high);
- const uint32x4_t tmp1_low_u = vreinterpretq_u32_s32(tmp1_low);
- const uint32x4_t sign_high = vshrq_n_u32(tmp1_high_u, 31);
- const uint32x4_t sign_low = vshrq_n_u32(tmp1_low_u, 31);
- const int32x4_t sign_high_s = vreinterpretq_s32_u32(sign_high);
- const int32x4_t sign_low_s = vreinterpretq_s32_u32(sign_low);
- const int32x4_t convert_high = vsubq_s32(vshlq_s32(sign_high_s, vnl), sign_high_s);
- const int32x4_t convert_low = vsubq_s32(vshlq_s32(sign_low_s, vnl), sign_low_s);
- if(is_sat)
- {
- tmp1_high = vqshlq_s32(vaddq_s32(tmp1_high, convert_high), vn);
- tmp1_low = vqshlq_s32(vaddq_s32(tmp1_low, convert_low), vn);
- }
- else
- {
- tmp1_high = vshlq_s32(vaddq_s32(tmp1_high, convert_high), vn);
- tmp1_low = vshlq_s32(vaddq_s32(tmp1_low, convert_low), vn);
- }
- }
-
- if(is_sat)
- {
- return vcombine_s16(vqmovn_s32(tmp1_low), vqmovn_s32(tmp1_high));
- }
- else
- {
- return vcombine_s16(vmovn_s32(tmp1_low), vmovn_s32(tmp1_high));
- }
-}
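-
-// Editorial sketch (not part of the original kernel): the scalar form of the
-// rounding trick above. An arithmetic right shift on its own rounds towards
-// minus infinity; adding 2^n - 1 to negative values first turns it into
-// truncation towards zero, which is what RoundingPolicy::TO_ZERO requires.
-inline int32_t asr_round_to_zero(int32_t v, int n)
-{
- const int32_t bias = (v < 0) ? ((1 << n) - 1) : 0;
- // e.g. v = -5, n = 1: (-5 + 1) >> 1 = -2, matching -5 / 2
- return (v + bias) >> n;
-}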
-
-template <bool is_scale255, bool is_sat>
-inline int16x8x2_t mul_S16_S16_S16_n_k(const int16x8x2_t &src1, const int16x8x2_t &src2, int n)
-{
- const int16x8x2_t result =
- {
- {
- // First 8 elements
- mul_S16_S16_S16_n_loop<is_scale255, is_sat>(src1.val[0], src2.val[0], n),
- // Second 8 elements
- mul_S16_S16_S16_n_loop<is_scale255, is_sat>(src1.val[1], src2.val[1], n)
- }
- };
-
- return result;
-}
-
-template <bool is_scale255, bool is_sat>
-void mul_S16_S16_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
-{
- // Create input windows
- Window win = window;
- Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle manually
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(src1, input1_win);
- Iterator input2(src2, input2_win);
- Iterator dst(out, win);
-
- const int window_step_x = 16;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr());
-
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const int16x8x2_t ta1 =
- {
- {
- vld1q_s16(input1_ptr + x),
- vld1q_s16(input1_ptr + x + 8),
- }
- };
- const int16x8x2_t ta2 =
- {
- {
- vld1q_s16(input2_ptr + x),
- vld1q_s16(input2_ptr + x + 8),
- }
- };
- const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n);
-
- vst1q_s16(output_ptr + x, result.val[0]);
- vst1q_s16(output_ptr + x + 8, result.val[1]);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
-
- if(is_scale255)
- {
- float tmp_f = static_cast<float>(tmp) * scale255_constant;
-
- tmp = static_cast<int32_t>(tmp_f + 0.5f);
- }
- else
- {
- if(tmp >= 0)
- {
- tmp >>= n;
- }
- else
- {
- uint32_t mask = (1u << n) - 1;
- tmp = (tmp + static_cast<int32_t>(mask)) >> n;
- }
- }
- if(is_sat)
- {
- tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp);
- }
- *(output_ptr + x) = static_cast<int16_t>(tmp);
- }
- },
- input1, input2, dst);
-}
-
-template <bool is_sat>
-inline int32x4_t mul_S32_S32_S32_n_loop(const int32x4_t &src1, const int32x4_t &src2, int n)
-{
- const int32x2_t input1_1 = vget_low_s32(src1);
- const int32x2_t input2_1 = vget_low_s32(src2);
- const int32x2_t input1_2 = vget_high_s32(src1);
- const int32x2_t input2_2 = vget_high_s32(src2);
-
- int64x2_t tmp_1 = vmull_s32(input1_1, input2_1);
- int64x2_t tmp_2 = vmull_s32(input1_2, input2_2);
-
- // Apply scaling, conversion and rounding (round to zero)
- // Right shift amount
- const int64x2_t vn = vdupq_n_s64(-n);
- // Left shift amount
- const int64x2_t vnl = vdupq_n_s64(n);
- // Calculate conversion bit
- const uint64x2_t tmp_1_u = vreinterpretq_u64_s64(tmp_1);
- const uint64x2_t sign_1 = vshrq_n_u64(tmp_1_u, 63);
- const int64x2_t sign_1_s = vreinterpretq_s64_u64(sign_1);
- const int64x2_t convert_1 = vsubq_s64(vshlq_s64(sign_1_s, vnl), sign_1_s);
-
- const uint64x2_t tmp_2_u = vreinterpretq_u64_s64(tmp_2);
- const uint64x2_t sign_2 = vshrq_n_u64(tmp_2_u, 63);
- const int64x2_t sign_2_s = vreinterpretq_s64_u64(sign_2);
- const int64x2_t convert_2 = vsubq_s64(vshlq_s64(sign_2_s, vnl), sign_2_s);
- if(is_sat)
- {
- tmp_1 = vqshlq_s64(vaddq_s64(tmp_1, convert_1), vn);
- tmp_2 = vqshlq_s64(vaddq_s64(tmp_2, convert_2), vn);
- return vcombine_s32(vqmovn_s64(tmp_1), vqmovn_s64(tmp_2));
- }
- else
- {
- tmp_1 = vshlq_s64(vaddq_s64(tmp_1, convert_1), vn);
- tmp_2 = vshlq_s64(vaddq_s64(tmp_2, convert_2), vn);
- return vcombine_s32(vmovn_s64(tmp_1), vmovn_s64(tmp_2));
- }
-}
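-
-// Editorial note: this is the same truncate-towards-zero trick as in
-// mul_S16_S16_S16_n_loop, widened to 64-bit lanes so the full 32x32-bit
-// product survives until after the shift.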
-
-template <bool is_sat>
-inline int32x4x2_t mul_S32_S32_S32_n_k(const int32x4x2_t &src1, const int32x4x2_t &src2, int n)
-{
- const int32x4x2_t result =
- {
- {
- // First 4 elements
- mul_S32_S32_S32_n_loop<is_sat>(src1.val[0], src2.val[0], n),
- // Second 4 elements
- mul_S32_S32_S32_n_loop<is_sat>(src1.val[1], src2.val[1], n)
- }
- };
-
- return result;
-}
-
-template <bool is_sat>
-void mul_S32_S32_S32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
-{
- // Create input windows
- Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle manually
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- const int window_step_x = 8;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();
-
- if(is_broadcast_across_x)
- {
- const bool is_broadcast_input_2 = input2_win.x().step() == 0;
- Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
- Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
- const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1;
- const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;
-
- // Clear X Dimension on execution window as we handle manually
- non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator broadcast_input(broadcast_tensor, broadcast_win);
- Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
- Iterator dst(out, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const int32_t *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr());
-
- const int32_t broadcast_value = *reinterpret_cast<const int32_t *>(broadcast_input.ptr());
- const auto broadcast_value_vec = vdupq_n_s32(broadcast_value);
-
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const int32x4x2_t broadcast_v =
- {
- {
- broadcast_value_vec,
- broadcast_value_vec,
- }
- };
- const int32x4x2_t non_broadcast_v =
- {
- {
- vld1q_s32(non_broadcast_input_ptr + x),
- vld1q_s32(non_broadcast_input_ptr + x + 4),
- }
- };
- const int32x4x2_t result = mul_S32_S32_S32_n_k<is_sat>(broadcast_v, non_broadcast_v, n);
-
- vst1q_s32(output_ptr + x, result.val[0]);
- vst1q_s32(output_ptr + x + 4, result.val[1]);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- int64_t tmp = static_cast<int64_t>(broadcast_value) * static_cast<int64_t>(*(non_broadcast_input_ptr + x));
-
- if(tmp >= 0)
- {
- tmp >>= n;
- }
- else
- {
- uint64_t mask = ((uint64_t)1u << n) - 1;
- tmp = (tmp + static_cast<int64_t>(mask)) >> n;
- }
- if(is_sat)
- {
- tmp = utility::clamp<int64_t, int32_t>(tmp);
- }
- *(output_ptr + x) = static_cast<int32_t>(tmp);
- }
- },
- broadcast_input, non_broadcast_input, dst);
- }
- else
- {
- // Clear X Dimension on execution window as we handle manually
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(src1, input1_win);
- Iterator input2(src2, input2_win);
- Iterator dst(out, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const int32_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const int32_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr());
-
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const int32x4x2_t ta1 =
- {
- {
- vld1q_s32(input1_ptr + x),
- vld1q_s32(input1_ptr + x + 4),
- }
- };
- const int32x4x2_t ta2 =
- {
- {
- vld1q_s32(input2_ptr + x),
- vld1q_s32(input2_ptr + x + 4),
- }
- };
- const int32x4x2_t result = mul_S32_S32_S32_n_k<is_sat>(ta1, ta2, n);
-
- vst1q_s32(output_ptr + x, result.val[0]);
- vst1q_s32(output_ptr + x + 4, result.val[1]);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- int64_t tmp = static_cast<int64_t>(*(input1_ptr + x)) * static_cast<int64_t>(*(input2_ptr + x));
-
- if(tmp >= 0)
- {
- tmp >>= n;
- }
- else
- {
- uint64_t mask = ((uint64_t)1u << n) - 1;
- tmp = (tmp + static_cast<int64_t>(mask)) >> n;
- }
- if(is_sat)
- {
- tmp = utility::clamp<int64_t, int32_t>(tmp);
- }
- *(output_ptr + x) = static_cast<int32_t>(tmp);
- }
- },
- input1, input2, dst);
- }
-}
-
-void mul_F32_F32_F32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale)
-{
- // Create input windows
- Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle manually
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- constexpr int window_step_x = 16 / sizeof(float);
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();
-
- using ExactTagType = typename wrapper::traits::neon_vector<float, window_step_x>::tag_type;
-
- if(is_broadcast_across_x)
- {
- const bool is_broadcast_input_2 = input2_win.x().step() == 0;
- Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
- Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
- const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1;
- const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;
-
- // Clear X Dimension on execution window as we handle manually
- non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator broadcast_input(broadcast_tensor, broadcast_win);
- Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
- Iterator dst(out, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const float *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<float *>(dst.ptr());
-
- const float broadcast_value = *reinterpret_cast<const float *>(broadcast_input.ptr());
- const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
- const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{});
-
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
- auto res = wrapper::vmul(wrapper::vmul(broadcast_value_vec, non_broadcast_v), scale_vec);
- wrapper::vstore(output_ptr + x, res);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
- *(output_ptr + x) = broadcast_value * non_broadcast_v * scale;
- }
- },
- broadcast_input, non_broadcast_input, dst);
- }
- else
- {
- // Clear X Dimension on execution window as we handle manually
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(src1, input1_win);
- Iterator input2(src2, input2_win);
- Iterator dst(out, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const float *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const float *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<float *>(dst.ptr());
-
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto ta1 = wrapper::vloadq(input1_ptr + x);
- const auto ta2 = wrapper::vloadq(input2_ptr + x);
- const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{});
- const auto res = wrapper::vmul(wrapper::vmul(ta1, ta2), scale_vec);
- wrapper::vstore(output_ptr + x, res);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const auto ta1 = *(input1_ptr + x);
- const auto ta2 = *(input2_ptr + x);
- *(output_ptr + x) = ta1 * ta2 * scale;
- }
- },
- input1, input2, dst);
- }
-}
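-
-// Editorial note on the broadcast handling used throughout this file: when
-// the two inputs differ in X extent, broadcast_if_dimension_le_one() gives
-// the size-1 operand a window with step 0 on X, so input2_win.x().step() == 0
-// identifies the broadcast input and a single scalar is loaded per row.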
-
-void c_mul_F32_F32_F32_n(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window)
-{
- // Create input windows
- Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle manually
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- constexpr int window_step_x = 8 / sizeof(float);
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();
-
- using ExactTagType = typename wrapper::traits::neon_vector<float, 2>::tag_type;
-
- if(is_broadcast_across_x)
- {
- const bool is_broadcast_input_2 = input2_win.x().step() == 0;
- Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
- Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
- const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1;
- const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;
-
- // Clear X Dimension on execution window as we handle manually
- non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator broadcast_input(broadcast_tensor, broadcast_win);
- Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
- Iterator dst(out, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const float *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<float *>(dst.ptr());
-
- const float broadcast_value = *reinterpret_cast<const float *>(broadcast_input.ptr());
-
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto a = wrapper::vloadq(non_broadcast_input_ptr + 2 * x);
- float32x4_t b = vdupq_n_f32(broadcast_value);
-
- const float32x4_t mask = { -1.0f, 1.0f, -1.0f, 1.0f };
- const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{});
- const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{});
- const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{});
- const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{});
-
- const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10);
- const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11);
-
- float32x4_t res = wrapper::vmul(tmp0, b);
- b = wrapper::vmul(b, mask);
-
- res = wrapper::vmla(res, tmp1, b);
- wrapper::vstore(output_ptr + 2 * x, res);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const auto non_broadcast_value0 = *(non_broadcast_input_ptr + 2 * x);
- const auto non_broadcast_value1 = *(non_broadcast_input_ptr + 2 * x + 1);
- auto res1 = broadcast_value * (non_broadcast_value0 - non_broadcast_value1);
- auto res2 = broadcast_value * (non_broadcast_value1 + non_broadcast_value0);
- *(output_ptr + 2 * x) = res1;
- *(output_ptr + 2 * x + 1) = res2;
- }
- },
- broadcast_input, non_broadcast_input, dst);
- }
- else
- {
- // Clear X Dimension on execution window as we handle manually
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(src1, input1_win);
- Iterator input2(src2, input2_win);
- Iterator dst(out, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const float *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const float *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<float *>(dst.ptr());
-
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const float32x4_t a = wrapper::vloadq(input1_ptr + 2 * x);
- float32x4_t b = wrapper::vloadq(input2_ptr + 2 * x);
-
- const float32x4_t mask = { -1.0f, 1.0f, -1.0f, 1.0f };
- const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{});
- const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{});
- const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{});
- const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{});
-
- const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10);
- const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11);
-
- float32x4_t res = wrapper::vmul(tmp0, b);
-
- b = wrapper::vrev64(b);
- b = wrapper::vmul(b, mask);
-
- res = wrapper::vmla(res, tmp1, b);
- wrapper::vstore(output_ptr + 2 * x, res);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const auto a0 = *(input1_ptr + 2 * x);
- const auto a1 = *(input1_ptr + 2 * x + 1);
- const auto b0 = *(input2_ptr + 2 * x);
- const auto b1 = *(input2_ptr + 2 * x + 1);
- auto res1 = a0 * b0 - a1 * b1;
- auto res2 = a0 * b1 + a1 * b0;
- *(output_ptr + 2 * x) = res1;
- *(output_ptr + 2 * x + 1) = res2;
- }
- },
- input1, input2, dst);
- }
-}
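-
-// Editorial note: per complex element the function above computes
-// (a0 + i*a1) * (b0 + i*b1) = (a0*b0 - a1*b1) + i*(a0*b1 + a1*b0).
-// The vector path builds this by duplicating the real parts into
-// tmp0 = {a0, a0, a2, a2} and the imaginary parts into tmp1, then pairwise
-// reversing and sign-masking b so a single vmla accumulates the cross terms.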
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-void mul_F16_F16_F16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale)
-{
- // Create input windows
- Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle manually
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- constexpr int window_step_x = 16;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();
- if(is_broadcast_across_x)
- {
- const bool is_broadcast_input_2 = input2_win.x().step() == 0;
- Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
- Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
- const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1;
- const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;
- // Clear X Dimension on execution window as we handle manually
- non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- Iterator broadcast_input(broadcast_tensor, broadcast_win);
- Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
- Iterator dst(out, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const float16_t *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<float16_t *>(dst.ptr());
- const auto broadcast_value = *reinterpret_cast<const float16_t *>(broadcast_input.ptr());
- const float16x8x2_t broadcast_value_vec =
- {
- {
- vdupq_n_f16(broadcast_value),
- vdupq_n_f16(broadcast_value),
- }
- };
- const auto scale_vec = vdupq_n_f16(scale);
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const float16x8x2_t non_broadcast_v =
- {
- {
- vld1q_f16(non_broadcast_input_ptr + x),
- vld1q_f16(non_broadcast_input_ptr + x + 8),
- }
- };
- const float16x8x2_t result =
- {
- {
- vmulq_f16(vmulq_f16(broadcast_value_vec.val[0], non_broadcast_v.val[0]), scale_vec),
- vmulq_f16(vmulq_f16(broadcast_value_vec.val[1], non_broadcast_v.val[1]), scale_vec),
- }
- };
- vst1q_f16(output_ptr + x, result.val[0]);
- vst1q_f16(output_ptr + x + 8, result.val[1]);
- }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
- *(output_ptr + x) = broadcast_value * non_broadcast_v * scale;
- }
- },
- broadcast_input, non_broadcast_input, dst);
- }
- else
- {
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- Iterator input1(src1, input1_win);
- Iterator input2(src2, input2_win);
- Iterator dst(out, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const float16_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const float16_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<float16_t *>(dst.ptr());
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const float16x8x2_t ta1 =
- {
- {
- vld1q_f16(input1_ptr + x),
- vld1q_f16(input1_ptr + x + 8),
- }
- };
- const float16x8x2_t ta2 =
- {
- {
- vld1q_f16(input2_ptr + x),
- vld1q_f16(input2_ptr + x + 8),
- }
- };
- const float16x8_t scale_vec = vdupq_n_f16(scale);
- const float16x8x2_t result =
- {
- {
- vmulq_f16(vmulq_f16(ta1.val[0], ta2.val[0]), scale_vec),
- vmulq_f16(vmulq_f16(ta1.val[1], ta2.val[1]), scale_vec),
- }
- };
- vst1q_f16(output_ptr + x, result.val[0]);
- vst1q_f16(output_ptr + x + 8, result.val[1]);
- }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const auto ta1 = *(input1_ptr + x);
- const auto ta2 = *(input2_ptr + x);
- *(output_ptr + x) = ta1 * ta2 * scale;
- }
- },
- input1, input2, dst);
- }
-}
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
-template <bool is_scale255, bool is_sat>
-void mul_U8_U8_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
-{
- // Create input windows
- Window win = window;
- Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle manually
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(src1, input1_win);
- Iterator input2(src2, input2_win);
- Iterator dst(out, win);
-
- const int window_step_x = 16 / sizeof(uint8_t);
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr());
-
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const uint8x16_t bv = wrapper::vloadq(input2_ptr + x);
- const uint8x16_t av = wrapper::vloadq(input1_ptr + x);
-
- uint16x8_t tmp_low = vmovl_u8(vget_low_u8(av));
- uint16x8_t tmp_high = vmovl_u8(vget_high_u8(av));
- tmp_low = vmulq_u16(tmp_low, vmovl_u8(vget_low_u8(bv)));
- tmp_high = vmulq_u16(tmp_high, vmovl_u8(vget_high_u8(bv)));
-
- if(is_scale255)
- {
- tmp_low = scale255_U16_U16(tmp_low);
- tmp_high = scale255_U16_U16(tmp_high);
- }
- else
- {
- const int16x8_t vn = vdupq_n_s16(-n);
-
- if(is_sat)
- {
- tmp_low = vqshlq_u16(tmp_low, vn);
- tmp_high = vqshlq_u16(tmp_high, vn);
- }
- else
- {
- tmp_low = vshlq_u16(tmp_low, vn);
- tmp_high = vshlq_u16(tmp_high, vn);
- }
- }
-
- if(is_sat)
- {
- static const uint16x8_t max = vdupq_n_u16(SHRT_MAX);
-
- tmp_low = vminq_u16(tmp_low, max);
- tmp_high = vminq_u16(tmp_high, max);
- }
-
- vst1q_s16(output_ptr + x, vreinterpretq_s16_u16(tmp_low));
- vst1q_s16(output_ptr + x + 8, vreinterpretq_s16_u16(tmp_high));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
-
- if(is_scale255)
- {
- float tmp_f = static_cast<float>(tmp) * scale255_constant;
- tmp = static_cast<int32_t>(tmp_f + 0.5f);
- }
- else
- {
- tmp >>= n;
- }
-
- if(is_sat)
- {
- tmp = (tmp > SHRT_MAX) ? SHRT_MAX : tmp;
- }
-
- *(output_ptr + x) = static_cast<int16_t>(tmp);
- }
- },
- input1, input2, dst);
-}
-
-template <bool is_scale255, bool is_sat>
-void mul_S16_U8_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
-{
- // Create input windows
- Window win = window;
- Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle manually
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(src1, input1_win);
- Iterator input2(src2, input2_win);
- Iterator dst(out, win);
-
- const int window_step_x = 16;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr());
-
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const int16x8x2_t ta1 =
- {
- {
- vld1q_s16(input1_ptr + x),
- vld1q_s16(input1_ptr + x + 8),
- }
- };
- const uint8x8x2_t ta2u =
- {
- {
- vld1_u8(input2_ptr + x),
- vld1_u8(input2_ptr + x + 8),
- }
- };
- const int16x8x2_t ta2 =
- {
- {
- vreinterpretq_s16_u16(vmovl_u8(ta2u.val[0])),
- vreinterpretq_s16_u16(vmovl_u8(ta2u.val[1]))
- }
- };
-
- const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n);
-
- vst1q_s16(output_ptr + x, result.val[0]);
- vst1q_s16(output_ptr + x + 8, result.val[1]);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
-
- if(is_scale255)
- {
- float tmp_f = static_cast<float>(tmp) * scale255_constant;
-
- tmp = static_cast<int32_t>(tmp_f + 0.5f);
- }
- else
- {
- if(tmp >= 0)
- {
- tmp >>= n;
- }
- else
- {
- uint32_t mask = (1u << n) - 1;
- tmp = (tmp + static_cast<int32_t>(mask)) >> n;
- }
- }
- if(is_sat)
- {
- tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp);
- }
- *(output_ptr + x) = static_cast<int16_t>(tmp);
- }
- },
- input1, input2, dst);
-}
-
-template <bool is_scale255, bool is_sat>
-void mul_U8_S16_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
-{
- // Simply swap the two input buffers
- mul_S16_U8_S16<is_scale255, is_sat>(src2, src1, out, window, n);
-}
-} // namespace
-
-void CpuMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
-{
- ARM_COMPUTE_UNUSED(rounding_policy);
- ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy));
-
- const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
-
- // Auto initialize dst if not initialized
- set_shape_if_empty(*dst, out_shape);
-
- _scale = scale;
- _scale_exponent = 0;
- _func_quantized = nullptr;
- _func_int = nullptr;
- _func_float = nullptr;
-
- bool is_scale_255 = false;
- // Check and validate scaling factor
- if(std::abs(scale - scale255_constant) < 0.00001f)
- {
- is_scale_255 = true;
- }
- else
- {
- int exponent = 0;
-
- std::frexp(scale, &exponent);
-
- // Store the positive exponent. We know that we compute 1/2^n
- // Additionally we subtract 1 to compensate for frexp using a mantissa of 0.5
- _scale_exponent = std::abs(exponent - 1);
- }
-
- const DataType dt_input1 = src1->data_type();
- const DataType dt_input2 = src2->data_type();
- const DataType dt_output = dst->data_type();
- const bool is_sat = (overflow_policy == ConvertPolicy::SATURATE);
-
- switch(dt_input1)
- {
- case DataType::QASYMM8:
- if(dt_input2 == DataType::QASYMM8 && dt_output == DataType::QASYMM8)
- {
- _func_quantized = &mul_saturate_quantized_8<uint8_t>;
- }
- break;
- case DataType::QASYMM8_SIGNED:
- if(dt_input2 == DataType::QASYMM8_SIGNED)
- {
- _func_quantized = &mul_saturate_quantized_8<int8_t>;
- }
- break;
- case DataType::QSYMM16:
- if(dt_input2 == DataType::QSYMM16 && dt_output == DataType::QSYMM16)
- {
- _func_quantized = &mul_saturate_QSYMM16_QSYMM16_QSYMM16;
- }
- else if(dt_input2 == DataType::QSYMM16 && dt_output == DataType::S32)
- {
- _func_int = &mul_QSYMM16_QSYMM16_S32;
- }
- break;
- case DataType::S16:
- if(DataType::U8 == dt_input2 && DataType::S16 == dt_output)
- {
- if(is_scale_255)
- {
- _func_int = is_sat ? &mul_S16_U8_S16<true, true> : &mul_S16_U8_S16<true, false>;
- }
- else
- {
- _func_int = is_sat ? &mul_S16_U8_S16<false, true> : &mul_S16_U8_S16<false, false>;
- }
- }
- if(DataType::S16 == dt_input2 && DataType::S16 == dt_output)
- {
- if(is_scale_255)
- {
- _func_int = is_sat ? &mul_S16_S16_S16<true, true> : &mul_S16_S16_S16<true, false>;
- }
- else
- {
- _func_int = is_sat ? &mul_S16_S16_S16<false, true> : &mul_S16_S16_S16<false, false>;
- }
- }
- break;
- case DataType::S32:
- if(DataType::S32 == dt_input2 && DataType::S32 == dt_output)
- {
- _func_int = is_sat ? &mul_S32_S32_S32<true> : &mul_S32_S32_S32<false>;
- }
- break;
- case DataType::U8:
- if(DataType::U8 == dt_input2 && DataType::U8 == dt_output)
- {
- if(is_scale_255)
- {
- _func_int = is_sat ? &mul_U8_U8_U8<true, true> : &mul_U8_U8_U8<true, false>;
- }
- else
- {
- _func_int = is_sat ? &mul_U8_U8_U8<false, true> : &mul_U8_U8_U8<false, false>;
- }
- }
- else if(DataType::U8 == dt_input2 && DataType::S16 == dt_output)
- {
- if(is_scale_255)
- {
- _func_int = is_sat ? &mul_U8_U8_S16<true, true> : &mul_U8_U8_S16<true, false>;
- }
- else
- {
- _func_int = is_sat ? &mul_U8_U8_S16<false, true> : &mul_U8_U8_S16<false, false>;
- }
- }
- else if(DataType::S16 == dt_input2 && DataType::S16 == dt_output)
- {
- if(is_scale_255)
- {
- _func_int = is_sat ? &mul_U8_S16_S16<true, true> : &mul_U8_S16_S16<true, false>;
- }
- else
- {
- _func_int = is_sat ? &mul_U8_S16_S16<false, true> : &mul_U8_S16_S16<false, false>;
- }
- }
- break;
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- _func_float = &mul_F16_F16_F16;
- break;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- case DataType::F32:
- _func_float = &mul_F32_F32_F32;
- break;
- default:
- ARM_COMPUTE_ERROR("You called with the wrong img formats");
- }
-
- // Configure kernel window
- Window win = calculate_max_window(out_shape);
-
- ICpuKernel::configure(win);
-}
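-
-// Usage sketch (editorial; tensor objects a, b and c are hypothetical, and a
-// real caller would normally dispatch through the scheduler rather than call
-// run_op() directly):
-//
-//   CpuMulKernel k;
-//   ARM_COMPUTE_ERROR_THROW_ON(CpuMulKernel::validate(a.info(), b.info(), c.info(),
-//       1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
-//   k.configure(a.info(), b.info(), c.info(), 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
-//   ITensorPack pack;
-//   pack.add_const_tensor(TensorType::ACL_SRC_0, &a);
-//   pack.add_const_tensor(TensorType::ACL_SRC_1, &b);
-//   pack.add_tensor(TensorType::ACL_DST, &c);
-//   k.run_op(pack, k.window(), ThreadInfo{});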
-
-Status CpuMulKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy,
- RoundingPolicy rounding_policy)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy));
-
- return Status{};
-}
-
-void CpuMulKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
-
- auto src1 = tensors.get_const_tensor(TensorType::ACL_SRC_0);
- auto src2 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
- auto dst = tensors.get_tensor(TensorType::ACL_DST);
-
- if(_func_quantized != nullptr)
- {
- (*_func_quantized)(src1, src2, dst, window, _scale);
- }
- else if(_func_int != nullptr)
- {
- (*_func_int)(src1, src2, dst, window, _scale_exponent);
- }
- else
- {
- ARM_COMPUTE_ERROR_ON(_func_float == nullptr);
- (*_func_float)(src1, src2, dst, window, _scale);
- }
-}
-
-const char *CpuMulKernel::name() const
-{
- return "CpuMulKernel";
-}
-
-namespace
-{
-Status validate_arguments_complex(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 2, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 2, DataType::F32);
-
- const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
-
- // Validate in case of configured dst
- if(dst->total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 2, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst");
- }
-
- return Status{};
-}
-} // namespace
-
-void CpuComplexMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_complex(src1, src2, dst));
-
- const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
-
- // Auto initialize dst if not initialized
- const TensorInfo out_info(out_shape, src1->num_channels(), src1->data_type());
- auto_init_if_empty(*dst, out_info);
-
- // Configure kernel window
- Window win = calculate_max_window(out_shape);
-
- ICpuKernel::configure(win);
-}
-
-Status CpuComplexMulKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_complex(src1, src2, dst));
-
- return Status{};
-}
-
-void CpuComplexMulKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
-
- auto src1 = tensors.get_const_tensor(TensorType::ACL_SRC_0);
- auto src2 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
- auto dst = tensors.get_tensor(TensorType::ACL_DST);
-
- c_mul_F32_F32_F32_n(src1, src2, dst, window);
-}
-
-const char *CpuComplexMulKernel::name() const
-{
- return "CpuComplexMulKernel";
-}
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/core/cpu/kernels/CpuMulKernel.h b/src/core/cpu/kernels/CpuMulKernel.h
deleted file mode 100644
index 3e667bc4be..0000000000
--- a/src/core/cpu/kernels/CpuMulKernel.h
+++ /dev/null
@@ -1,150 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_MUL_KERNEL_H
-#define ARM_COMPUTE_CPU_MUL_KERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/cpu/ICpuKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-/** Interface for the kernel to perform multiplication between two tensors */
-class CpuMulKernel : public ICpuKernel
-{
-public:
- /** Default constructor */
- CpuMulKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuMulKernel);
- /** Initialise the kernel's inputs and dst.
- *
- * Valid configurations (Src1,Src2) -> Dst :
- *
- * Support: Broadcast? Scale=1/255?
- * - (U8,U8) -> U8, S16 N Y
- * - (U8,S16) -> S16 N Y
- * - (S16,U8) -> S16 N Y
- * - (S16,S16) -> S16 N Y
- * - (S32,S32) -> S32 Y N
- * - (F16,F16) -> F16 N Y
- * - (F32,F32) -> F32 Y Y
- * - (QASYMM8,QASYMM8) -> QASYMM8 Y Y
- * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED Y Y
- * - (QSYMM16,QSYMM16) -> QSYMM16, S32 N Y
- *
- * @note For @p scale equal to 1/255 only round to nearest is supported (TO_NEAREST_UP and TO_NEAREST_EVEN, both implemented as round half up).
- * For all other scale values only round to zero (TO_ZERO) is supported.
- *
- * @param[in] src1 First input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32
- * @param[in] src2 Second input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32
- * @param[out] dst Dst tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32
- * @param[in] scale Scale to apply after multiplication.
- * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
- * If @p src1, @p src2 and @p dst are all of data type S32, scale cannot be 1/255.
- * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if any of the inputs is of quantized datatype
- * @param[in] rounding_policy Rounding policy.
- */
- void configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref CpuMulKernel::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy);
-
- // Inherited methods overridden
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
- const char *name() const override;
-
-private:
- /** Common signature for all the specialised multiplication functions with integer scaling factor
- *
- * @param[in] src1 Src1 tensor object.
- * @param[in] src2 Src2 tensor object.
- * @param[out] dst Dst tensor object.
- * @param[in] window Region on which to execute the kernel
- * @param[in] scale Integer scale factor.
- */
- using MulFunctionInt = void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, int scale);
- /** Common signature for all the specialised multiplication functions with float scaling factor
- *
- * @param[in] src1 Src1 tensor object.
- * @param[in] src2 Src2 tensor object.
- * @param[out] dst Dst tensor object.
- * @param[in] window Region on which to execute the kernel
- * @param[in] scale Float scale factor.
- */
- using MulFunctionFloat = void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, float scale);
- /** Common signature for all the specialised QASYMM8 multiplication functions with float scaling factor
- *
- * @param[in] src1 Src1 tensor object.
- * @param[in] src2 Src2 tensor object.
- * @param[out] dst Dst tensor object.
- * @param[in] window Region on which to execute the kernel
- * @param[in] scale Float scale factor.
- *
- */
- using MulFunctionQuantized = void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, float scale);
-
- MulFunctionFloat *_func_float{ nullptr };
- MulFunctionInt *_func_int{ nullptr };
- MulFunctionQuantized *_func_quantized{ nullptr };
- float _scale{ 0 };
- int _scale_exponent{ 0 };
-};
-
-/** Interface for the complex pixelwise multiplication kernel. */
-class CpuComplexMulKernel : public ICpuKernel
-{
-public:
- /** Default constructor */
- CpuComplexMulKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuComplexMulKernel);
- /** Initialise the kernel's src tensors and dst.
- *
- * @param[in] src1 An src tensor. Data types supported: F32. Number of channels supported: 2 (complex tensor).
- * @param[in] src2 An src tensor. Data types supported: same as @p src1. Number of channels supported: same as @p src1.
- * @param[out] dst The dst tensor, Data types supported: same as @p src1. Number of channels supported: same as @p src1.
- */
- void configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref CpuComplexMulKernel::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
- const char *name() const override;
-};
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_MUL_KERNEL_H */
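The scale constraint documented for CpuMulKernel::configure() above reduces to a small numeric test. A minimal sketch of that check, assuming a hypothetical standalone helper (is_valid_mul_scale is not a library function):

    #include <cmath>

    // Scale must be positive and equal to 1/255 or 1/2^n with n in [0, 15].
    inline bool is_valid_mul_scale(float scale)
    {
        if(scale == 1.f / 255.f)
        {
            return true;
        }
        // std::frexp decomposes scale as fraction * 2^exponent with fraction in
        // [0.5, 1); scale is a power of two iff the fraction is exactly 0.5,
        // in which case scale == 2^(exponent - 1), i.e. n == 1 - exponent.
        int         exponent = 0;
        const float fraction = std::frexp(scale, &exponent);
        const int   n        = 1 - exponent;
        return fraction == 0.5f && n >= 0 && n <= 15;
    }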
diff --git a/src/core/cpu/kernels/CpuPermuteKernel.cpp b/src/core/cpu/kernels/CpuPermuteKernel.cpp
deleted file mode 100644
index 270d6e222e..0000000000
--- a/src/core/cpu/kernels/CpuPermuteKernel.cpp
+++ /dev/null
@@ -1,301 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/cpu/kernels/CpuPermuteKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-namespace
-{
-#include "src/core/NEON/kernels/convolution/common/shims.hpp"
-} // namespace
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-namespace
-{
-inline bool is_permutation_supported(const PermutationVector &v)
-{
- static const std::array<PermutationVector, 2> permutations2 =
- {
- {
- PermutationVector(0U, 1U),
- PermutationVector(1U, 0U),
- }
- };
- static const std::array<PermutationVector, 6> permutations3 =
- {
- {
- PermutationVector(2U, 0U, 1U),
- PermutationVector(1U, 2U, 0U),
- PermutationVector(0U, 1U, 2U),
- PermutationVector(0U, 2U, 1U),
- PermutationVector(1U, 0U, 2U),
- PermutationVector(2U, 1U, 0U),
- }
- };
- static const std::array<PermutationVector, 24> permutations4 =
- {
- {
- PermutationVector(0U, 1U, 2U, 3U),
- PermutationVector(1U, 0U, 2U, 3U),
- PermutationVector(2U, 0U, 1U, 3U),
- PermutationVector(0U, 2U, 1U, 3U),
- PermutationVector(1U, 2U, 0U, 3U),
- PermutationVector(2U, 1U, 0U, 3U),
- PermutationVector(2U, 1U, 3U, 0U),
- PermutationVector(1U, 2U, 3U, 0U),
- PermutationVector(3U, 2U, 1U, 0U),
- PermutationVector(2U, 3U, 1U, 0U),
- PermutationVector(1U, 3U, 2U, 0U),
- PermutationVector(3U, 1U, 2U, 0U),
- PermutationVector(3U, 0U, 2U, 1U),
- PermutationVector(0U, 3U, 2U, 1U),
- PermutationVector(2U, 3U, 0U, 1U),
- PermutationVector(3U, 2U, 0U, 1U),
- PermutationVector(0U, 2U, 3U, 1U),
- PermutationVector(2U, 0U, 3U, 1U),
- PermutationVector(1U, 0U, 3U, 2U),
- PermutationVector(0U, 1U, 3U, 2U),
- PermutationVector(3U, 1U, 0U, 2U),
- PermutationVector(1U, 3U, 0U, 2U),
- PermutationVector(0U, 3U, 1U, 2U),
- PermutationVector(3U, 0U, 1U, 2U)
- }
- };
-
- return (permutations2.end() != std::find(permutations2.begin(), permutations2.end(), v)) || (permutations3.end() != std::find(permutations3.begin(), permutations3.end(), v))
- || (permutations4.end() != std::find(permutations4.begin(), permutations4.end(), v));
-}
-
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm)
-{
- ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!is_permutation_supported(perm), "PermutationVector not supported.");
-
- const TensorShape dst_shape = misc::shape_calculator::compute_permutation_output_shape(*src, perm);
-
- // Validate configured destination
- if(dst->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
- }
-
- return Status{};
-}
-
-template <typename T>
-void run_permute(const Window &window, const ITensor *src, const ITensor *dst, const PermutationVector &perm)
-{
- const DataLayout src_layout = src->info()->data_layout();
-
- // Source window
- Window window_src = window;
-
- // Only these two configurations are supported by src/core/NEON/kernels/convolution/common/shims.hpp;
- // for all others we fall back to the generic C++ path
- if((src_layout == DataLayout::NCHW && perm == PermutationVector{ 2U, 0U, 1U }) || (src_layout == DataLayout::NHWC && perm == PermutationVector{ 1U, 2U, 0U }))
- {
- window_src.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), window.x().end() - window.x().start()));
- window_src.set(Window::DimY, Window::Dimension(window.y().start(), window.y().end(), window.y().end() - window.y().start()));
- window_src.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), window.z().end() - window.z().start()));
- window_src.set(3, Window::Dimension(window[3].start(), window[3].end(), window[3].end() - window[3].start()));
- }
-
- // Destination window
- Window window_dst(window);
- const Window::Dimension zero_window = Window::Dimension(0, 0, 0);
- for(size_t d = 0; d <= dst->info()->num_dimensions(); ++d)
- {
- window_dst.set(d, zero_window);
- }
-
- // Create iterators
- Iterator src_it(src, window_src);
- Iterator dst_it(dst, window_dst);
-
- int in_row_stride = 0;
- int in_col_stride = 0;
- int in_channel_stride = 0;
- int in_batch_stride = 0;
- int n_cols = 0;
- int n_rows = 0;
- int n_channels = 0;
- int n_batches = 0;
-
- switch(src_layout)
- {
- case DataLayout::NCHW:
- {
- in_row_stride = src->info()->strides_in_bytes().y() / sizeof(T);
- in_channel_stride = src->info()->strides_in_bytes().z() / sizeof(T);
- in_batch_stride = src->info()->strides_in_bytes()[3] / sizeof(T);
- n_cols = src->info()->tensor_shape().x();
- n_rows = window_src.y().step();
- n_channels = src->info()->tensor_shape().z();
- n_batches = src->info()->tensor_shape()[3];
- break;
- }
- case DataLayout::NHWC:
- {
- in_col_stride = src->info()->strides_in_bytes().y() / sizeof(T);
- in_row_stride = src->info()->strides_in_bytes().z() / sizeof(T);
- in_batch_stride = src->info()->strides_in_bytes()[3] / sizeof(T);
- n_channels = src->info()->tensor_shape().x();
- n_cols = window_src.y().step();
- n_rows = src->info()->tensor_shape().z();
- n_batches = src->info()->tensor_shape()[3];
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR("Invalid source data layout.");
- break;
- }
- }
-
- // CHW -> HWC
- if(src_layout == DataLayout::NCHW && perm == PermutationVector{ 2U, 0U, 1U })
- {
- const int out_channel_stride = dst->info()->strides_in_bytes().x() / sizeof(T);
- const int out_col_stride = dst->info()->strides_in_bytes().y() / sizeof(T);
- const int out_row_stride = dst->info()->strides_in_bytes().z() / sizeof(T);
- const int out_batch_stride = dst->info()->strides_in_bytes()[3] / sizeof(T);
- execute_window_loop(window_src, [&](const Coordinates & id)
- {
- const int idx = id[0] * out_col_stride + id[1] * out_row_stride + id[2] * out_channel_stride;
- reorder::nchw_to_nhwc(reinterpret_cast<const T *>(src_it.ptr()), reinterpret_cast<T *>(dst_it.ptr()) + idx,
- n_batches, n_channels, n_rows, n_cols,
- in_batch_stride, in_channel_stride, in_row_stride,
- out_batch_stride, out_row_stride, out_col_stride);
- },
- src_it, dst_it);
- }
- // HWC -> CHW
- else if(src_layout == DataLayout::NHWC && perm == PermutationVector{ 1U, 2U, 0U })
- {
- const int out_col_stride = dst->info()->strides_in_bytes().x() / sizeof(T);
- const int out_row_stride = dst->info()->strides_in_bytes().y() / sizeof(T);
- const int out_channel_stride = dst->info()->strides_in_bytes().z() / sizeof(T);
- const int out_batch_stride = dst->info()->strides_in_bytes()[3] / sizeof(T);
- execute_window_loop(window_src, [&](const Coordinates & id)
- {
- const int idx = id[0] * out_channel_stride + id[1] * out_col_stride + id[2] * out_row_stride;
- reorder::nhwc_to_nchw(reinterpret_cast<const T *>(src_it.ptr()), reinterpret_cast<T *>(dst_it.ptr()) + idx,
- n_batches, n_rows, n_cols, n_channels,
- in_batch_stride, in_row_stride, in_col_stride,
- out_batch_stride, out_channel_stride, out_row_stride);
- },
- src_it, dst_it);
- }
- else
- {
- // All other cases fall back to C++
- // Permute strides
- Strides strides = dst->info()->strides_in_bytes();
- Strides perm_strides = strides;
- permute_strides(perm_strides, perm);
- const int perm_stride_3 = src->info()->num_dimensions() >= 4 ? perm_strides[3] : 0;
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const int idx = id[0] * perm_strides[0] + id[1] * perm_strides[1] + id[2] * perm_strides[2] + id[3] * perm_stride_3;
- *(reinterpret_cast<T *>(dst_it.ptr() + idx)) = *(reinterpret_cast<const T *>(src_it.ptr()));
- },
- src_it, dst_it);
- }
-}
-} // namespace
-
-void CpuPermuteKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
- const TensorShape dst_shape = misc::shape_calculator::compute_permutation_output_shape(*src, perm);
- // Destination auto initialization if not yet initialized
- auto_init_if_empty(*dst, src->clone()->set_tensor_shape(dst_shape));
-
- // Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, perm));
-
- _perm = perm;
-
- // Configure kernel window
- Window win = calculate_max_window(*src, Steps());
-
- // This kernel doesn't need padding so update_window_and_padding() can be skipped
-
- ICpuKernel::configure(win);
-}
-
-Status CpuPermuteKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, perm));
- return Status{};
-}
-
-void CpuPermuteKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
-
- const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
- auto dst = tensors.get_tensor(TensorType::ACL_DST);
-
- switch(src->info()->element_size())
- {
- case 1:
- run_permute<uint8_t>(window, src, dst, _perm);
- break;
- case 2:
- run_permute<uint16_t>(window, src, dst, _perm);
- break;
- case 4:
- run_permute<uint32_t>(window, src, dst, _perm);
- break;
- default:
- ARM_COMPUTE_ERROR("Element size not supported");
- break;
- }
-}
-
-const char *CpuPermuteKernel::name() const
-{
- return "CpuPermuteKernel";
-}
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
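The generic fallback in run_permute() above writes each source element at a destination offset built from permuted destination strides. The same idea in isolation, as a sketch for rank-3 dense tensors; the perm semantics here (perm[i] selects the destination stride paired with source dimension i) is a simplification of the library's permute_strides:

    #include <array>
    #include <cstddef>

    // Sketch: permute a dense rank-3 tensor (dimension 0 fastest-moving in src)
    // by indexing the destination with permuted strides, as in the C++ fallback.
    template <typename T>
    void permute3(const T *src, T *dst,
                  const std::array<std::size_t, 3> &shape,       // source shape
                  const std::array<std::size_t, 3> &dst_strides, // destination strides, in elements
                  const std::array<std::size_t, 3> &perm)
    {
        const std::array<std::size_t, 3> perm_strides{ dst_strides[perm[0]], dst_strides[perm[1]], dst_strides[perm[2]] };
        std::size_t src_idx = 0;
        for(std::size_t z = 0; z < shape[2]; ++z)
        {
            for(std::size_t y = 0; y < shape[1]; ++y)
            {
                for(std::size_t x = 0; x < shape[0]; ++x)
                {
                    dst[x * perm_strides[0] + y * perm_strides[1] + z * perm_strides[2]] = src[src_idx++];
                }
            }
        }
    }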
diff --git a/src/core/cpu/kernels/CpuPermuteKernel.h b/src/core/cpu/kernels/CpuPermuteKernel.h
deleted file mode 100644
index 9c59d5b9d4..0000000000
--- a/src/core/cpu/kernels/CpuPermuteKernel.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_PERMUTE_KERNEL_H
-#define ARM_COMPUTE_CPU_PERMUTE_KERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/cpu/ICpuKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-/** Kernel to perform tensor permutation given a permutation vector */
-class CpuPermuteKernel : public ICpuKernel
-{
-public:
- CpuPermuteKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuPermuteKernel);
- /** Configure kernel for a given list of arguments
- *
- * @note Arbitrary permutation vectors are supported with rank not greater than 4
- *
- * @param[in] src Source tensor to permute. Data types supported: All
- * @param[out] dst Destination tensor. Data types supported: Same as @p src
- * @param[in] perm Permutation vector
- */
- void configure(const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm);
- /** Static function to check if given info will lead to a valid configuration of @ref CpuPermuteKernel
- *
- * @note Arbitrary permutation vectors are supported with rank not greater than 4
- *
- * @param[in] src Source tensor to permute. Data types supported: All
- * @param[in] dst Destination tensor. Data types supported: Same as @p src
- * @param[in] perm Permutation vector
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
- const char *name() const override;
-
-private:
- PermutationVector _perm{};
-};
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_PERMUTE_KERNEL_H */
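A hedged usage sketch of the internal API above (shapes illustrative, not a definitive example): a NCHW-to-NHWC layout change corresponds to PermutationVector{2, 0, 1}, the fast path served by shims.hpp.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "src/core/cpu/kernels/CpuPermuteKernel.h"

    void permute_nchw_to_nhwc_example()
    {
        using namespace arm_compute;
        TensorInfo src(TensorShape(16U, 16U, 3U), 1, DataType::F32); // W, H, C in NCHW
        TensorInfo dst{}; // left empty; configure() auto-initializes it
        const PermutationVector perm(2U, 0U, 1U);

        cpu::kernels::CpuPermuteKernel kernel;
        if(bool(cpu::kernels::CpuPermuteKernel::validate(&src, &dst, perm)))
        {
            kernel.configure(&src, &dst, perm);
        }
        // The operator would then call run_op() with an ITensorPack holding
        // ACL_SRC and ACL_DST.
    }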
diff --git a/src/core/cpu/kernels/CpuPool2dKernel.cpp b/src/core/cpu/kernels/CpuPool2dKernel.cpp
deleted file mode 100644
index e6f5890685..0000000000
--- a/src/core/cpu/kernels/CpuPool2dKernel.cpp
+++ /dev/null
@@ -1,514 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/cpu/kernels/CpuPool2dKernel.h"
-
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/NEAsymm.h"
-#include "src/core/NEON/NEFixedPoint.h"
-#include "src/core/NEON/NEMath.h"
-#include "src/core/common/Registrars.h"
-#include "src/core/cpu/kernels/pooling/neon/list.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/ToolchainSupport.h"
-
-#include "src/core/NEON/wrapper/wrapper.h"
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-namespace
-{
-using namespace misc::shape_calculator;
-
-struct PoolingSelectorData
-{
- DataType dt;
- DataLayout dl;
- int pool_stride_x;
- Size2D pool_size;
-};
-
-using PoolingSelectorPtr = std::add_pointer<bool(const PoolingSelectorData &data)>::type;
-using PoolingKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, ITensor *, PoolingLayerInfo &, const Window &, const Window &)>::type;
-struct PoolingKernel
-{
- const char *name;
- const PoolingSelectorPtr is_selected;
- PoolingKernelPtr ukernel;
-};
-
-static const PoolingKernel available_kernels[] =
-{
- {
- "poolingMxN_qasymm8_neon_nhwc",
- [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::QASYMM8)); },
- REGISTER_QASYMM8_NEON(arm_compute::cpu::poolingMxN_qasymm8_neon_nhwc)
- },
- {
- "poolingMxN_qasymm8_signed_neon_nhwc",
- [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::QASYMM8_SIGNED)); },
- REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::poolingMxN_qasymm8_signed_neon_nhwc)
- },
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
- {
- "poolingMxN_fp16_neon_nhwc",
- [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::F16)); },
- REGISTER_FP16_NEON(arm_compute::cpu::poolingMxN_fp16_neon_nhwc)
- },
-#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
- {
- "poolingMxN_fp32_neon_nhwc",
- [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::F32)); },
- REGISTER_FP32_NEON(arm_compute::cpu::poolingMxN_fp32_neon_nhwc)
- },
-#if defined(ENABLE_NCHW_KERNELS)
- {
- "pooling2_qasymm8_neon_nchw",
- [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2) && (data.pool_stride_x < 3)); },
- REGISTER_QASYMM8_NEON(arm_compute::cpu::pooling2_quantized_neon_nchw<uint8_t>)
- },
- {
- "pooling3_qasymm8_neon_nchw",
- [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3) && (data.pool_stride_x < 3)); },
- REGISTER_QASYMM8_NEON(arm_compute::cpu::pooling3_quantized_neon_nchw<uint8_t>)
- },
- {
- "poolingMxN_qasymm8_neon_nchw",
- [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8)); },
- REGISTER_QASYMM8_NEON(arm_compute::cpu::poolingMxN_quantized_neon_nchw<uint8_t>)
- },
- {
- "pooling2_qasymm8_signed_neon_nchw",
- [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2) && (data.pool_stride_x < 3)); },
- REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::pooling2_quantized_neon_nchw<int8_t>)
- },
- {
- "pooling3_qasymm8_signed_neon_nchw",
- [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3) && (data.pool_stride_x < 3)); },
- REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::pooling3_quantized_neon_nchw<int8_t>)
- },
- {
- "poolingMxN_qasymm8_signed_neon_nchw",
- [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED)); },
- REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::poolingMxN_quantized_neon_nchw<int8_t>)
- },
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
- {
- "pooling2_fp16_neon_nchw",
- [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2)); },
- REGISTER_FP16_NEON(arm_compute::cpu::pooling2_fp16_neon_nchw)
- },
- {
- "pooling3_fp16_neon_nchw",
- [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3)); },
- REGISTER_FP16_NEON(arm_compute::cpu::pooling3_fp16_neon_nchw)
- },
- {
- "poolingMxN_fp16_neon_nchw",
- [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16)); },
- REGISTER_FP16_NEON(arm_compute::cpu::poolingMxN_fp16_neon_nchw)
- },
-#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
- {
- "pooling2_fp32_neon_nchw",
- [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2)); },
- REGISTER_FP32_NEON(arm_compute::cpu::pooling2_fp32_neon_nchw)
- },
- {
- "pooling3_fp32_neon_nchw",
- [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3)); },
- REGISTER_FP32_NEON(arm_compute::cpu::pooling3_fp32_neon_nchw)
- },
- {
- "pooling7_fp32_neon_nchw",
- [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 7)); },
- REGISTER_FP32_NEON(arm_compute::cpu::pooling7_fp32_neon_nchw)
- },
- {
- "poolingMxN_fp32_neon_nchw",
- [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32)); },
- REGISTER_FP32_NEON(arm_compute::cpu::poolingMxN_fp32_neon_nchw)
- },
-#endif /* defined(ENABLE_NCHW_KERNELS) */
-};
-
-/** Micro-kernel selector
- *
- * @param[in] dt            Data type of the pooling operation
- * @param[in] dl            Data layout of the source tensor
- * @param[in] pool_stride_x Pooling stride in the x dimension
- * @param[in] pool_size     Pooling window size
- *
- * @return A matching micro-kernel else nullptr
- */
-const PoolingKernel *get_implementation(DataType dt, DataLayout dl, int pool_stride_x, Size2D pool_size)
-{
- for(const auto &uk : available_kernels)
- {
- if(uk.is_selected({ dt, dl, pool_stride_x, pool_size }))
- {
- return &uk;
- }
- }
- return nullptr;
-}
-
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info,
- const ITensorInfo *indices, Size2D pool_size)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON(pool_size.x() == 0);
- ARM_COMPUTE_RETURN_ERROR_ON(pool_size.y() == 0);
-
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- int output_width = 0;
- int output_height = 0;
- PoolingType pool_type = pool_info.pool_type;
- const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
- const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout;
- const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-
- std::tie(output_width, output_height) = scaled_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height],
- pool_size.x(), pool_size.y(), pool_info.pad_stride_info);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1), "Calculated output dimension size is invalid");
-
- TensorInfo out_info(TensorInfo(compute_pool_shape(*src, pool_info), 1, dst->data_type()));
- std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
-
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
- if(indices)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32, DataType::F16);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method");
- }
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON(pool_type == PoolingType::L2 && is_data_type_quantized(src->data_type()));
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(src->data_type()) && !pool_info.exclude_padding && (pool_info.pool_type == PoolingType::AVG) && pool_info.pad_stride_info.has_padding()
- && (src->data_layout() == DataLayout::NHWC),
- "exclude_padding equal false is not supported for AVG Pooling with padding on quantized types");
-
- if(dst->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &out_info);
- if(indices)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_size != Size2D(2, 2)), "Pooling indices only supported for pool size 2x2");
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(indices, &out_info);
- }
- }
-
- const auto *uk = get_implementation(src->data_type(), src->data_layout(), pool_stride_x, pool_size);
- ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, ITensorInfo *indices, const PoolingLayerInfo &pool_info,
- unsigned int &num_elems_processed_per_iteration,
- BorderSize &border_size,
- int pool_size_x, int pool_size_y)
-{
- // dst auto initialization if not yet initialized
- auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_pool_shape(*src, pool_info)));
- if(indices)
- {
- // Indices auto initialization if not yet initialized
- auto_init_if_empty(*indices, (src->clone()->set_tensor_shape(compute_pool_shape(*src,
- pool_info)))
- .set_data_type(DataType::U32) /* we store the offset to the element */);
- }
- const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout;
- unsigned int num_elems_read_per_iteration = 0;
- unsigned int num_elems_horizontal_window = 0;
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const int src_width = src->dimension(idx_width);
- const int src_height = src->dimension(idx_height);
- const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
- std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
- const int pool_pad_right = pad_stride_info.pad_right();
- const int pool_pad_top = pad_stride_info.pad_top();
- const int pool_pad_left = pad_stride_info.pad_left();
- const int pool_pad_bottom = pad_stride_info.pad_bottom();
- const bool is_square = pool_size_x == pool_size_y;
- const unsigned int pooled_w = dst->dimension(idx_width);
- const unsigned int pooled_h = dst->dimension(idx_height);
-
- // If the pool size is not square, the generic MxN path is executed with these defaults
- num_elems_read_per_iteration = 1;
- num_elems_processed_per_iteration = 1;
- num_elems_horizontal_window = 1;
-
- if(is_square)
- {
- switch(src->data_type())
- {
- case DataType::QASYMM8:
- case DataType::QASYMM8_SIGNED:
- switch(pool_size_x)
- {
- case 2:
- num_elems_read_per_iteration = 16;
- num_elems_processed_per_iteration = (pool_stride_x == 2) ? 8 : 15;
- num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16;
- break;
- case 3:
- num_elems_read_per_iteration = 16;
- num_elems_processed_per_iteration = (pool_stride_x == 2) ? 7 : 14;
- num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16;
- break;
- default:
- break;
- }
- break;
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- switch(pool_size_x)
- {
- case 2:
- case 3:
- num_elems_read_per_iteration = 4;
- num_elems_processed_per_iteration = 1;
- num_elems_horizontal_window = 1;
- break;
- default:
- break;
- }
- break;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- case DataType::F32:
- switch(pool_size_x)
- {
- case 2:
- num_elems_read_per_iteration = 2;
- break;
- case 3:
- num_elems_read_per_iteration = 4; // We use vload4 for pooling3
- break;
- case 7:
- num_elems_read_per_iteration = 8; // We use vload8 for pooling7
- break;
- default:
- break;
- }
- num_elems_processed_per_iteration = 1;
- num_elems_horizontal_window = 1;
- break;
- default:
- ARM_COMPUTE_ERROR("Element size not supported");
- break;
- }
- }
-
- bool window_changed = false;
- Window win{};
- if(data_layout == DataLayout::NCHW)
- {
- // Number of iterations in X dimension
- const int num_iterations_x = (pooled_w + num_elems_processed_per_iteration - 1) / num_elems_processed_per_iteration;
- // Upper limit for the number of right/bottom border elements that are accessed
- const int upper_bound_w = ((num_iterations_x - 1) * num_elems_processed_per_iteration * pool_stride_x - pool_pad_left + num_elems_read_per_iteration) - src_width;
- const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_top + pool_size_y) - src_height;
- border_size = BorderSize(pool_pad_top, pool_pad_right, pool_pad_bottom, pool_pad_left);
- border_size.right = std::max(upper_bound_w, pool_pad_right);
- border_size.bottom = std::max(upper_bound_h, pool_pad_bottom);
- TensorShape dst_shape{ src->tensor_shape() };
- dst_shape.set(0, pooled_w);
- dst_shape.set(1, pooled_h);
- TensorInfo dst_info(src->clone()->set_tensor_shape(dst_shape));
- win = calculate_max_window(dst_info, Steps(num_elems_processed_per_iteration));
- AccessWindowStatic src_access(src, -pool_pad_left, -pool_pad_top, ceil_to_multiple(src_width + border_size.right, pool_size_x), src_height + border_size.bottom);
- AccessWindowHorizontal dst_access(dst, 0, num_elems_horizontal_window);
- if(indices)
- {
- AccessWindowHorizontal indices_access(indices, 0, num_elems_horizontal_window);
- window_changed = update_window_and_padding(win, src_access, dst_access, indices_access);
- }
- else
- {
- window_changed = update_window_and_padding(win, src_access, dst_access);
- }
- dst_access.set_valid_region(win, ValidRegion(Coordinates(), dst->tensor_shape()));
-
- border_size = src->padding();
- }
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-} // namespace
-
-BorderSize CpuPool2dKernel::border_size() const
-{
- return _border_size;
-}
-
-void CpuPool2dKernel::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
- const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
- const bool is_global_pooling = pool_info.is_global_pooling;
-
- // Get data layout
- const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout;
- const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-
- // Update pool size in case of global pooling
- const Size2D pool_size(
- is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width,
- is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height);
-
- // Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, pool_info, indices, pool_size));
-
- // Set instance variables
- _pool_info = pool_info;
- _data_layout = src->data_layout();
- _pool_size = pool_size;
- _pool_stride_x = pad_stride_info.stride().first;
-
- if(_data_layout == DataLayout::NHWC)
- {
- // Configure kernel window
- Window win = calculate_max_window(*dst, Steps());
- ICpuKernel::configure(win);
- }
- else
- {
- // Configure kernel window
- auto win_config = validate_and_configure_window(src, dst, indices, pool_info, _num_elems_processed_per_iteration,
- _border_size, pool_size.x(), pool_size.y());
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICpuKernel::configure(win_config.second);
- }
-}
-
-Status CpuPool2dKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
-
- unsigned int num_elems_processed_per_iteration = 0;
- BorderSize border_size(0);
-
- const bool is_global_pooling = pool_info.is_global_pooling;
-
- // Get data layout
- const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout;
- const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-
- unsigned int pool_size_x = is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width;
- unsigned int pool_size_y = is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height;
-
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, pool_info, indices, Size2D(pool_size_x, pool_size_y)));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(),
- (indices) ? indices->clone().get() : nullptr, pool_info, num_elems_processed_per_iteration, border_size,
- pool_size_x, pool_size_y)
- .first);
-
- return Status{};
-}
-
-void CpuPool2dKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
-
- const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
- ITensor *dst = tensors.get_tensor(TensorType::ACL_DST_0);
- ITensor *indices = tensors.get_tensor(TensorType::ACL_DST_1);
-
- const unsigned int pool_stride_x = _pool_info.pad_stride_info.stride().first;
- const unsigned int pool_stride_y = _pool_info.pad_stride_info.stride().second;
- const unsigned int pool_size = _pool_info.pool_size.width;
-
- Window window_src(window);
- if(_data_layout == DataLayout::NCHW)
- {
- // Set the src window step in the x and y directions
- unsigned int window_x_inc = 0;
- switch(src->info()->data_type())
- {
- case DataType::QASYMM8:
- case DataType::QASYMM8_SIGNED:
- {
- window_x_inc = pool_stride_x;
- if((pool_size == 2 || pool_size == 3) && pool_stride_x < 3)
- {
- window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration;
- }
- break;
- }
-
- case DataType::F16:
- case DataType::F32:
- {
- window_x_inc = pool_stride_x;
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR("Not supported");
- }
- }
- window_src.set(Window::DimX, Window::Dimension(window.x().start() * pool_stride_x, window.x().end() * pool_stride_x, window_x_inc));
- window_src.set(Window::DimY, Window::Dimension(window.y().start() * pool_stride_y, window.y().end() * pool_stride_y, pool_stride_y));
- }
- else
- {
- window_src.set(Window::DimX, Window::Dimension(0, 1, 1));
- window_src.set(Window::DimY, Window::Dimension(0, src->info()->dimension(1), pool_stride_x));
- window_src.set(Window::DimZ, Window::Dimension(0, src->info()->dimension(2), pool_stride_y));
- }
-
- const auto *uk = get_implementation(src->info()->data_type(), _data_layout, _pool_stride_x, _pool_size);
- ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
-
- uk->ukernel(src, dst, indices, _pool_info, window_src, window);
-}
-
-const char *CpuPool2dKernel::name() const
-{
- return "CpuPool2dKernel";
-}
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
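get_implementation() above is a first-match scan over a static table of {name, predicate, ukernel} entries. The pattern in miniature, with hypothetical names and a trivial selector struct:

    // Self-contained sketch of the micro-kernel selector pattern: walk a
    // static table of {predicate, kernel} entries and return the first
    // entry whose predicate accepts the selection data.
    struct SelectorData
    {
        int  pool_size_x;
        bool is_nhwc;
    };

    using KernelFn = void (*)(const float *, float *);

    struct KernelEntry
    {
        const char *name;
        bool (*is_selected)(const SelectorData &);
        KernelFn ukernel;
    };

    void pool2_nhwc(const float *, float *) {}
    void pool_generic(const float *, float *) {}

    static const KernelEntry available[] = {
        { "pool2_nhwc", [](const SelectorData &d) { return d.is_nhwc && d.pool_size_x == 2; }, &pool2_nhwc },
        { "pool_generic", [](const SelectorData &) { return true; }, &pool_generic }, // fallback
    };

    const KernelEntry *select_kernel(const SelectorData &data)
    {
        for(const auto &entry : available)
        {
            if(entry.is_selected(data))
            {
                return &entry;
            }
        }
        return nullptr;
    }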
diff --git a/src/core/cpu/kernels/CpuPool2dKernel.h b/src/core/cpu/kernels/CpuPool2dKernel.h
deleted file mode 100644
index 95298004e9..0000000000
--- a/src/core/cpu/kernels/CpuPool2dKernel.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_POOL2D_KERNEL_H
-#define ARM_COMPUTE_CPU_POOL2D_KERNEL_H
-
-#include "arm_compute/core/Types.h"
-#include "src/core/common/Macros.h"
-#include "src/core/cpu/ICpuKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-/** Interface for the pooling layer kernel */
-class CpuPool2dKernel : public ICpuKernel
-{
-public:
- /** Default constructor */
- CpuPool2dKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuPool2dKernel);
- /** Configure kernel for a given list of arguments
- *
- * @note F16 is supported for pool sizes 2 and 3 only
- *
- * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[out] dst Destination tensor info. Data types supported: Same as @p src.
- * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
- * @param[out] indices (optional) The indices of the maximal values. Data type supported: U32.
- */
- void configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref CpuPool2dKernel::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
- BorderSize border_size() const override;
- const char *name() const override;
-
-private:
- PoolingLayerInfo _pool_info{};
- DataLayout _data_layout{ DataLayout::UNKNOWN };
- unsigned int _num_elems_processed_per_iteration{ 0 };
- BorderSize _border_size{ 0 };
- Size2D _pool_size{};
- int _pool_stride_x{};
-};
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CPU_POOL2D_KERNEL_H */
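The output sizing validated above follows the usual scaled-dimensions arithmetic. A sketch of the floor-mode case (the library's scaled_dimensions_signed also handles ceil rounding and clamping, omitted here):

    // Pooled output extent in one dimension, floor mode, for non-negative
    // operands (integer division truncates toward zero, which floors here).
    inline int pooled_dim(int in, int pool, int stride, int pad_before, int pad_after)
    {
        return (in + pad_before + pad_after - pool) / stride + 1;
    }

    // Example: a 7-wide input, 2-wide pool, stride 2, no padding:
    // pooled_dim(7, 2, 2, 0, 0) == 3.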
diff --git a/src/core/cpu/kernels/CpuQuantizeKernel.cpp b/src/core/cpu/kernels/CpuQuantizeKernel.cpp
deleted file mode 100644
index 8ca81e8b11..0000000000
--- a/src/core/cpu/kernels/CpuQuantizeKernel.cpp
+++ /dev/null
@@ -1,266 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/cpu/kernels/CpuQuantizeKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/NEON/NEAsymm.h"
-#include "src/core/NEON/NEMath.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include "src/core/CPP/Validate.h"
-
-#include <arm_neon.h>
-#include <map>
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-namespace
-{
-constexpr auto window_step = 16;
-
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QASYMM16);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
-
- return Status{};
-}
-
-template <typename T>
-inline float32x4x4_t load_value(const T *input_ptr)
-{
- using Tx16_t = typename wrapper::traits::neon_vector<T, 16>::type;
- return arm_compute::convert_to_float32x4x4<Tx16_t>(wrapper::vloadq(input_ptr));
-}
-
-template <>
-inline float32x4x4_t load_value(const float *input_ptr)
-{
- return { wrapper::vloadq(input_ptr),
- wrapper::vloadq(input_ptr + 4),
- wrapper::vloadq(input_ptr + 8),
- wrapper::vloadq(input_ptr + 12) };
-}
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template <>
-inline float32x4x4_t load_value(const float16_t *input_ptr)
-{
- return { vcvt_f32_f16(wrapper::vload(input_ptr)),
- vcvt_f32_f16(wrapper::vload(input_ptr + 4)),
- vcvt_f32_f16(wrapper::vload(input_ptr + 8)),
- vcvt_f32_f16(wrapper::vload(input_ptr + 12)) };
-}
-
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-template <typename element_type>
-using vector_type = wrapper::traits::neon_vector_t<element_type, window_step>;
-
-template <typename quantized_type>
-vector_type<quantized_type> vquantize_qasymm8(const float32x4x4_t &qv, const UniformQuantizationInfo &qi);
-
-template <>
-vector_type<uint8_t> vquantize_qasymm8<uint8_t>(const float32x4x4_t &qv, const UniformQuantizationInfo &qi)
-{
- return vquantize(qv, qi);
-}
-
-template <>
-vector_type<int8_t> vquantize_qasymm8<int8_t>(const float32x4x4_t &qv, const UniformQuantizationInfo &qi)
-{
- return vquantize_signed(qv, qi);
-}
-
-} // namespace
-
-void CpuQuantizeKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst));
-
- static const std::map<std::string, QuantizeFunctionExecutorPtr> quant_map =
- {
- { "op_QASYMM8_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<uint8_t, uint8_t> },
- { "op_QASYMM8_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<uint8_t, int8_t> },
- { "op_QASYMM8_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<uint8_t> },
-
- { "op_QASYMM8_SIGNED_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<int8_t, uint8_t> },
- { "op_QASYMM8_SIGNED_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<int8_t, int8_t> },
- { "op_QASYMM8_SIGNED_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<int8_t> },
-
- { "op_F32_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<float, uint8_t> },
- { "op_F32_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<float, int8_t> },
- { "op_F32_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<float> },
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- { "op_F16_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<float16_t, uint8_t> },
- { "op_F16_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<float16_t, int8_t> },
- { "op_F16_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<float16_t> },
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC*/
- };
-
- std::string function_to_call("op_");
- function_to_call += string_from_data_type(src->data_type()) + "_";
- function_to_call += string_from_data_type(dst->data_type());
-
- auto it = quant_map.find(function_to_call);
-
- if(it == quant_map.end())
- {
- ARM_COMPUTE_ERROR("Unsupported combination of input and output data types");
- }
- _func = it->second;
-
- // Configure kernel window
- Window win_config = calculate_max_window(*src, Steps());
- ICpuKernel::configure(win_config);
-}
-
-Status CpuQuantizeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst));
- return Status{};
-}
-
-template <typename TIn, typename TOut>
-void CpuQuantizeKernel::run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window)
-{
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform();
- UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform();
- if(is_data_type_quantized_asymmetric(src->info()->data_type()))
- {
- uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo);
- }
-#ifdef __aarch64__
- constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN;
-#else //__aarch64__
- constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO;
-#endif //__aarch64__
-
- // Collapse window and reset first dimension to handle tail calculations manually
- Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
- win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input(src, win_collapsed);
- Iterator output(dst, win_collapsed);
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- auto input_ptr = reinterpret_cast<const TIn *>(input.ptr());
- auto output_ptr = reinterpret_cast<TOut *>(output.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step); x += window_step)
- {
- wrapper::vstore(&output_ptr[x], vquantize_qasymm8<TOut>(load_value(&input_ptr[x]), uqinfo));
- }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- output_ptr[x] = Qasymm8QuantizationHelper<TOut>::quantize(input_ptr[x], uqinfo, rounding_policy);
- }
- },
- input, output);
-}
-
-template <typename T>
-void CpuQuantizeKernel::run_quantize_qasymm16(const ITensor *src, ITensor *dst, const Window &window)
-{
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform();
- UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform();
- if(is_data_type_quantized_asymmetric(src->info()->data_type()))
- {
- uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo);
- }
-#ifdef __aarch64__
- constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN;
-#else //__aarch64__
- constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO;
-#endif //__aarch64__
-
- // Collapse window and reset first dimension to handle tail calculations manually
- Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
- win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input(src, win_collapsed);
- Iterator output(dst, win_collapsed);
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- auto input_ptr = reinterpret_cast<const T *>(input.ptr());
- auto output_ptr = reinterpret_cast<uint16_t *>(output.ptr());
-
- int x = window_start_x;
- for(; x <= (window_end_x - window_step); x += window_step)
- {
- uint16x8x2_t tmp = vquantize_qasymm16(load_value(&input_ptr[x]), uqinfo);
- vst1q_u16(&output_ptr[x], tmp.val[0]);
- vst1q_u16(&output_ptr[x + 8], tmp.val[1]);
- }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- output_ptr[x] = quantize_qasymm16(input_ptr[x], uqinfo, rounding_policy);
- }
- },
- input, output);
-}
-
-void CpuQuantizeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
- ARM_COMPUTE_ERROR_ON(_func == nullptr);
-
- const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
- auto dst = tensors.get_tensor(TensorType::ACL_DST);
- (this->*_func)(src, dst, window);
-}
-
-const char *CpuQuantizeKernel::name() const
-{
- return "CpuQuantizeKernel";
-}
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
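The scalar leftover loops above reduce to the textbook asymmetric-quantization formula. Standalone, with the library's Qasymm8QuantizationHelper replaced by a hand-rolled sketch:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // q = clamp(round(x / scale) + offset, 0, 255). std::lround rounds half
    // away from zero; the kernel itself uses TO_NEAREST_EVEN on aarch64 and
    // TO_ZERO elsewhere.
    inline uint8_t quantize_u8_sketch(float x, float scale, int offset)
    {
        const int q = static_cast<int>(std::lround(x / scale)) + offset;
        return static_cast<uint8_t>(std::min(255, std::max(0, q)));
    }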
diff --git a/src/core/cpu/kernels/CpuQuantizeKernel.h b/src/core/cpu/kernels/CpuQuantizeKernel.h
deleted file mode 100644
index d3422d3fbd..0000000000
--- a/src/core/cpu/kernels/CpuQuantizeKernel.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_QUANTIZE_KERNEL_H
-#define ARM_COMPUTE_CPU_QUANTIZE_KERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/cpu/ICpuKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-/** Interface for the quantization layer kernel.
- *
- * @note The implementation supports only 3D input tensors
- */
-class CpuQuantizeKernel : public ICpuKernel
-{
-public:
- /** Default constructor */
- CpuQuantizeKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuQuantizeKernel);
- /** Set the input, output.
- *
- * @param[in] src Source tensor info. The dimensions over the third will be interpreted as batches. Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16.
- * @param[out] dst Destination tensor info with the same dimensions of input. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16.
- *
- * @note Output auto initialization is not supported by this kernel
- */
- void configure(const ITensorInfo *src, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to @ref CpuQuantizeKernel::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
- const char *name() const override;
-
-private:
- /** Common signature for all the specialised @ref CpuQuantizeKernel functions
- *
- * @param[in]  src    Source tensor to quantize.
- * @param[out] dst    Destination tensor.
- * @param[in]  window Region on which to execute the kernel.
- */
- using QuantizeFunctionExecutorPtr = void (CpuQuantizeKernel::*)(const ITensor *src, ITensor *dst, const Window &window);
- /** Function to apply QASYMM8 or QASYMM8_SIGNED quantization on a tensor.
- *
- * @param[in]  src    Source tensor to quantize.
- * @param[out] dst    Destination tensor.
- * @param[in]  window Region on which to execute the kernel.
- */
- template <typename TIn, typename TOut>
- void run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window);
- /** Function to apply QASYMM16 quantization on a tensor.
- *
- * @param[in]  src    Source tensor to quantize.
- * @param[out] dst    Destination tensor.
- * @param[in]  window Region on which to execute the kernel.
- */
- template <typename T>
- void run_quantize_qasymm16(const ITensor *src, ITensor *dst, const Window &window);
-
- QuantizeFunctionExecutorPtr _func{ nullptr };
-};
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_QUANTIZE_KERNEL_H */
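configure() above derives a string key such as "op_F32_QASYMM8" from the source and destination data types and resolves it in a static map of member-function pointers. The dispatch idiom in miniature (class and method names are illustrative, not the library API):

    #include <map>
    #include <string>

    class Dispatcher
    {
    public:
        void configure(const std::string &src_type, const std::string &dst_type)
        {
            // Static key -> member-function-pointer table, built once.
            static const std::map<std::string, void (Dispatcher::*)()> table = {
                { "op_F32_QASYMM8", &Dispatcher::run_f32_to_u8 },
                { "op_F32_QASYMM8_SIGNED", &Dispatcher::run_f32_to_s8 },
            };
            const auto it = table.find("op_" + src_type + "_" + dst_type);
            _func = (it != table.end()) ? it->second : nullptr;
        }
        void run()
        {
            if(_func != nullptr)
            {
                (this->*_func)(); // invoke the selected member function
            }
        }

    private:
        void run_f32_to_u8() {}
        void run_f32_to_s8() {}
        void (Dispatcher::*_func)(){ nullptr };
    };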
diff --git a/src/core/cpu/kernels/CpuReshapeKernel.cpp b/src/core/cpu/kernels/CpuReshapeKernel.cpp
deleted file mode 100644
index 5b717b9bba..0000000000
--- a/src/core/cpu/kernels/CpuReshapeKernel.cpp
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/cpu/kernels/CpuReshapeKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/INEKernel.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <cstdint>
-
-/** [NEReshapeLayerKernel Kernel] **/
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
- // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions.
- ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
-
- if(dst->tensor_shape().total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() != dst->tensor_shape().total_size());
- }
-
- return Status{};
-}
-
-template <typename T>
-inline void reshape_tensor(const Window &window, const ITensor *src, ITensor *dst)
-{
- const TensorShape &src_shape = src->info()->tensor_shape();
- const TensorShape &dst_shape = dst->info()->tensor_shape();
- Coordinates dst_coord{};
-
- Iterator src_it(src, window);
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- dst_coord = index2coords(dst_shape, coords2index(src_shape, id));
- *reinterpret_cast<T *>(dst->ptr_to_element(dst_coord)) = *reinterpret_cast<T *>(src_it.ptr());
- },
- src_it);
-}
-} // namespace
-
-void CpuReshapeKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst));
- ARM_COMPUTE_UNUSED(dst);
-
- // Configure kernel window
- Window win = calculate_max_window(*src);
-
- ICpuKernel::configure(win);
-}
-
-Status CpuReshapeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst));
-
- return Status{};
-}
-
-void CpuReshapeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
-
- const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
- auto dst = tensors.get_tensor(TensorType::ACL_DST);
-
- switch(src->info()->data_type())
- {
- case DataType::U8:
- case DataType::S8:
- case DataType::QASYMM8:
- case DataType::QASYMM8_SIGNED:
- reshape_tensor<uint8_t>(window, src, dst);
- break;
- case DataType::U16:
- case DataType::S16:
- case DataType::F16:
- reshape_tensor<uint16_t>(window, src, dst);
- break;
- case DataType::U32:
- case DataType::S32:
- case DataType::F32:
- reshape_tensor<uint32_t>(window, src, dst);
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported data type!");
- }
-}
-
-const char *CpuReshapeKernel::name() const
-{
- return "CpuReshapeKernel";
-}
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
-/** [NEReshapeLayerKernel Kernel] **/
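
The element-wise copy above works because a reshape preserves each element's flat (linearized) index: the destination coordinate is recovered by round-tripping through that index via coords2index/index2coords. A self-contained sketch of the two mappings, assuming the innermost dimension varies fastest (illustrative stand-ins for the ACL helpers, not the helpers themselves):

    #include <cstddef>
    #include <vector>

    // Flatten a coordinate into a linear index, innermost dimension first.
    static std::size_t coords_to_index(const std::vector<std::size_t> &shape, const std::vector<std::size_t> &coord)
    {
        std::size_t index  = 0;
        std::size_t stride = 1;
        for(std::size_t d = 0; d < shape.size(); ++d)
        {
            index += coord[d] * stride;
            stride *= shape[d];
        }
        return index;
    }

    // Inverse mapping: recover per-dimension coordinates from a linear index.
    static std::vector<std::size_t> index_to_coords(const std::vector<std::size_t> &shape, std::size_t index)
    {
        std::vector<std::size_t> coord(shape.size());
        for(std::size_t d = 0; d < shape.size(); ++d)
        {
            coord[d] = index % shape[d];
            index /= shape[d];
        }
        return coord;
    }

For each source coordinate id, the kernel effectively computes dst_coord = index_to_coords(dst_shape, coords_to_index(src_shape, id)) and copies a single element.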
diff --git a/src/core/cpu/kernels/CpuReshapeKernel.h b/src/core/cpu/kernels/CpuReshapeKernel.h
deleted file mode 100644
index add6782b9e..0000000000
--- a/src/core/cpu/kernels/CpuReshapeKernel.h
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_RESHAPE_KERNEL_H
-#define ARM_COMPUTE_CPU_RESHAPE_KERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/cpu/ICpuKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-/** Interface for the kernel to perform tensor reshaping */
-class CpuReshapeKernel : public ICpuKernel
-{
-public:
- CpuReshapeKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuReshapeKernel);
- /** Configure kernel for a given list of arguments
- *
- * @param[in] src Source tensor info. Data type supported: All
-     * @param[out] dst Destination tensor info. Data type supported: Same as @p src
- */
- void configure(const ITensorInfo *src, ITensorInfo *dst);
-
- /** Static function to check if given info will lead to a valid configuration of @ref CpuReshapeKernel
- *
- * @param[in] src Source tensor info. Data type supported: All
- * @param[in] dst Destination tensor info. Data type supported: Same as @p src
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
- const char *name() const override;
-};
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_RESHAPE_KERNEL_H */
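
A hedged sketch of how a caller typically drives this interface; src_info, dst_info, src and dst are hypothetical placeholders, and tensor allocation and scheduling are elided:

    // Hypothetical call sequence; not taken from the library's tests.
    CpuReshapeKernel kernel;
    ARM_COMPUTE_ERROR_THROW_ON(CpuReshapeKernel::validate(&src_info, &dst_info));
    kernel.configure(&src_info, &dst_info);

    ITensorPack pack;
    pack.add_const_tensor(TensorType::ACL_SRC, &src);
    pack.add_tensor(TensorType::ACL_DST, &dst);
    // A scheduler then splits kernel.window() into sub-windows and calls
    // kernel.run_op(pack, sub_window, thread_info) on each worker thread.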
diff --git a/src/core/cpu/kernels/CpuScaleKernel.cpp b/src/core/cpu/kernels/CpuScaleKernel.cpp
deleted file mode 100644
index 29475fa63f..0000000000
--- a/src/core/cpu/kernels/CpuScaleKernel.cpp
+++ /dev/null
@@ -1,621 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/cpu/kernels/CpuScaleKernel.h"
-
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/Utility.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/common/Registrars.h"
-#include "src/core/cpu/kernels/scale/neon/list.h"
-#include "src/core/cpu/kernels/scale/sve/list.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/ScaleHelpers.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "src/core/utils/ScaleUtils.h"
-#include "support/Rounding.h"
-
-#include <arm_neon.h>
-#include <map>
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-namespace
-{
-struct ScaleSelectorData
-{
- DataType dt;
-};
-using ScaleSelectorPtr = std::add_pointer<bool(const ScaleSelectorData &data)>::type;
-using ScaleKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, const ITensor *, const ITensor *, const ITensor *,
- InterpolationPolicy, BorderMode, PixelValue, float, bool, const Window &)>::type;
-struct ScaleKernel
-{
- const char *name;
- const ScaleSelectorPtr is_selected;
- ScaleKernelPtr ukernel;
-};
-
-static const ScaleKernel available_kernels[] =
-{
-#if defined(ENABLE_SVE)
- {
- "fp16_sve_scale",
- [](const ScaleSelectorData & data) { return data.dt == DataType::F16; },
- REGISTER_FP16_SVE(arm_compute::cpu::fp16_sve_scale)
- },
- {
- "f32_sve_scale",
- [](const ScaleSelectorData & data) { return data.dt == DataType::F32; },
- REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_scale)
- },
- {
- "qasymm8_sve_scale",
- [](const ScaleSelectorData & data) { return data.dt == DataType::QASYMM8; },
- REGISTER_QASYMM8_SVE(arm_compute::cpu::qasymm8_sve_scale)
- },
- {
- "qasymm8_signed_sve_scale",
- [](const ScaleSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; },
- REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::qasymm8_signed_sve_scale)
- },
- {
- "u8_sve_scale",
- [](const ScaleSelectorData & data) { return data.dt == DataType::U8; },
- REGISTER_INTEGER_SVE(arm_compute::cpu::u8_sve_scale)
- },
- {
- "s16_sve_scale",
- [](const ScaleSelectorData & data) { return data.dt == DataType::S16; },
- REGISTER_INTEGER_SVE(arm_compute::cpu::s16_sve_scale)
- },
-#endif /* defined(ENABLE_SVE) */
-#if defined(ENABLE_NEON)
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
- {
- "common_neon_scale",
- [](const ScaleSelectorData & data) { return data.dt == DataType::F16; },
- REGISTER_FP16_NEON(arm_compute::cpu::common_neon_scale<float16_t>)
- },
-#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
- {
- "common_neon_scale",
- [](const ScaleSelectorData & data) { return data.dt == DataType::F32; },
- REGISTER_FP32_NEON(arm_compute::cpu::common_neon_scale<float>)
- },
- {
- "qasymm8_neon_scale",
- [](const ScaleSelectorData & data) { return data.dt == DataType::QASYMM8; },
- REGISTER_QASYMM8_NEON(arm_compute::cpu::qasymm8_neon_scale)
- },
- {
- "qasymm8_signed_neon_scale",
- [](const ScaleSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; },
- REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::qasymm8_signed_neon_scale)
- },
- {
- "common_neon_scale",
- [](const ScaleSelectorData & data) { return data.dt == DataType::U8; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::common_neon_scale<uint8_t>)
- },
- {
- "common_neon_scale",
- [](const ScaleSelectorData & data) { return data.dt == DataType::S16; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::common_neon_scale<int16_t>)
- },
-#endif /* defined(ENABLE_NEON) */
-};
-
-/** Micro-kernel selector
- *
- * @param[in] data Selection data passed to help pick the appropriate micro-kernel
- *
- * @return A matching micro-kernel else nullptr
- */
-const ScaleKernel *get_implementation(const ScaleSelectorData &data)
-{
- for(const auto &uk : available_kernels)
- {
- if(uk.is_selected(data))
- {
- return &uk;
- }
- }
- return nullptr;
-}
-
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dx, const ITensorInfo *dy,
- const ITensorInfo *offsets, ITensorInfo *dst, const ScaleKernelInfo &info)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
-
-    const auto *uk = get_implementation(ScaleSelectorData{ src->data_type() });
-    ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
-
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON(dst == src);
- ARM_COMPUTE_RETURN_ERROR_ON(info.sampling_policy != SamplingPolicy::CENTER && info.sampling_policy != SamplingPolicy::TOP_LEFT);
- ARM_COMPUTE_UNUSED(info.constant_border_value);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.use_padding, "Padding is not supported");
-
- const DataLayout data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout;
- const auto width_index = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const auto height_index = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const auto output_width = dst->dimension(width_index);
- const auto output_height = dst->dimension(height_index);
- ARM_COMPUTE_RETURN_ERROR_ON(output_width == 0);
- ARM_COMPUTE_RETURN_ERROR_ON(output_height == 0);
-
- if(info.interpolation_policy == InterpolationPolicy::NEAREST_NEIGHBOR)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(offsets, 1, DataType::S32);
- }
-
- if(info.interpolation_policy == InterpolationPolicy::BILINEAR)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(offsets, 1, DataType::S32);
- if(dx != nullptr && dy != nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dx, 1, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dy, 1, DataType::F32);
- }
- }
-
- ARM_COMPUTE_RETURN_ERROR_ON(info.align_corners && !scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy));
-
- if(info.interpolation_policy == InterpolationPolicy::AREA)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(data_layout != DataLayout::NCHW);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::U8);
- }
-
- return Status{};
-}
-} // namespace
-
-CpuScaleKernel::CpuScaleKernel()
- : _func(nullptr), _policy(), _border_mode(), _constant_border_value(PixelValue()), _sampling_offset(0), _align_corners(false), _data_layout(DataLayout::UNKNOWN)
-{
-}
-
-void CpuScaleKernel::configure(const ITensorInfo *src, const ITensorInfo *dx, const ITensorInfo *dy, const ITensorInfo *offsets,
- ITensorInfo *dst, const ScaleKernelInfo &info)
-{
- ARM_COMPUTE_UNUSED(dx, dy, offsets);
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
- // Perform validation step
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dx, dy, offsets, dst, info));
-
- // Get data layout and width/height indices
- _data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout;
- const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
-
- _policy = info.interpolation_policy;
- _border_mode = info.border_mode;
- _constant_border_value = info.constant_border_value;
- _align_corners = info.align_corners;
-
- if(info.sampling_policy == SamplingPolicy::CENTER)
- {
- _sampling_offset = 0.5f;
- }
-
- // Compute the ratio between source width/height and destination width/height
- const auto wr = scale_utils::calculate_resize_ratio(src->dimension(idx_width), dst->dimension(idx_width), _align_corners);
- const auto hr = scale_utils::calculate_resize_ratio(src->dimension(idx_height), dst->dimension(idx_height), _align_corners);
-
- // Area interpolation behaves as Nearest Neighbour in case of up-sampling
- _policy = (_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) ? InterpolationPolicy::NEAREST_NEIGHBOR : _policy;
-
- if(_border_mode == BorderMode::UNDEFINED)
- {
- _border_mode = BorderMode::CONSTANT;
- _constant_border_value = PixelValue();
- }
-
-#ifdef ENABLE_NCHW_KERNELS
- // Configure scale function to run
- if(_data_layout == DataLayout::NCHW)
- {
- std::string function_to_call("scale_");
- function_to_call += string_from_data_type(src->data_type()) + "_";
- function_to_call += string_from_data_layout(_data_layout) + "_";
- function_to_call += string_from_interpolation_policy(_policy);
-
- static std::map<std::string, ScaleFunctionPtr> map_function =
- {
- { "scale_U8_NCHW_AREA_CONSTANT", &CpuScaleKernel::scale_area_nchw_u8 },
-
- { "scale_U8_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_nchw<uint8_t> },
- { "scale_U8_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<uint8_t> },
-
- { "scale_QASYMM8_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_qasymm<uint8_t> },
- { "scale_QASYMM8_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<uint8_t> },
-
- { "scale_QASYMM8_SIGNED_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_qasymm<int8_t> },
- { "scale_QASYMM8_SIGNED_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<int8_t> },
-
- { "scale_S16_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_nchw<int16_t> },
- { "scale_S16_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<int16_t> },
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- { "scale_F16_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_nchw<float16_t> },
- { "scale_F16_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<float16_t> },
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
- { "scale_F32_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_nchw<float> },
- { "scale_F32_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<float> },
- };
- auto it = map_function.find(function_to_call);
- if(it != map_function.end())
- {
- _func = it->second;
- }
- }
-#endif // ENABLE_NCHW_KERNELS
-
- // Configure window
- Window win = calculate_max_window(*dst, Steps());
- ICpuKernel::configure(win);
-}
-
-#ifdef ENABLE_NCHW_KERNELS
-template <typename T>
-void CpuScaleKernel::scale_nearest_nchw(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window)
-{
- ARM_COMPUTE_UNUSED(dx, dy);
- const size_t in_stride_x = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
-
- // Compute the ratio between source height and destination height
- const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), _align_corners);
-
- // Don't increment in X and Y direction for the input tensor
- // A pointer to the start of this plane is needed as base for the precomputed offsets
- Window win_in(window);
- win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
- win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
-
- // Set offsets window
- Window win_off;
- win_off.set(Window::DimX, window[Window::DimX]);
- win_off.set(Window::DimY, window[Window::DimY]);
- for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d)
- {
- win_off.set(d, Window::Dimension(0, 0, 0));
- }
-
- // Create iterators
- Iterator src_i(src, win_in);
- Iterator dst_i(dst, window);
- Iterator offsets_i(offsets, win_off);
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets_i.ptr());
-        const auto in_yi = static_cast<int32_t>(_align_corners ? utils::rounding::round_half_away_from_zero((id.y() + _sampling_offset) * hr)
-                                                               : std::floor((id.y() + _sampling_offset) * hr));
- const int32_t offset_row = in_yi * in_stride_x;
- *reinterpret_cast<T *>(dst_i.ptr()) = *(reinterpret_cast<const T *>(src_i.ptr()) + offsets_ptr[0] + offset_row);
- },
- src_i, offsets_i, dst_i);
-}
-
-template <typename T>
-void CpuScaleKernel::scale_bilinear_nchw(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window)
-{
- // Compute the ratio between source height and destination height
- const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), _align_corners);
- Window win_off;
- win_off.set(Window::DimX, window.x());
- win_off.set(Window::DimY, window.y());
-
- // Don't increment in X and Y direction for the input tensor
- // A pointer to the start of this plane is needed as base for the precomputed offsets
- Window win_in(window);
- win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
- win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
-
- for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d)
- {
- win_off.set(d, Window::Dimension(0, 0, 0));
- }
-
- Iterator src_i(src, win_in);
- Iterator dst_i(dst, window);
- Iterator offsets_i(offsets, win_off);
- Iterator dx_i(dx, win_off);
- Iterator dy_i(dy, win_off);
-
- const int32_t in_dim_w = src->info()->dimension(0);
- const int32_t in_dim_h = src->info()->dimension(1);
- const int32_t in_stride_w = in_dim_w + src->info()->padding().left + src->info()->padding().right;
-
- if(_border_mode == BorderMode::CONSTANT)
- {
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- using ConstType = typename std::conditional<std::is_same<T, float16_t>::value, half, T>::type;
-#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- using ConstType = T;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- const T const_border_value = static_cast<T>(_constant_border_value.get<ConstType>());
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const int32_t index_h = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset);
- const auto index_w = *(reinterpret_cast<const int32_t *>(offsets_i.ptr()));
- const auto dx_val = *(reinterpret_cast<const float *>(dx_i.ptr()));
- const auto dy_val = *(reinterpret_cast<const float *>(dy_i.ptr()));
- const auto pixel_row_ptr = reinterpret_cast<const T *>(src_i.ptr());
-
-            const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h) ? (*(pixel_row_ptr + index_w + index_h * in_stride_w)) : const_border_value;
-            const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h) ? (*(pixel_row_ptr + index_w + 1 + index_h * in_stride_w)) : const_border_value;
-            const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h && index_h < in_dim_h - 1) ? (*(pixel_row_ptr + index_w + index_h * in_stride_w + in_stride_w)) : const_border_value;
-            const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h && index_h < in_dim_h - 1) ? (*(pixel_row_ptr + index_w + 1 + index_h * in_stride_w + in_stride_w)) : const_border_value;
-
- *reinterpret_cast<T *>(dst_i.ptr()) = static_cast<T>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
- },
- src_i, offsets_i, dx_i, dy_i, dst_i);
- }
- else if(_border_mode == BorderMode::REPLICATE)
- {
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const int index_h = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset);
- const auto index_w = *(reinterpret_cast<const int32_t *>(offsets_i.ptr()));
- const auto dx_val = *(reinterpret_cast<const float *>(dx_i.ptr()));
- const auto dy_val = *(reinterpret_cast<const float *>(dy_i.ptr()));
- const auto pixel_row_ptr = reinterpret_cast<const T *>(src_i.ptr());
-
- auto clamped_x = utility::clamp<int>(index_w, 0, in_dim_w - 1);
- auto clamped_x1 = utility::clamp<int>(index_w + 1, 0, in_dim_w - 1);
- auto clamped_y = utility::clamp<int>(index_h, 0, in_dim_h - 1);
- auto clamped_y1 = utility::clamp<int>(index_h + 1, 0, in_dim_h - 1);
-
- const auto a00 = *(pixel_row_ptr + clamped_x + clamped_y * in_stride_w);
- const auto a01 = *(pixel_row_ptr + clamped_x1 + clamped_y * in_stride_w);
- const auto a10 = *(pixel_row_ptr + clamped_x + clamped_y1 * in_stride_w);
- const auto a11 = *(pixel_row_ptr + clamped_x1 + clamped_y1 * in_stride_w);
-
- *reinterpret_cast<T *>(dst_i.ptr()) = static_cast<T>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
- },
- src_i, offsets_i, dx_i, dy_i, dst_i);
- }
- else
- {
- ARM_COMPUTE_ERROR("Not implemented");
- }
-}
-
-void CpuScaleKernel::scale_area_nchw_u8(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window)
-{
- ARM_COMPUTE_UNUSED(dx, dy, offsets);
- using namespace scale_helpers;
-
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::U8);
-
- // Don't increment in width/height/channels for the input tensor
- // A pointer to the start of this plane is needed as base for the precomputed offsets
- Window win_in(window);
- win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
- win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
- win_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
- Iterator src_i(src, win_in);
- Iterator dst_i(dst, window);
-
- const auto wr = scale_utils::calculate_resize_ratio(src->info()->dimension(0), dst->info()->dimension(0), _align_corners);
- const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), _align_corners);
- const auto w = src->info()->dimension(0);
- const auto h = src->info()->dimension(1);
- const size_t in_stride = src->info()->strides_in_bytes()[1];
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto in_ptr = reinterpret_cast<const uint8_t *>(src_i.ptr());
-
- uint8x8_t tmp0 = vdup_n_u8(0);
- tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x(), id.y()), tmp0, 0);
- tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 1, id.y()), tmp0, 1);
- tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 2, id.y()), tmp0, 2);
- tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 3, id.y()), tmp0, 3);
- tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 4, id.y()), tmp0, 4);
- tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 5, id.y()), tmp0, 5);
- tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 6, id.y()), tmp0, 6);
- tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 7, id.y()), tmp0, 7);
-
- uint8x8_t tmp1 = vdup_n_u8(0);
- tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 8, id.y()), tmp1, 0);
- tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 9, id.y()), tmp1, 1);
- tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 10, id.y()), tmp1, 2);
- tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 11, id.y()), tmp1, 3);
- tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 12, id.y()), tmp1, 4);
- tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 13, id.y()), tmp1, 5);
- tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 14, id.y()), tmp1, 6);
- tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 15, id.y()), tmp1, 7);
-
- vst1q_u8(dst_i.ptr(), vcombine_u8(tmp0, tmp1));
- },
- src_i, dst_i);
-}
-
-template <typename T>
-void CpuScaleKernel::scale_bilinear_qasymm(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window)
-{
- // Get data layout and width/height indices
- const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
-
- // Compute the ratio between source height and destination height
- const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(idx_height), dst->info()->dimension(idx_height), _align_corners);
- Window win_off;
- win_off.set(Window::DimX, Window::Dimension(0, 0, 0));
- win_off.set(Window::DimY, Window::Dimension(0, 0, 0));
-
- // Don't increment in X and Y direction for the input tensor
- // A pointer to the start of this plane is needed as base for the precomputed offsets
- Window win_in(window);
- win_in.set(idx_width, Window::Dimension(0, 0, 0));
- win_in.set(idx_height, Window::Dimension(0, 0, 0));
-
- for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d)
- {
- win_off.set(d, Window::Dimension(0, 0, 0));
- }
-
- Iterator src_i(src, win_in);
- Iterator dst_i(dst, window);
-
- const int32_t in_dim_w = src->info()->dimension(idx_width);
- const int32_t in_dim_h = src->info()->dimension(idx_height);
- const int32_t stride_w = src->info()->strides_in_bytes()[idx_width];
- const int32_t stride_h = src->info()->strides_in_bytes()[idx_height];
-
- const UniformQuantizationInfo iq_info = src->info()->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform();
-
- if(_border_mode == BorderMode::CONSTANT)
- {
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- using ConstType = typename std::conditional<std::is_same<T, float16_t>::value, half, T>::type;
-#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- using ConstType = T;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- const T const_border_value = static_cast<T>(_constant_border_value.get<ConstType>());
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const int32_t index_h = std::floor((id[idx_height] + _sampling_offset) * hr - _sampling_offset);
- const int32_t index_w = *(reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
- const auto dx_val = *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
- const auto dy_val = *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
- const auto pixel_row_ptr = reinterpret_cast<const T *>(src_i.ptr());
-
- const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h) ?
- (*(pixel_row_ptr + index_w * stride_w + index_h * stride_h)) :
- const_border_value;
- const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h) ?
- (*(pixel_row_ptr + (index_w + 1) * stride_w + index_h * stride_h)) :
- const_border_value;
- const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h && index_h < in_dim_h - 1) ?
- (*(pixel_row_ptr + index_w * stride_w + (index_h + 1) * stride_h)) :
- const_border_value;
- const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h && index_h < in_dim_h - 1) ?
- (*(pixel_row_ptr + (index_w + 1) * stride_w + (index_h + 1) * stride_h)) :
- const_border_value;
-
- const float inp00 = Qasymm8QuantizationHelper<T>::dequantize(a00, iq_info);
- const float inp01 = Qasymm8QuantizationHelper<T>::dequantize(a01, iq_info);
- const float inp10 = Qasymm8QuantizationHelper<T>::dequantize(a10, iq_info);
- const float inp11 = Qasymm8QuantizationHelper<T>::dequantize(a11, iq_info);
- *reinterpret_cast<T *>(dst_i.ptr()) = Qasymm8QuantizationHelper<T>::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info);
- },
- src_i, dst_i);
- }
- else if(_border_mode == BorderMode::REPLICATE)
- {
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const int index_h = std::floor((id[idx_height] + _sampling_offset) * hr - _sampling_offset);
- const int32_t index_w = *(reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
- const auto dx_val = *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
- const auto dy_val = *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
- const auto pixel_row_ptr = reinterpret_cast<const T *>(src_i.ptr());
-
- auto clamped_w = utility::clamp<int>(index_w, 0, in_dim_w - 1);
- auto clamped_w1 = utility::clamp<int>(index_w + 1, 0, in_dim_w - 1);
- auto clamped_h = utility::clamp<int>(index_h, 0, in_dim_h - 1);
- auto clamped_h1 = utility::clamp<int>(index_h + 1, 0, in_dim_h - 1);
-
- const auto a00 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h * stride_h);
- const auto a01 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h * stride_h);
- const auto a10 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h1 * stride_h);
- const auto a11 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h1 * stride_h);
-
- const float inp00 = Qasymm8QuantizationHelper<T>::dequantize(a00, iq_info);
- const float inp01 = Qasymm8QuantizationHelper<T>::dequantize(a01, iq_info);
- const float inp10 = Qasymm8QuantizationHelper<T>::dequantize(a10, iq_info);
- const float inp11 = Qasymm8QuantizationHelper<T>::dequantize(a11, iq_info);
- *reinterpret_cast<T *>(dst_i.ptr()) = Qasymm8QuantizationHelper<T>::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info);
- },
- src_i, dst_i);
- }
- else
- {
- ARM_COMPUTE_ERROR("Not implemented");
- }
-}
-#endif // ENABLE_NCHW_KERNELS
-
-Status CpuScaleKernel::validate(const ITensorInfo *src, const ITensorInfo *dx, const ITensorInfo *dy,
-                                const ITensorInfo *offsets, ITensorInfo *dst, const ScaleKernelInfo &info)
-{
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dx, dy, offsets, dst, info));
-    return Status{};
-}
-
-void CpuScaleKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
- ARM_COMPUTE_ERROR_ON(_func == nullptr && _data_layout == DataLayout::NCHW);
-
- const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
- auto dst = tensors.get_tensor(TensorType::ACL_DST);
- const auto dx = tensors.get_const_tensor(TensorType::ACL_INT_0);
- const auto dy = tensors.get_const_tensor(TensorType::ACL_INT_1);
- const auto offsets = tensors.get_const_tensor(TensorType::ACL_INT_2);
-
- if(_data_layout == DataLayout::NCHW)
- {
- (this->*_func)(src, dst, dx, dy, offsets, window);
- }
- else
- {
- const auto *uk = get_implementation(ScaleSelectorData{ src->info()->data_type() });
- uk->ukernel(src, dst, offsets, dx, dy, _policy, _border_mode, _constant_border_value, _sampling_offset, _align_corners, window);
- }
-}
-
-const char *CpuScaleKernel::name() const
-{
- return "CpuScaleKernel";
-}
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
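
The bilinear paths above delegate the 2x2 blend to scale_helpers::delta_bilinear. A standalone sketch of that blend, assuming the conventional bilinear weighting (a00/a01/a10/a11 are the top-left, top-right, bottom-left and bottom-right neighbours; dx and dy are the fractional offsets in [0, 1)):

    // Conventional bilinear interpolation of four neighbouring samples.
    static inline float delta_bilinear_sketch(float a00, float a01, float a10, float a11, float dx, float dy)
    {
        const float w00 = (1.f - dx) * (1.f - dy); // top-left weight
        const float w01 = dx * (1.f - dy);         // top-right weight
        const float w10 = (1.f - dx) * dy;         // bottom-left weight
        const float w11 = dx * dy;                 // bottom-right weight
        return a00 * w00 + a01 * w01 + a10 * w10 + a11 * w11;
    }

The quantized variant dequantizes the four neighbours first, blends in float, then requantizes the result, exactly as scale_bilinear_qasymm does above.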
diff --git a/src/core/cpu/kernels/CpuScaleKernel.h b/src/core/cpu/kernels/CpuScaleKernel.h
deleted file mode 100644
index 24790d16d7..0000000000
--- a/src/core/cpu/kernels/CpuScaleKernel.h
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_SCALEKERNEL_H
-#define ARM_COMPUTE_CPU_SCALEKERNEL_H
-
-#include "arm_compute/core/KernelDescriptors.h"
-#include "src/core/common/Macros.h"
-#include "src/core/cpu/ICpuKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-/** Arm(R) Neon(TM) kernel to perform scaling on a tensor */
-class CpuScaleKernel : public ICpuKernel
-{
-public:
- /** Default constructor */
- CpuScaleKernel();
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuScaleKernel);
- /** Initialise the kernel's inputs, output and interpolation policy
- *
-     * @note dx, dy and offsets have the same dimensions (width and height) as the output tensor.
-     * @note The InterpolationPolicy::AREA policy is supported only for the NCHW data layout and U8 input data type.
-     *
-     * @param[in]  src     Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/U8/S16/F16/F32.
-     * @param[in]  dx      Distance x tensor info. Distance between each pixel's real X coordinate and the nearest lower integer X coordinate. Data type supported: F32.
-     * @param[in]  dy      Distance y tensor info. Distance between each pixel's real Y coordinate and the nearest lower integer Y coordinate. Data type supported: F32.
-     * @param[in]  offsets Offset tensor info. Offset to access the pixel with NEAREST interpolation or the top-left pixel with BILINEAR interpolation in the input tensor. Data type supported: S32.
-     * @param[out] dst     Destination tensor info. Data types supported: Same as @p src. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
-     * @param[in]  info    @ref ScaleKernelInfo to use for configuration
- */
- void configure(const ITensorInfo *src, const ITensorInfo *dx, const ITensorInfo *dy, const ITensorInfo *offsets, ITensorInfo *dst,
- const ScaleKernelInfo &info);
- /** Static function to check if given info will lead to a valid configuration of @ref CpuScaleKernel
- *
-     * @note dx, dy and offsets have the same dimensions (width and height) as the output tensor.
-     * @note The InterpolationPolicy::AREA policy is supported only for the NCHW data layout and U8 input data type.
-     *
-     * @param[in] src     Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/U8/S16/F16/F32.
-     * @param[in] dx      Distance x tensor info. Distance between each pixel's real X coordinate and the nearest lower integer X coordinate. Data type supported: F32.
-     * @param[in] dy      Distance y tensor info. Distance between each pixel's real Y coordinate and the nearest lower integer Y coordinate. Data type supported: F32.
-     * @param[in] offsets Offset tensor info. Offset to access the pixel with NEAREST interpolation or the top-left pixel with BILINEAR interpolation in the input tensor. Data type supported: S32.
-     * @param[in] dst     Destination tensor info. Data types supported: Same as @p src. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
-     * @param[in] info    @ref ScaleKernelInfo to use for validation
-     *
-     * @return a status
-     */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dx, const ITensorInfo *dy, const ITensorInfo *offsets, ITensorInfo *dst,
- const ScaleKernelInfo &info);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
- const char *name() const override;
-
-private:
-#ifdef ENABLE_NCHW_KERNELS
- /** function to perform scale using area interpolation on the given window
- *
-     * @note Used only in the case of down-sampling.
- */
- void scale_area_nchw_u8(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window);
-
- /** function to perform scale using bilinear interpolation on the given window */
- template <typename T>
- void scale_bilinear_nchw(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window);
- /** function to perform scale using bilinear interpolation on the given window */
- template <typename T>
- void scale_bilinear_qasymm(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window);
-
- /** function to perform scale using nearest neighbour on the given window */
- template <typename T>
- void scale_nearest_nchw(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window);
-#endif // ENABLE_NCHW_KERNELS
-
- /** Scale function to use for the particular function to use */
- using ScaleFunctionPtr = void (CpuScaleKernel::*)(const ITensor *, ITensor *, const ITensor *, const ITensor *, const ITensor *, const Window &window);
-
- ScaleFunctionPtr _func;
- InterpolationPolicy _policy;
- BorderMode _border_mode;
- PixelValue _constant_border_value;
- float _sampling_offset;
- bool _align_corners;
- DataLayout _data_layout;
-};
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CPU_SCALEKERNEL_H */
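
configure() and the NCHW paths both map output coordinates back into the source through scale_utils::calculate_resize_ratio. A sketch of the likely computation, assuming the usual align_corners convention of making the corner pixels of source and destination coincide:

    // Ratio used to map an output coordinate back to a source coordinate.
    static inline float resize_ratio_sketch(unsigned int src_size, unsigned int dst_size, bool align_corners)
    {
        if(align_corners && dst_size > 1)
        {
            // Corner pixels coincide: step over (size - 1) intervals instead.
            return static_cast<float>(src_size - 1) / static_cast<float>(dst_size - 1);
        }
        return static_cast<float>(src_size) / static_cast<float>(dst_size);
    }

A ratio greater than 1 means down-sampling, which is why configure() compares wr and hr against 1.f to decide whether AREA falls back to nearest neighbour.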
diff --git a/src/core/cpu/kernels/CpuSoftmaxKernel.cpp b/src/core/cpu/kernels/CpuSoftmaxKernel.cpp
deleted file mode 100644
index 8ea186b16a..0000000000
--- a/src/core/cpu/kernels/CpuSoftmaxKernel.cpp
+++ /dev/null
@@ -1,389 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/cpu/kernels/CpuSoftmaxKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include "src/core/common/Registrars.h"
-#include "src/core/cpu/kernels/softmax/impl/neon/list.h"
-#include "src/core/cpu/kernels/softmax/impl/sve/list.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-namespace
-{
-struct SoftmaxSelectorData
-{
- DataType dt;
-};
-using SoftmaxSelectorPtr = std::add_pointer<bool(const SoftmaxSelectorData &data)>::type;
-using SoftmaxLogits1DMaxKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, const Window &)>::type;
-using SoftmaxLogits1DKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, void *const, ITensor *, float, bool, const Window &)>::type;
-
-struct SoftmaxLogits1DKernel
-{
- const char *name;
- const SoftmaxSelectorPtr is_selected;
- SoftmaxLogits1DKernelPtr ukernel;
-};
-
-struct SoftmaxLogits1DMaxKernel
-{
- const char *name;
- const SoftmaxSelectorPtr is_selected;
- SoftmaxLogits1DMaxKernelPtr ukernel;
-};
-
-static const SoftmaxLogits1DKernel available_logits_1d_kernels[] =
-{
-#if defined(ENABLE_SVE)
- {
- "sve_softmax_logits_1d_float",
- [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32); },
- REGISTER_FP32_SVE(arm_compute::cpu::sve_softmax_logits_1d_float<float>)
- },
- {
- "sve_softmax_logits_1d_float",
- [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F16); },
- REGISTER_FP16_SVE(arm_compute::cpu::sve_softmax_logits_1d_float<float16_t>)
- },
-#endif /* defined(ENABLE_SVE) */
-
-#if defined(ENABLE_NEON)
- {
- "neon_softmax_logits_1d_float",
- [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32); },
- REGISTER_FP32_NEON(arm_compute::cpu::neon_softmax_logits_1d_float<float>)
- },
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
- {
- "neon_softmax_logits_1d_float",
- [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F16); },
- REGISTER_FP16_NEON(arm_compute::cpu::neon_softmax_logits_1d_float<float16_t>)
- },
-#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
-#endif /* defined(ENABLE_NEON) */
-
-#if defined(__ARM_FEATURE_SVE2)
- {
- "sve_softmax_logits_1d_quantized",
- [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8); },
- REGISTER_QASYMM8_SVE(arm_compute::cpu::sve_softmax_logits_1d_quantized<qasymm8_t>)
- },
- {
- "sve_softmax_logits_1d_quantized",
- [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); },
- REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::sve_softmax_logits_1d_quantized<qasymm8_signed_t>)
- },
-#else /* !defined(__ARM_FEATURE_SVE2) */
- {
- "neon_softmax_logits_1d_quantized",
- [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8); },
- REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_softmax_logits_1d_quantized<qasymm8_t>)
- },
- {
- "neon_softmax_logits_1d_quantized",
- [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); },
- REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_softmax_logits_1d_quantized<qasymm8_signed_t>)
- },
-#endif /* defined(__ARM_FEATURE_SVE2) */
-
-};
-
-static const SoftmaxLogits1DMaxKernel available_logits_1d_max_kernels[] =
-{
-#if defined(ENABLE_SVE)
- {
- "sve_logits_1d_max",
- [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32); },
- REGISTER_FP32_SVE(arm_compute::cpu::sve_logits_1d_max<float>)
- },
- {
- "sve_logits_1d_max",
- [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F16); },
- REGISTER_FP16_SVE(arm_compute::cpu::sve_logits_1d_max<float16_t>)
- },
- {
- "sve_logits_1d_max",
- [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8); },
- REGISTER_QASYMM8_SVE(arm_compute::cpu::sve_logits_1d_max<qasymm8_t>)
- },
- {
- "sve_logits_1d_max",
- [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); },
- REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::sve_logits_1d_max<qasymm8_signed_t>)
- },
-#endif /* defined(ENABLE_SVE) */
-#if defined(ENABLE_NEON)
- {
- "neon_logits_1d_max",
- [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32); },
- REGISTER_FP32_NEON(arm_compute::cpu::neon_logits_1d_max<float>)
- },
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
- {
- "neon_logits_1d_max",
- [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F16); },
- REGISTER_FP16_NEON(arm_compute::cpu::neon_logits_1d_max<float16_t>)
- },
-#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
- {
- "neon_logits_1d_max",
- [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8); },
- REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_logits_1d_max<qasymm8_t>)
- },
- {
- "neon_logits_1d_max",
- [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); },
- REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_logits_1d_max<qasymm8_signed_t>)
- },
-#endif /* defined(ENABLE_NEON) */
-};
-
-const SoftmaxLogits1DKernel *get_implementation_logits(const SoftmaxSelectorData &data)
-{
- for(const auto &uk : available_logits_1d_kernels)
- {
- if(uk.is_selected({ data.dt }))
- {
- return &uk;
- }
- }
- return nullptr;
-}
-
-const SoftmaxLogits1DMaxKernel *get_implementation_logits_max(const SoftmaxSelectorData &data)
-{
- for(const auto &uk : available_logits_1d_max_kernels)
- {
- if(uk.is_selected({ data.dt }))
- {
- return &uk;
- }
- }
- return nullptr;
-}
-
-Status validate_arguments_logits_1d_max(const ITensorInfo &input, const ITensorInfo &output)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
-
- // Validate in case of configured output
- if(output.total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&input, &output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output.tensor_shape(), TensorShape(input.tensor_shape()).set(0, 1));
- }
-
- return Status{};
-}
-
-} // namespace
-
-CpuLogits1DMaxKernel::CpuLogits1DMaxKernel()
-{
-}
-
-void CpuLogits1DMaxKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
-
- // Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_1d_max(*src, *dst));
-
- // Softmax across the x dimension
- const TensorShape output_shape = TensorShape(src->tensor_shape()).set(0, 1);
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*dst, output_shape, 1, src->data_type(), src->quantization_info());
-
- Window win = calculate_max_window(*src, Steps());
-
- ICpuKernel::configure(win);
-}
-
-Status CpuLogits1DMaxKernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_1d_max(*src, *dst));
-
- return Status{};
-}
-
-void CpuLogits1DMaxKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
-
- const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
- auto dst = tensors.get_tensor(TensorType::ACL_DST);
-
- const auto *uk = get_implementation_logits_max(SoftmaxSelectorData{ src->info()->data_type() });
- uk->ukernel(src, dst, window);
-}
-
-const char *CpuLogits1DMaxKernel::name() const
-{
- return "CpuLogits1DMaxKernel";
-}
-
-namespace
-{
-Status validate_arguments_logits_softmax(const ITensorInfo &src, const ITensorInfo &max,
- const ITensorInfo &dst, const float beta, const ITensorInfo &tmp, bool is_log)
-{
- ARM_COMPUTE_UNUSED(beta);
- // Check input
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
-
- const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src.data_type());
-
- // Check max
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &max);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(TensorShape(src.tensor_shape()).set(0, 1), max.tensor_shape());
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&src, &max);
-
- // Check output if configured
- if(dst.total_size() != 0)
- {
- const QuantizationInfo output_quantization = is_quantized_asymmetric ? arm_compute::get_softmax_output_quantization_info(src.data_type(), is_log) : dst.quantization_info();
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &dst);
- ARM_COMPUTE_RETURN_ERROR_ON(dst.quantization_info() != output_quantization);
- }
-
- // Check tmp if configured
- if(tmp.total_size() != 0)
- {
- const DataType tmp_data_type = is_quantized_asymmetric ? DataType::F32 : src.data_type();
- ARM_COMPUTE_RETURN_ERROR_ON(tmp.data_type() != tmp_data_type);
- // We could potentially reduce tmp memory if we could predict or make an assumption
- // on the maximum number of threads that will run in parallel.
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &tmp);
- }
-
- return Status{};
-}
-} // namespace
-
-template <bool IS_LOG>
-CpuLogits1DSoftmaxKernel<IS_LOG>::CpuLogits1DSoftmaxKernel()
- : _beta(1.0f)
-{
-}
-
-template <bool IS_LOG>
-void CpuLogits1DSoftmaxKernel<IS_LOG>::configure(const ITensorInfo *src, const ITensorInfo *max, ITensorInfo *dst, const float beta, ITensorInfo *tmp)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(src, max, dst, tmp);
- // Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_softmax(*src, *max, *dst, beta, *tmp, IS_LOG));
-
- _beta = beta;
-
- // Configure kernel window
- const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src->data_type());
-
- // Output auto initialization if not yet initialized
- const QuantizationInfo output_quantization = is_quantized_asymmetric ? arm_compute::get_softmax_output_quantization_info(src->data_type(), IS_LOG) : dst->quantization_info();
- auto_init_if_empty(*dst, TensorInfo(*src).set_quantization_info(output_quantization).reset_padding());
-
- // Tmp auto initialization if not yet initialized
- const DataType tmp_data_type = is_quantized_asymmetric ? DataType::F32 : src->data_type();
- auto_init_if_empty(*tmp, TensorInfo(*src).set_data_type(tmp_data_type).reset_padding());
-
- // Configure kernel window
- Window win = calculate_max_window(*max, Steps());
-
- ICpuKernel::configure(win);
-}
-
-template <bool IS_LOG>
-Status CpuLogits1DSoftmaxKernel<IS_LOG>::validate(const ITensorInfo *src, const ITensorInfo *max,
- const ITensorInfo *dst, const float beta, const ITensorInfo *tmp)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, max, dst, tmp);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_softmax(*src, *max, *dst, beta, *tmp, IS_LOG));
-
- return Status{};
-}
-
-template <bool IS_LOG>
-void CpuLogits1DSoftmaxKernel<IS_LOG>::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
-
- const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
- auto max = tensors.get_tensor(TensorType::ACL_SRC_1);
- auto dst = tensors.get_tensor(TensorType::ACL_DST_0);
- auto tmp = tensors.get_tensor(TensorType::ACL_DST_1);
-
- const unsigned int num_elems_processed_per_iteration = src->info()->valid_region().shape.x();
- const unsigned int tmp_size_for_thread = tmp->info()->element_size() * num_elems_processed_per_iteration;
-
- ARM_COMPUTE_ERROR_ON(tmp->info()->total_size() < (info.num_threads * tmp_size_for_thread));
-
- void *tmp_for_thread = tmp->buffer() + (info.thread_id * tmp_size_for_thread);
-
- const auto *uk = get_implementation_logits(SoftmaxSelectorData{ src->info()->data_type() });
- uk->ukernel(src, max, tmp_for_thread, dst, _beta, IS_LOG, window);
-}
-
-template <bool IS_LOG>
-const char *CpuLogits1DSoftmaxKernel<IS_LOG>::name() const
-{
- if(IS_LOG)
- {
- return "CpuLogits1DSoftmaxKernel";
- }
- else
- {
- return "CpuLogits1DLogSoftmaxKernel";
- }
-}
-
-template class CpuLogits1DSoftmaxKernel<true>;
-template class CpuLogits1DSoftmaxKernel<false>;
-
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
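
The two kernels above split softmax into a max pass and a normalize pass. A scalar sketch of the combined per-row computation, assuming the usual beta-scaled formulation — subtracting the row max keeps every exponent non-positive, so exp() cannot overflow:

    #include <algorithm>
    #include <cmath>
    #include <cstddef>

    // One row of (log-)softmax; 'len' must be at least 1.
    static void softmax_row_sketch(const float *src, float *dst, std::size_t len, float beta, bool is_log)
    {
        // Pass 1: the row maximum (what CpuLogits1DMaxKernel produces).
        float max_val = src[0];
        for(std::size_t i = 1; i < len; ++i)
        {
            max_val = std::max(max_val, src[i]);
        }

        // Pass 2: shifted, scaled logits and their exponential sum.
        float sum = 0.f;
        for(std::size_t i = 0; i < len; ++i)
        {
            dst[i] = (src[i] - max_val) * beta;
            sum += std::exp(dst[i]);
        }

        // Pass 3: normalize, or shift by log(sum) for the IS_LOG variant.
        for(std::size_t i = 0; i < len; ++i)
        {
            dst[i] = is_log ? dst[i] - std::log(sum) : std::exp(dst[i]) / sum;
        }
    }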
diff --git a/src/core/cpu/kernels/CpuSoftmaxKernel.h b/src/core/cpu/kernels/CpuSoftmaxKernel.h
deleted file mode 100644
index aa10467965..0000000000
--- a/src/core/cpu/kernels/CpuSoftmaxKernel.h
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_SOFTMAXKERNEL_H
-#define ARM_COMPUTE_CPU_SOFTMAXKERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/cpu/ICpuKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-/** Interface for identifying the max value of 1D logits */
-class CpuLogits1DMaxKernel : public ICpuKernel
-{
-public:
- /** Constructor */
- CpuLogits1DMaxKernel();
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuLogits1DMaxKernel);
- /** Set the input and output tensors.
- *
- * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[out] dst Destination tensor info. Data types supported: same as @p src
- */
- void configure(const ITensorInfo *src, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration of @ref CpuLogits1DMaxKernel
- *
- * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[in] dst Destination tensor info. Data types supported: same as @p src
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
- const char *name() const override;
-};
-
-/** Interface for the softmax computation with a pre-computed max value. */
-template <bool IS_LOG = false>
-class CpuLogits1DSoftmaxKernel : public ICpuKernel
-{
-public:
- /** Default constructor */
- CpuLogits1DSoftmaxKernel();
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuLogits1DSoftmaxKernel);
-
- /** Set the input and output tensors.
- *
- * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[in]  max  Max values tensor info. Same shape as input with dimension 0 set to 1.
-     *                  Data types supported: same as @p src.
-     * @param[out] dst  Destination tensor info. Data types supported: same as @p src.
-     * @param[in]  beta A scaling factor for the exponent.
-     * @param[in]  tmp  Auxiliary tensor info. Must be of type F32 and have the same shape as the input.
- */
- void configure(const ITensorInfo *src, const ITensorInfo *max, ITensorInfo *dst, const float beta, ITensorInfo *tmp);
- /** Static function to check if given info will lead to a valid configuration of @ref CpuLogits1DSoftmaxKernel
- *
- * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] max  Max values tensor info. Same shape as input with dimension 0 set to 1.
- *                 Data types supported: same as @p src.
- * @param[in] dst  Destination tensor info. Data types supported: same as @p src.
- * @param[in] beta A scaling factor for the exponent.
- * @param[in] tmp  Auxiliary tensor info. Must be of type F32 and have the same shape as the input.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *max,
- const ITensorInfo *dst, const float beta, const ITensorInfo *tmp);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
- const char *name() const override;
-
-private:
- float _beta;
-};
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_SOFTMAXKERNEL_H */
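For context, a minimal scalar sketch (illustrative only, not the library's implementation) of the two-stage computation these kernels split between them: CpuLogits1DMaxKernel produces the per-row max, and CpuLogits1DSoftmaxKernel exponentiates beta * (x - max) and normalizes, with IS_LOG selecting log-softmax:

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// Illustrative scalar reference: softmax over one 1D row of logits (x assumed non-empty).
std::vector<float> softmax_row(const std::vector<float> &x, float beta, bool is_log)
{
    const float max_val = *std::max_element(x.begin(), x.end()); // stage 1: CpuLogits1DMaxKernel
    std::vector<float> y(x.size());
    float sum = 0.f;
    for(std::size_t i = 0; i < x.size(); ++i)
    {
        y[i] = std::exp(beta * (x[i] - max_val)); // stage 2: numerically stable exponent
        sum += y[i];
    }
    for(auto &v : y)
    {
        v = is_log ? (std::log(v) - std::log(sum)) : (v / sum);
    }
    return y;
}

Subtracting the row max before exponentiating is what keeps std::exp from overflowing, which is why the max is computed as a separate first pass.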
diff --git a/src/core/cpu/kernels/CpuSubKernel.cpp b/src/core/cpu/kernels/CpuSubKernel.cpp
deleted file mode 100644
index d7057bbe2b..0000000000
--- a/src/core/cpu/kernels/CpuSubKernel.cpp
+++ /dev/null
@@ -1,246 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/cpu/kernels/CpuSubKernel.h"
-
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/common/Registrars.h"
-#include "src/core/cpu/kernels/sub/neon/list.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-namespace
-{
-struct SubSelectorData
-{
- DataType dt1;
- DataType dt2;
- DataType dt3;
-};
-
-using SubSelectorPtr = std::add_pointer<bool(const SubSelectorData &data)>::type;
-using SubKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, ITensor *, const ConvertPolicy &, const Window &)>::type;
-
-struct SubKernel
-{
- const char *name;
- const SubSelectorPtr is_selected;
- SubKernelPtr ukernel;
-};
-
-static const SubKernel available_kernels[] =
-{
- {
- "sub_same_neon",
- [](const SubSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::F32)); },
- REGISTER_FP32_NEON(arm_compute::cpu::sub_same_neon<float>)
- },
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
- {
- "sub_same_neon",
- [](const SubSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::F16)); },
- REGISTER_FP16_NEON(arm_compute::cpu::sub_same_neon<float16_t>)
- },
-#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
- {
- "sub_same_neon",
- [](const SubSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == data.dt3) && (data.dt1 == DataType::U8)); },
- REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon<uint8_t>)
- },
- {
- "sub_same_neon",
- [](const SubSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == data.dt3) && (data.dt1 == DataType::S16)); },
- REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon<int16_t>)
- },
- {
- "sub_same_neon",
- [](const SubSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == data.dt3) && (data.dt1 == DataType::S32)); },
- REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon<int32_t>)
- },
- {
- "sub_u8_s16_s16_neon",
- [](const SubSelectorData & data) { return ((data.dt1 == DataType::U8) && (data.dt2 == DataType::S16)); },
- REGISTER_INTEGER_NEON(arm_compute::cpu::sub_u8_s16_s16_neon)
- },
- {
- "sub_s16_u8_s16_neon",
- [](const SubSelectorData & data) { return ((data.dt1 == DataType::S16) && (data.dt2 == DataType::U8)); },
- REGISTER_INTEGER_NEON(arm_compute::cpu::sub_s16_u8_s16_neon)
- },
- {
- "sub_u8_u8_s16_neon",
-        [](const SubSelectorData & data) { return ((data.dt1 == DataType::U8) && (data.dt2 == DataType::U8) && (data.dt3 == DataType::S16)); },
- REGISTER_INTEGER_NEON(arm_compute::cpu::sub_u8_u8_s16_neon)
- },
- {
- "sub_qasymm8_neon",
- [](const SubSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QASYMM8)); },
- REGISTER_QASYMM8_NEON(arm_compute::cpu::sub_qasymm8_neon)
- },
- {
- "sub_qasymm8_signed_neon",
- [](const SubSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QASYMM8_SIGNED)); },
- REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::sub_qasymm8_signed_neon)
- },
- {
- "sub_qsymm16_neon",
- [](const SubSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QSYMM16)); },
- REGISTER_QSYMM16_NEON(arm_compute::cpu::sub_qsymm16_neon)
- },
-};
-
-/** Micro-kernel selector
- *
- * @param[in] dt1 Data type of the first source tensor
- * @param[in] dt2 Data type of the second source tensor
- * @param[in] dt3 Data type of the destination tensor
- *
- * @return A matching micro-kernel else nullptr
- */
-const SubKernel *get_implementation(DataType dt1, DataType dt2, DataType dt3)
-{
- for(const auto &uk : available_kernels)
- {
- if(uk.is_selected({ dt1, dt2, dt3 }))
- {
- return &uk;
- }
- }
- return nullptr;
-}
-
-inline Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst, ConvertPolicy policy)
-{
- ARM_COMPUTE_UNUSED(policy);
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src0);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::S16, DataType::S32, DataType::F16,
- DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::S16, DataType::S32, DataType::F16,
- DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&dst, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::S16, DataType::S32, DataType::F16,
- DataType::F32);
-
- const auto *uk = get_implementation(src0.data_type(), src1.data_type(), dst.data_type());
- ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
-
- const TensorShape out_shape = TensorShape::broadcast_shape(src0.tensor_shape(), src1.tensor_shape());
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
-
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-        !(src0.data_type() == DataType::U8 && src1.data_type() == DataType::U8)
-        && !(src0.data_type() == DataType::QASYMM8 && src1.data_type() == DataType::QASYMM8)
-        && !(src0.data_type() == DataType::QASYMM8_SIGNED && src1.data_type() == DataType::QASYMM8_SIGNED)
-        && !(src0.data_type() == DataType::QSYMM16 && src1.data_type() == DataType::QSYMM16)
-        && !(src0.data_type() == DataType::U8 && src1.data_type() == DataType::S16)
-        && !(src0.data_type() == DataType::S16 && src1.data_type() == DataType::U8)
-        && !(src0.data_type() == DataType::S16 && src1.data_type() == DataType::S16)
-        && !(src0.data_type() == DataType::S32 && src1.data_type() == DataType::S32)
-        && !(src0.data_type() == DataType::F32 && src1.data_type() == DataType::F32)
-        && !(src0.data_type() == DataType::F16 && src1.data_type() == DataType::F16),
-        "Invalid data type combination for subtraction");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(
- (src0.data_type() == DataType::QASYMM8_SIGNED && src1.data_type() == DataType::QASYMM8_SIGNED && policy == ConvertPolicy::WRAP)
- || (src0.data_type() == DataType::QASYMM8 && src1.data_type() == DataType::QASYMM8 && policy == ConvertPolicy::WRAP)
- || (src0.data_type() == DataType::QSYMM16 && src1.data_type() == DataType::QSYMM16 && policy == ConvertPolicy::WRAP),
- "Convert policy cannot be WRAP if datatype is quantized");
-
- // Validate in case of configured dst
- if(dst.total_size() > 0)
- {
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-            !(src0.data_type() == DataType::U8 && src1.data_type() == DataType::U8 && dst.data_type() == DataType::U8)
-            && !(src0.data_type() == DataType::QASYMM8 && src1.data_type() == DataType::QASYMM8 && dst.data_type() == DataType::QASYMM8)
-            && !(src0.data_type() == DataType::QASYMM8_SIGNED && src1.data_type() == DataType::QASYMM8_SIGNED && dst.data_type() == DataType::QASYMM8_SIGNED)
-            && !(src0.data_type() == DataType::QSYMM16 && src1.data_type() == DataType::QSYMM16 && dst.data_type() == DataType::QSYMM16)
-            && !(src0.data_type() == DataType::U8 && src1.data_type() == DataType::U8 && dst.data_type() == DataType::S16)
-            && !(src0.data_type() == DataType::U8 && src1.data_type() == DataType::S16 && dst.data_type() == DataType::S16)
-            && !(src0.data_type() == DataType::S16 && src1.data_type() == DataType::U8 && dst.data_type() == DataType::S16)
-            && !(src0.data_type() == DataType::S16 && src1.data_type() == DataType::S16 && dst.data_type() == DataType::S16)
-            && !(src0.data_type() == DataType::S32 && src1.data_type() == DataType::S32 && dst.data_type() == DataType::S32)
-            && !(src0.data_type() == DataType::F32 && src1.data_type() == DataType::F32 && dst.data_type() == DataType::F32)
-            && !(src0.data_type() == DataType::F16 && src1.data_type() == DataType::F16 && dst.data_type() == DataType::F16),
-            "Invalid data type combination for subtraction");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0),
- "Wrong shape for dst");
- }
- return Status{};
-}
-} // namespace
-
-void CpuSubKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst, policy));
-
- const TensorShape &out_shape = TensorShape::broadcast_shape(src0->tensor_shape(), src1->tensor_shape());
-
- // Auto initialize dst if not initialized
- set_shape_if_empty(*dst, out_shape);
-
- _policy = policy;
-
- // CpuSubKernel doesn't need padding so update_window_and_padding() can be skipped
- Window win = calculate_max_window(out_shape, Steps());
-
- ICpuKernel::configure(win);
-}
-
-Status CpuSubKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*src0, *src1, *dst, policy));
-
- return Status{};
-}
-
-void CpuSubKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
-
- const ITensor *src0 = tensors.get_const_tensor(TensorType::ACL_SRC_0);
- const ITensor *src1 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
- ITensor *dst = tensors.get_tensor(TensorType::ACL_DST);
-
-    // Dispatch the pre-selected micro-kernel
-    const auto *uk = get_implementation(src0->info()->data_type(), src1->info()->data_type(), dst->info()->data_type());
-    ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
-    uk->ukernel(src0, src1, dst, _policy, window);
-}
-
-const char *CpuSubKernel::name() const
-{
- return "CpuSubKernel";
-}
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
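The available_kernels table above is a table-driven dispatch idiom: each entry pairs a name, a predicate over the operand data types, and a micro-kernel function pointer, and get_implementation returns the first matching entry. A dependency-free sketch of the same pattern (all names and types below are illustrative, not the library's):

#include <cstdio>

enum class DT { U8, S16, F32 };

struct Sel
{
    DT dt1, dt2, dt3;
};

struct Kernel
{
    const char *name;
    bool (*is_selected)(const Sel &);
    void (*ukernel)();
};

void sub_f32() { std::puts("same-type f32 path"); }
void sub_u8_u8_s16() { std::puts("widening u8 -> s16 path"); }

static const Kernel available[] =
{
    { "sub_same_f32", [](const Sel &s) { return s.dt1 == DT::F32 && s.dt2 == DT::F32 && s.dt3 == DT::F32; }, sub_f32 },
    { "sub_u8_u8_s16", [](const Sel &s) { return s.dt1 == DT::U8 && s.dt2 == DT::U8 && s.dt3 == DT::S16; }, sub_u8_u8_s16 },
};

const Kernel *pick(DT a, DT b, DT c)
{
    for(const auto &k : available)
    {
        if(k.is_selected({ a, b, c }))
        {
            return &k; // first match wins, so order entries from most to least specific
        }
    }
    return nullptr; // no kernel: validate_arguments() turns this into an error status
}

int main()
{
    if(const Kernel *k = pick(DT::U8, DT::U8, DT::S16))
    {
        k->ukernel();
    }
}

Keeping the predicates as data rather than a switch makes it cheap to register ISA-specific variants (NEON, SVE) conditionally, as the REGISTER_*_NEON macros do above.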
diff --git a/src/core/cpu/kernels/CpuSubKernel.h b/src/core/cpu/kernels/CpuSubKernel.h
deleted file mode 100644
index da114b6e08..0000000000
--- a/src/core/cpu/kernels/CpuSubKernel.h
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_SUB_KERNEL_H
-#define ARM_COMPUTE_CPU_SUB_KERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/cpu/ICpuKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-/** Interface for the kernel to perform subtraction between two tensors */
-class CpuSubKernel : public ICpuKernel
-{
-public:
- CpuSubKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuSubKernel);
-
- /** Initialise the kernel's src and dst.
- *
- * Valid configurations (src0,src1) -> dst :
- *
- * - (U8,U8) -> U8
- * - (U8,U8) -> S16
- * - (QASYMM8, QASYMM8) -> QASYMM8
- * - (QASYMM8_SIGNED, QASYMM8_SIGNED) -> QASYMM8_SIGNED
- * - (S16,U8) -> S16
- * - (U8,S16) -> S16
- * - (S16,S16) -> S16
- * - (S32,S32) -> S32
- * - (F16,F16) -> F16
- * - (F32,F32) -> F32
- *
- * @param[in] src0 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32
- * @param[in] src1 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32
- * @param[out] dst The dst tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32.
- * @param[in] policy Overflow policy. Convert policy cannot be WRAP if datatype is quantized.
- */
- void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy);
- /** Static function to check if given info will lead to a valid configuration of @ref CpuSubKernel
- *
- * Valid configurations (src0,src1) -> dst :
- *
- * - (U8,U8) -> U8
- * - (U8,U8) -> S16
- * - (QASYMM8, QASYMM8) -> QASYMM8
- * - (QASYMM8_SIGNED, QASYMM8_SIGNED) -> QASYMM8_SIGNED
- * - (S16,U8) -> S16
- * - (U8,S16) -> S16
- * - (S16,S16) -> S16
- * - (S32,S32) -> S32
- * - (F16,F16) -> F16
- * - (F32,F32) -> F32
- *
- * @param[in] src0 An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32
- * @param[in] src1 An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32
- * @param[in] dst The dst tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32.
- * @param[in] policy Policy to use to handle overflow. Convert policy cannot be WRAP if datatype is quantized.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
- const char *name() const override;
-
-private:
- ConvertPolicy _policy{};
-};
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_SUB_KERNEL_H */
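A hedged usage sketch of the interface above, assuming the internal kernel header is visible (it lives under src/, not the public include tree) and single-threaded dispatch; shapes and values are illustrative:

#include "arm_compute/core/Error.h"
#include "arm_compute/runtime/Tensor.h"
#include "src/core/cpu/kernels/CpuSubKernel.h"

using namespace arm_compute;

void run_sub_example()
{
    // Describe the operands first; kernels are configured against infos, not tensors.
    TensorInfo a_info(TensorShape(16U, 4U), 1, DataType::F32);
    TensorInfo b_info(TensorShape(16U, 4U), 1, DataType::F32);
    TensorInfo dst_info(TensorShape(16U, 4U), 1, DataType::F32);

    cpu::kernels::CpuSubKernel k;
    ARM_COMPUTE_ERROR_THROW_ON(cpu::kernels::CpuSubKernel::validate(&a_info, &b_info, &dst_info, ConvertPolicy::SATURATE));
    k.configure(&a_info, &b_info, &dst_info, ConvertPolicy::SATURATE);

    // Back the infos with real memory.
    Tensor a, b, dst;
    a.allocator()->init(a_info);
    b.allocator()->init(b_info);
    dst.allocator()->init(dst_info);
    a.allocator()->allocate();
    b.allocator()->allocate();
    dst.allocator()->allocate();

    // Tensors are bound late, at run time, through an ITensorPack.
    ITensorPack pack;
    pack.add_tensor(TensorType::ACL_SRC_0, &a);
    pack.add_tensor(TensorType::ACL_SRC_1, &b);
    pack.add_tensor(TensorType::ACL_DST, &dst);
    k.run_op(pack, k.window(), ThreadInfo{});
}

Operator-level classes normally drive this flow through the scheduler, which splits the window across threads; calling run_op directly with the kernel's own window is the single-threaded special case.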
diff --git a/src/core/cpu/kernels/CpuTransposeKernel.cpp b/src/core/cpu/kernels/CpuTransposeKernel.cpp
deleted file mode 100644
index c7cafe94a8..0000000000
--- a/src/core/cpu/kernels/CpuTransposeKernel.cpp
+++ /dev/null
@@ -1,510 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/cpu/kernels/CpuTransposeKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-namespace
-{
-unsigned int num_elems_processed(size_t element_size)
-{
- switch(element_size)
- {
- case 1:
- return 8;
- case 2:
- case 4:
- return 4;
- default:
- break;
- }
-
- ARM_COMPUTE_ERROR("Element size not supported");
-}
-
-void transpose_8bit_elements(const ITensor *in, ITensor *out, const Window &window)
-{
- const int window_step_x = 8;
- const int window_step_y = 8;
- const int window_start_x = window.x().start();
- const int window_end_x = window.x().end();
- const int window_start_y = window.y().start();
- const int window_end_y = std::min(window.y().end(), static_cast<int>(in->info()->dimension(1)));
- const int window_end_y_multiple_of = ((window_end_y - window_start_y) / window_step_y) * window_step_y;
- const size_t input_stride_in_bytes = in->info()->strides_in_bytes()[1];
- const size_t output_stride_in_bytes = out->info()->strides_in_bytes()[1];
-
- // Check if we need a left-over loop for the y dimension
- bool left_over_loop_y = (((window_end_y - window_start_y) % window_step_y) != 0);
-
- Window window_in(window);
- window_in.set(Window::DimX, Window::Dimension(0, 1, 1));
- if(left_over_loop_y)
- {
- // Check if window_end_y_multiple_of is greater than window_start_y
- if(window_end_y_multiple_of > window_start_y)
- {
- window_in.set(Window::DimY, Window::Dimension(window_start_y, window_end_y_multiple_of, window_step_y));
- }
- else
- {
- window_in.set(Window::DimY, Window::Dimension(0, 0, 1));
- }
- }
-
- Window window_out(window);
- window_out.set(Window::DimX, Window::Dimension(0, 0, 0));
- window_out.set(Window::DimY, Window::Dimension(0, 0, 0));
-
- Iterator output(out, window_out);
-
- // Run the SIMD path if and only if the input is not a row-vector
- if(in->info()->dimension(1) != 1)
- {
- Iterator input(in, window_in);
- execute_window_loop(window_in, [&](const Coordinates & id)
- {
- // Compute 8x8 elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const uint8x8_t row0 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 0 * input_stride_in_bytes));
- const uint8x8_t row1 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 1 * input_stride_in_bytes));
- const uint8x8_t row2 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 2 * input_stride_in_bytes));
- const uint8x8_t row3 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 3 * input_stride_in_bytes));
- const uint8x8_t row4 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 4 * input_stride_in_bytes));
- const uint8x8_t row5 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 5 * input_stride_in_bytes));
- const uint8x8_t row6 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 6 * input_stride_in_bytes));
- const uint8x8_t row7 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 7 * input_stride_in_bytes));
-
- // Transpose 2x2
- const uint8x8x2_t k0_u8 = vtrn_u8(row0, row1);
- const uint8x8x2_t k1_u8 = vtrn_u8(row2, row3);
- const uint8x8x2_t k2_u8 = vtrn_u8(row4, row5);
- const uint8x8x2_t k3_u8 = vtrn_u8(row6, row7);
-
- // Transpose 4x4
- const uint16x4x2_t k0_u16 = vtrn_u16(vreinterpret_u16_u8(k0_u8.val[0]), vreinterpret_u16_u8(k1_u8.val[0]));
- const uint16x4x2_t k1_u16 = vtrn_u16(vreinterpret_u16_u8(k0_u8.val[1]), vreinterpret_u16_u8(k1_u8.val[1]));
- const uint16x4x2_t k2_u16 = vtrn_u16(vreinterpret_u16_u8(k2_u8.val[0]), vreinterpret_u16_u8(k3_u8.val[0]));
- const uint16x4x2_t k3_u16 = vtrn_u16(vreinterpret_u16_u8(k2_u8.val[1]), vreinterpret_u16_u8(k3_u8.val[1]));
-
- // Transpose 8x8
- const uint32x2x2_t k0_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[0]), vreinterpret_u32_u16(k2_u16.val[0]));
- const uint32x2x2_t k1_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[1]), vreinterpret_u32_u16(k2_u16.val[1]));
- const uint32x2x2_t k2_u32 = vtrn_u32(vreinterpret_u32_u16(k1_u16.val[0]), vreinterpret_u32_u16(k3_u16.val[0]));
- const uint32x2x2_t k3_u32 = vtrn_u32(vreinterpret_u32_u16(k1_u16.val[1]), vreinterpret_u32_u16(k3_u16.val[1]));
-
- // Compute destination address
- const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + x * output_stride_in_bytes;
-
- vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k0_u32.val[0])));
- vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k2_u32.val[0])));
- vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k1_u32.val[0])));
- vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k3_u32.val[0])));
- vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 4 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k0_u32.val[1])));
- vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 5 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k2_u32.val[1])));
- vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 6 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k1_u32.val[1])));
- vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 7 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k3_u32.val[1])));
- }
-
- // Compute left-over elements along the x dimension (1x8)
- for(; x < window_end_x; ++x)
- {
- const uint8_t val0 = *(input.ptr() + x + 0 * input_stride_in_bytes);
- const uint8_t val1 = *(input.ptr() + x + 1 * input_stride_in_bytes);
- const uint8_t val2 = *(input.ptr() + x + 2 * input_stride_in_bytes);
- const uint8_t val3 = *(input.ptr() + x + 3 * input_stride_in_bytes);
- const uint8_t val4 = *(input.ptr() + x + 4 * input_stride_in_bytes);
- const uint8_t val5 = *(input.ptr() + x + 5 * input_stride_in_bytes);
- const uint8_t val6 = *(input.ptr() + x + 6 * input_stride_in_bytes);
- const uint8_t val7 = *(input.ptr() + x + 7 * input_stride_in_bytes);
-
- uint8x8_t result = vdup_n_u8(0);
- result = vset_lane_u8(val0, result, 0);
- result = vset_lane_u8(val1, result, 1);
- result = vset_lane_u8(val2, result, 2);
- result = vset_lane_u8(val3, result, 3);
- result = vset_lane_u8(val4, result, 4);
- result = vset_lane_u8(val5, result, 5);
- result = vset_lane_u8(val6, result, 6);
- result = vset_lane_u8(val7, result, 7);
-
- // Compute destination address
- const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + x * output_stride_in_bytes;
-
- vst1_u8(output.ptr() + dst_offset_in_bytes, result);
- }
- },
- input, output);
- }
-
- if(left_over_loop_y)
- {
- window_in.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), 1));
- window_in.set(Window::DimY, Window::Dimension(window_end_y_multiple_of, window_end_y, 1));
-
- Iterator input(in, window_in);
- Iterator output(out, window_out);
-
- // Compute left-over elements along the y dimension (1x1)
- execute_window_loop(window_in, [&](const Coordinates & id)
- {
- const uint8_t val0 = *input.ptr();
-
- // Compute destination address
- const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + id.x() * output_stride_in_bytes;
-
- *(output.ptr() + dst_offset_in_bytes) = val0;
- },
- input, output);
- }
-}
-
-void transpose_16bit_elements(const ITensor *in, ITensor *out, const Window &window)
-{
- const int window_step_x = 4;
- const int window_step_y = 4;
- const int window_start_x = window.x().start();
- const int window_end_x = window.x().end();
- const int window_start_y = window.y().start();
- const int window_end_y = std::min(window.y().end(), static_cast<int>(in->info()->dimension(1)));
- const int window_end_y_multiple_of = ((window_end_y - window_start_y) / window_step_y) * window_step_y;
- const size_t input_stride_in_bytes = in->info()->strides_in_bytes()[1];
- const size_t output_stride_in_bytes = out->info()->strides_in_bytes()[1];
-
- // Check if we need a left-over loop for the y dimension
- bool left_over_loop_y = (((window_end_y - window_start_y) % window_step_y) != 0);
-
- Window window_in(window);
- window_in.set(Window::DimX, Window::Dimension(0, 1, 1));
- if(left_over_loop_y)
- {
- // Check if window_end_y_multiple_of is greater than window_start_y
- if(window_end_y_multiple_of > window_start_y)
- {
- window_in.set(Window::DimY, Window::Dimension(window_start_y, window_end_y_multiple_of, window_step_y));
- }
- else
- {
- window_in.set(Window::DimY, Window::Dimension(0, 0, 1));
- }
- }
-
- Window window_out(window);
- window_out.set(Window::DimX, Window::Dimension(0, 0, 0));
- window_out.set(Window::DimY, Window::Dimension(0, 0, 0));
-
- Iterator output(out, window_out);
-
- // Run the SIMD path if and only if the input is not a row-vector
- if(in->info()->dimension(1) != 1)
- {
- Iterator input(in, window_in);
- execute_window_loop(window_in, [&](const Coordinates & id)
- {
- // Compute 4x4 elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const uint16x4_t row0 = vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
- const uint16x4_t row1 = vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
- const uint16x4_t row2 = vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
- const uint16x4_t row3 = vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);
-
- // Transpose 2x2
- const uint16x4x2_t k0_u16 = vtrn_u16(row0, row1);
- const uint16x4x2_t k1_u16 = vtrn_u16(row2, row3);
-
- // Transpose 4x4
- const uint32x2x2_t k0_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[0]), vreinterpret_u32_u16(k1_u16.val[0]));
- const uint32x2x2_t k1_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[1]), vreinterpret_u32_u16(k1_u16.val[1]));
-
- // Compute destination address
- const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + x * output_stride_in_bytes;
-
- vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), vreinterpret_u16_u32(k0_u32.val[0]));
- vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), vreinterpret_u16_u32(k1_u32.val[0]));
- vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), vreinterpret_u16_u32(k0_u32.val[1]));
- vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), vreinterpret_u16_u32(k1_u32.val[1]));
- }
-
- // Compute left-over elements (1x4)
- for(; x < window_end_x; ++x)
- {
- const uint16_t val0 = *(reinterpret_cast<uint16_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
- const uint16_t val1 = *(reinterpret_cast<uint16_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
- const uint16_t val2 = *(reinterpret_cast<uint16_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
- const uint16_t val3 = *(reinterpret_cast<uint16_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);
-
- uint16x4_t result = vdup_n_u16(0);
- result = vset_lane_u16(val0, result, 0);
- result = vset_lane_u16(val1, result, 1);
- result = vset_lane_u16(val2, result, 2);
- result = vset_lane_u16(val3, result, 3);
-
- // Compute destination address
- const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + x * output_stride_in_bytes;
-
- vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes), result);
- }
- },
- input, output);
- }
-
- if(left_over_loop_y)
- {
- window_in.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), 1));
- window_in.set(Window::DimY, Window::Dimension(window_end_y_multiple_of, window_end_y, 1));
-
- Iterator input(in, window_in);
- Iterator output(out, window_out);
-
- // Compute left-over elements along the y dimension (1x1)
- execute_window_loop(window_in, [&](const Coordinates & id)
- {
- const uint16_t val0 = *(reinterpret_cast<uint16_t *>(input.ptr()));
-
- // Compute destination address
- const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + id.x() * output_stride_in_bytes;
-
- *(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes)) = val0;
- },
- input, output);
- }
-}
-
-void transpose_32bit_elements(const ITensor *in, ITensor *out, const Window &window)
-{
- const int window_step_x = 4;
- const int window_step_y = 4;
- const int window_start_x = window.x().start();
- const int window_end_x = window.x().end();
- const int window_start_y = window.y().start();
- const int window_end_y = std::min(window.y().end(), static_cast<int>(in->info()->dimension(1)));
- const int window_end_y_multiple_of = ((window_end_y - window_start_y) / window_step_y) * window_step_y;
- const size_t input_stride_in_bytes = in->info()->strides_in_bytes()[1];
- const size_t output_stride_in_bytes = out->info()->strides_in_bytes()[1];
-
- // Check if we need a left-over loop for the y dimension
- bool left_over_loop_y = (((window_end_y - window_start_y) % window_step_y) != 0);
-
- Window window_in(window);
- window_in.set(Window::DimX, Window::Dimension(0, 1, 1));
- if(left_over_loop_y)
- {
- // Check if window_end_y_multiple_of is greater than window_start_y
- if(window_end_y_multiple_of > window_start_y)
- {
- window_in.set(Window::DimY, Window::Dimension(window_start_y, window_end_y_multiple_of, window_step_y));
- }
- else
- {
- window_in.set(Window::DimY, Window::Dimension(0, 0, 1));
- }
- }
-
- Window window_out(window);
- window_out.set(Window::DimX, Window::Dimension(0, 0, 0));
- window_out.set(Window::DimY, Window::Dimension(0, 0, 0));
-
- Iterator output(out, window_out);
-
- // Run the SIMD path if and only if the input is not a row-vector
- if(in->info()->dimension(1) != 1)
- {
- Iterator input(in, window_in);
- execute_window_loop(window_in, [&](const Coordinates & id)
- {
- // Compute 4x4 elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const uint32x4_t row0 = vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
- const uint32x4_t row1 = vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
- const uint32x4_t row2 = vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
- const uint32x4_t row3 = vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);
-
- // Transpose 2x2
- const uint32x2x2_t k0_u32 = vtrn_u32(vget_low_u32(row0), vget_low_u32(row1));
- const uint32x2x2_t k1_u32 = vtrn_u32(vget_high_u32(row2), vget_high_u32(row3));
- const uint32x2x2_t k2_u32 = vtrn_u32(vget_high_u32(row0), vget_high_u32(row1));
- const uint32x2x2_t k3_u32 = vtrn_u32(vget_low_u32(row2), vget_low_u32(row3));
-
- // Compute destination address
- const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes;
-
- // Swap block 01 with block 10 and store
- vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), vcombine_u32(k0_u32.val[0], k3_u32.val[0]));
- vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), vcombine_u32(k0_u32.val[1], k3_u32.val[1]));
- vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), vcombine_u32(k2_u32.val[0], k1_u32.val[0]));
- vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), vcombine_u32(k2_u32.val[1], k1_u32.val[1]));
- }
-
- // Compute left-over elements (1x4)
- for(; x < window_end_x; ++x)
- {
- const uint32_t val0 = *(reinterpret_cast<uint32_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
- const uint32_t val1 = *(reinterpret_cast<uint32_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
- const uint32_t val2 = *(reinterpret_cast<uint32_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
- const uint32_t val3 = *(reinterpret_cast<uint32_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);
-
- uint32x4_t result = vdupq_n_u32(0);
- result = vsetq_lane_u32(val0, result, 0);
- result = vsetq_lane_u32(val1, result, 1);
- result = vsetq_lane_u32(val2, result, 2);
- result = vsetq_lane_u32(val3, result, 3);
-
- // Compute destination address
- const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes;
-
- vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes), result);
- }
- },
- input, output);
- }
-
- if(left_over_loop_y)
- {
- window_in.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), 1));
- window_in.set(Window::DimY, Window::Dimension(window_end_y_multiple_of, window_end_y, 1));
-
- Iterator input(in, window_in);
- Iterator output(out, window_out);
-
- // Compute left-over elements along the y dimension (1x1)
- execute_window_loop(window_in, [&](const Coordinates & id)
- {
- const uint32_t val0 = *(reinterpret_cast<uint32_t *>(input.ptr()));
-
- // Compute destination address
- const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + id.x() * output_stride_in_bytes;
-
- *(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes)) = val0;
- },
- input, output);
- }
-}
-} // namespace
-
-void CpuTransposeKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
-
-    // Destination auto initialization if not yet initialized
- const TensorShape dst_shape = misc::shape_calculator::compute_transposed_shape(*src);
- auto_init_if_empty(*dst, src->clone()->set_tensor_shape(dst_shape));
-
- // Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate(src, dst));
-
-    // Note: This kernel processes elements in square blocks (8x8 for 8-bit elements, 4x4 for 16-bit and 32-bit elements).
-    // However, since left-over loops handle the tails of both dimensions (X and Y), no out-of-bounds read or write can occur.
-    // For this reason num_elems_processed_per_iteration_x is set to 1
- const unsigned int num_elems_processed_per_iteration_x = 1;
- const unsigned int num_elems_processed_per_iteration_y = num_elems_processed(src->element_size());
-
- // Configure kernel window
- Window win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
-    // CpuTransposeKernel doesn't need padding, so update_window_and_padding() can be skipped
- Coordinates coord;
- coord.set_num_dimensions(dst->num_dimensions());
- dst->set_valid_region(ValidRegion(coord, dst->tensor_shape()));
-
- ICpuKernel::configure(win);
-}
-
-Status CpuTransposeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
-    // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions.
- ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
-
-    // Error if input is not 8-bit, 16-bit or 32-bit
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->element_size() != 1 && src->element_size() != 2 && src->element_size() != 4,
- "Element size not supported");
-
- // Validate configured destination
- if(dst->total_size() != 0)
- {
- const TensorShape dst_shape = misc::shape_calculator::compute_transposed_shape(*src);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
- }
-
- return Status{};
-}
-
-void CpuTransposeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
-
- const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
- auto dst = tensors.get_tensor(TensorType::ACL_DST);
-
- switch(src->info()->element_size())
- {
- case 1:
- transpose_8bit_elements(src, dst, window);
- break;
- case 2:
- transpose_16bit_elements(src, dst, window);
- break;
- case 4:
- transpose_32bit_elements(src, dst, window);
- break;
- default:
- ARM_COMPUTE_ERROR("Element size not supported");
- break;
- }
-}
-
-const char *CpuTransposeKernel::name() const
-{
- return "CpuTransposeKernel";
-}
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
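As a scalar reference for what all three blocked paths above compute (illustrative, not part of the library): dst(x, y) = src(y, x), addressed with the same byte strides from which the kernels derive dst_offset_in_bytes. The NEON paths perform exactly this in 8x8 or 4x4 tiles built from vtrn interleaves:

#include <cstddef>
#include <cstdint>
#include <cstring>

// Illustrative scalar transpose: row x of the destination receives column x of the source.
void transpose_ref(const uint8_t *src, uint8_t *dst,
                   size_t width, size_t height, size_t element_size,
                   size_t src_stride_bytes, size_t dst_stride_bytes)
{
    for(size_t y = 0; y < height; ++y)
    {
        for(size_t x = 0; x < width; ++x)
        {
            std::memcpy(dst + x * dst_stride_bytes + y * element_size,
                        src + y * src_stride_bytes + x * element_size,
                        element_size);
        }
    }
}

The tiled NEON versions win because each vector load touches one source row but each vector store fills one destination row, amortizing the strided accesses that make a naive element-wise transpose cache-unfriendly.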
diff --git a/src/core/cpu/kernels/CpuTransposeKernel.h b/src/core/cpu/kernels/CpuTransposeKernel.h
deleted file mode 100644
index f09f427be8..0000000000
--- a/src/core/cpu/kernels/CpuTransposeKernel.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_TRANSPOSE_KERNEL_H
-#define ARM_COMPUTE_CPU_TRANSPOSE_KERNEL_H
-
-#include "src/core/common/Macros.h"
-#include "src/core/cpu/ICpuKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-/** Kernel which transposes the elements of a matrix */
-class CpuTransposeKernel : public ICpuKernel
-{
-public:
- CpuTransposeKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuTransposeKernel);
- /** Configure kernel for a given list of arguments
- *
- * @param[in]  src Source tensor to transpose. Data types supported: All
- * @param[out] dst Destination tensor. Data types supported: Same as @p src
- */
- void configure(const ITensorInfo *src, ITensorInfo *dst);
- /** Static function to check if given info will lead to a valid configuration of @ref CpuTransposeKernel
- *
- * @param[in] src Source tensor to transpose. Data types supported: All
- * @param[in] dst Destination tensor. Data types supported: Same as @p src
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
- const char *name() const override;
-};
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_TRANSPOSE_KERNEL_H */
diff --git a/src/core/cpu/kernels/activation/list.h b/src/core/cpu/kernels/activation/list.h
deleted file mode 100644
index 409d025db0..0000000000
--- a/src/core/cpu/kernels/activation/list.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_CORE_NEON_KERNELS_ACTIVATION_LIST_H
-#define SRC_CORE_NEON_KERNELS_ACTIVATION_LIST_H
-
-namespace arm_compute
-{
-namespace cpu
-{
-#define DECLARE_ACTIVATION_KERNEL(func_name) \
- void func_name(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
-
-DECLARE_ACTIVATION_KERNEL(qasymm8_neon_activation);
-DECLARE_ACTIVATION_KERNEL(qasymm8_sve_activation);
-DECLARE_ACTIVATION_KERNEL(qasymm8_signed_neon_activation);
-DECLARE_ACTIVATION_KERNEL(qasymm8_signed_sve_activation);
-DECLARE_ACTIVATION_KERNEL(qsymm16_neon_activation);
-DECLARE_ACTIVATION_KERNEL(qsymm16_sve_activation);
-DECLARE_ACTIVATION_KERNEL(fp16_neon_activation);
-DECLARE_ACTIVATION_KERNEL(fp16_sve_activation);
-DECLARE_ACTIVATION_KERNEL(fp32_neon_activation);
-DECLARE_ACTIVATION_KERNEL(fp32_sve_activation);
-
-#undef DECLARE_ACTIVATION_KERNEL
-} // namespace cpu
-} // namespace arm_compute
-
-#endif /* SRC_CORE_NEON_KERNELS_ACTIVATION_LIST_H */
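For reference, each DECLARE_ACTIVATION_KERNEL invocation above expands to one prototype with the shared micro-kernel signature, e.g. for fp32_neon_activation:

// Expansion of DECLARE_ACTIVATION_KERNEL(fp32_neon_activation):
void fp32_neon_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window);

A single signature across every data-type and ISA variant is what allows an activation kernel selector to hold all of them behind one function-pointer type, in the same style as the subtraction dispatch table earlier in this diff.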
diff --git a/src/core/cpu/kernels/activation/neon/fp16.cpp b/src/core/cpu/kernels/activation/neon/fp16.cpp
deleted file mode 100644
index 6f2d5d8533..0000000000
--- a/src/core/cpu/kernels/activation/neon/fp16.cpp
+++ /dev/null
@@ -1,217 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/NEMath.h"
-
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-
-#include <arm_neon.h>
-#include <cmath>
-#include <cstddef>
-
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace
-{
-#ifndef __aarch64__
-inline float16x8_t mask_float_vector(const float16x8_t &in, const uint16x8_t &mask)
-{
- auto int_in = vreinterpretq_u16_f16(in);
- return vreinterpretq_f16_u16(wrapper::vand(int_in, mask));
-}
-#endif /* __aarch64__ */
-} // namespace
-
-void fp16_neon_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
-{
- /** SIMD vector tag type. */
- using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<float16_t, wrapper::traits::BitWidth::W128>;
- const ActivationLayerInfo::ActivationFunction act = act_info.activation();
-
- constexpr int window_step_x = 8;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
- win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input(src, win_collapsed);
- Iterator output(dst, win_collapsed);
-
-    // On non-aarch64, a small delta value is added to the input
-    // to prevent NaN values caused by zeros in inputs to SQRT.
-    // On aarch64 we call vsqrt directly, so no delta is needed.
-#ifndef __aarch64__
-    const auto delta = wrapper::vdup_n(static_cast<float16_t>(1e-7), ExactTagType{});
-#endif /* __aarch64__ */
-
- const auto const_1 = wrapper::vdup_n(static_cast<float16_t>(1.f), ExactTagType{});
- const auto const_0 = wrapper::vdup_n(static_cast<float16_t>(0.f), ExactTagType{});
- const auto const_6 = wrapper::vdup_n(static_cast<float16_t>(6.f), ExactTagType{});
- const auto const_3 = wrapper::vdup_n(static_cast<float16_t>(3.f), ExactTagType{});
- const auto const_inv_6 = wrapper::vdup_n(static_cast<float16_t>(0.166666667f), ExactTagType{});
-
- constexpr float soft_relu_thresh = 12.f;
- const auto vsoft_relu_thresh = wrapper::vdup_n(static_cast<float16_t>(soft_relu_thresh), ExactTagType{});
-
- const auto va = wrapper::vdup_n(static_cast<float16_t>(act_info.a()), ExactTagType{});
- const auto vb = wrapper::vdup_n(static_cast<float16_t>(act_info.b()), ExactTagType{});
- const auto a = static_cast<float16_t>(act_info.a());
- const auto b = static_cast<float16_t>(act_info.b());
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const float16_t *>(input.ptr());
- const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
-
- wrapper::traits::neon_bitvector_t<float16_t, wrapper::traits::BitWidth::W128> tmp;
-
-        // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto vin = wrapper::vloadq(input_ptr + x);
- switch(act)
- {
- case ActivationLayerInfo::ActivationFunction::ABS:
- tmp = wrapper::vabs(vin);
- break;
- case ActivationLayerInfo::ActivationFunction::LINEAR:
- tmp = wrapper::vmla(vb, va, vin);
- break;
- case ActivationLayerInfo::ActivationFunction::LOGISTIC:
- tmp = wrapper::vinv(wrapper::vadd(const_1, wrapper::vexpq(wrapper::vneg(vin))));
- break;
- case ActivationLayerInfo::ActivationFunction::RELU:
- tmp = wrapper::vmax(const_0, vin);
- break;
- case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
- tmp = wrapper::vmin(va, wrapper::vmax(const_0, vin));
- break;
- case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
- tmp = wrapper::vmin(va, wrapper::vmax(vb, vin));
- break;
- case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
- tmp = wrapper::vbsl(wrapper::vcgt(vin, const_0), vin, wrapper::vmul(va, vin));
- break;
- case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
- tmp = wrapper::vbsl(wrapper::vcgt(vin, vsoft_relu_thresh), vin, wrapper::vlog(wrapper::vadd(const_1, wrapper::vexpq(vin))));
- break;
- case ActivationLayerInfo::ActivationFunction::ELU:
- tmp = wrapper::vbsl(wrapper::vcge(vin, const_0), vin, wrapper::vmul(va, wrapper::vsub(wrapper::vexpq(vin), const_1)));
- break;
- case ActivationLayerInfo::ActivationFunction::SQRT:
-#ifdef __aarch64__
- tmp = wrapper::vsqrt(vin);
-#else /* __aarch64__ */
- {
- const auto bitmask = wrapper::vceq(vin, wrapper::vdup_n(0, ExactTagType{}));
- tmp = wrapper::vinv(wrapper::vinvsqrt(wrapper::vadd(vin, mask_float_vector(delta, bitmask))));
- tmp = mask_float_vector(tmp, wrapper::vnot(bitmask));
- }
-#endif /* __aarch64__ */
- break;
- case ActivationLayerInfo::ActivationFunction::SQUARE:
- tmp = wrapper::vmul(vin, vin);
- break;
- case ActivationLayerInfo::ActivationFunction::TANH:
- tmp = wrapper::vmul(va, wrapper::vtanh(wrapper::vmul(vb, vin)));
- break;
- case ActivationLayerInfo::ActivationFunction::IDENTITY:
- tmp = vin;
- break;
- case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
- tmp = wrapper::vmul(vin, wrapper::vmul(const_inv_6, wrapper::vmin(const_6, wrapper::vmax(const_0, wrapper::vadd(vin, const_3)))));
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported activation function");
- }
- wrapper::vstore(output_ptr + x, tmp);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const float16_t in = *(reinterpret_cast<const float16_t *>(input_ptr + x));
- float16_t tmp;
- switch(act)
- {
- case ActivationLayerInfo::ActivationFunction::ABS:
- tmp = std::abs(in);
- break;
- case ActivationLayerInfo::ActivationFunction::LINEAR:
- tmp = a * in + b;
- break;
- case ActivationLayerInfo::ActivationFunction::LOGISTIC:
- tmp = static_cast<float16_t>(1) / (static_cast<float16_t>(1) + std::exp(-in));
- break;
- case ActivationLayerInfo::ActivationFunction::RELU:
- tmp = std::max<float16_t>(static_cast<float16_t>(0), in);
- break;
- case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
- tmp = std::min<float16_t>(a, std::max(static_cast<float16_t>(0), in));
- break;
- case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
- tmp = std::min<float16_t>(a, std::max<float16_t>(b, in));
- break;
- case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
- tmp = (in > 0) ? in : a * in;
- break;
- case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
- tmp = (in > soft_relu_thresh) ? in : std::log(static_cast<float16_t>(1) + std::exp(in));
- break;
- case ActivationLayerInfo::ActivationFunction::ELU:
- tmp = (in >= 0) ? in : a * (std::exp(in) - 1);
- break;
- case ActivationLayerInfo::ActivationFunction::SQRT:
- tmp = std::sqrt(in);
- break;
- case ActivationLayerInfo::ActivationFunction::SQUARE:
- tmp = in * in;
- break;
- case ActivationLayerInfo::ActivationFunction::TANH:
- tmp = a * std::tanh(b * in);
- break;
- case ActivationLayerInfo::ActivationFunction::IDENTITY:
- tmp = in;
- break;
- case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
- tmp = in * ((std::min(std::max((in + 3), 0.0f), 6.0f)) * 0.166666667f);
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported activation function");
- }
- *(output_ptr + x) = tmp;
- }
- },
- input, output);
-}
-} // namespace cpu
-} // namespace arm_compute
-
-#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
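The non-aarch64 SQRT branch above is worth unpacking: armv7 NEON has no vector sqrt, so the kernel computes the square root via the reciprocal-sqrt estimate, adds a tiny delta to lanes that are exactly zero (so the estimate does not produce Inf and, through the later multiply, NaN), and masks those lanes back to zero afterwards. A plain-intrinsics fp32 sketch of the same trick (assumption: raw NEON intrinsics instead of the wrapper:: layer used by the kernels):

#include <arm_neon.h>

// Illustrative armv7-safe vector sqrt: sqrt(x) == 1 / rsqrt(x), with zero lanes patched.
float32x4_t sqrt_v7(float32x4_t x)
{
    const uint32x4_t is_zero = vceqq_f32(x, vdupq_n_f32(0.f));
    // Add a tiny delta only to zero lanes so rsqrt never sees an exact zero.
    const float32x4_t delta = vreinterpretq_f32_u32(
        vandq_u32(vreinterpretq_u32_f32(vdupq_n_f32(1e-24f)), is_zero));
    const float32x4_t xd = vaddq_f32(x, delta);

    float32x4_t r = vrsqrteq_f32(xd);                          // estimate of 1/sqrt(xd)
    r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(xd, r), r));       // one Newton-Raphson step

    float32x4_t y = vrecpeq_f32(r);                            // estimate of 1/r = sqrt(xd)
    y = vmulq_f32(y, vrecpsq_f32(r, y));                       // refine the reciprocal

    // Force lanes that were exactly zero back to 0, since sqrt(0) == 0.
    return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(y), is_zero));
}

On aarch64 none of this is needed because vsqrtq_f32 exists as a single instruction, which is exactly the split the #ifdef __aarch64__ in both activation files expresses.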
diff --git a/src/core/cpu/kernels/activation/neon/fp32.cpp b/src/core/cpu/kernels/activation/neon/fp32.cpp
deleted file mode 100644
index 54301d45ad..0000000000
--- a/src/core/cpu/kernels/activation/neon/fp32.cpp
+++ /dev/null
@@ -1,212 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensorPack.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/NEON/NEMath.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-
-#include <arm_neon.h>
-#include <cmath>
-#include <cstddef>
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace
-{
-#ifndef __aarch64__
-inline float32x4_t mask_float_vector(const float32x4_t &in, const uint32x4_t &mask)
-{
- auto int_in = vreinterpretq_u32_f32(in);
- return vreinterpretq_f32_u32(wrapper::vand(int_in, mask));
-}
-#endif /* __aarch64__ */
-} // namespace
-
-void fp32_neon_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
-{
- /** SIMD vector tag type. */
- using ExactTagType = typename arm_compute::wrapper::traits::neon_bitvector_tag_t<float, wrapper::traits::BitWidth::W128>;
-
- constexpr int window_step_x = 4;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const ActivationLayerInfo::ActivationFunction act = act_info.activation();
-
- Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
- win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input(src, win_collapsed);
- Iterator output(dst, win_collapsed);
-
-    // On non-aarch64, a small delta value is added to the input
-    // to prevent NaN values caused by zeros in inputs to SQRT.
-    // On aarch64 we call vsqrt directly, so no delta is needed.
-#ifndef __aarch64__
- const auto delta = wrapper::vdup_n(static_cast<float>(1e-24), ExactTagType {});
-#endif /* __aarch64__ */
- const auto const_1 = wrapper::vdup_n(static_cast<float>(1.f), ExactTagType {});
- const auto const_0 = wrapper::vdup_n(static_cast<float>(0.f), ExactTagType{});
- const auto const_6 = wrapper::vdup_n(static_cast<float>(6.f), ExactTagType{});
- const auto const_3 = wrapper::vdup_n(static_cast<float>(3.f), ExactTagType{});
- const auto const_inv_6 = wrapper::vdup_n(static_cast<float>(0.166666667f), ExactTagType{});
-
- constexpr float soft_relu_thresh = 12.f;
- const auto vsoft_relu_thresh = wrapper::vdup_n(static_cast<float>(soft_relu_thresh), ExactTagType{});
-
- const auto va = wrapper::vdup_n(static_cast<float>(act_info.a()), ExactTagType{});
- const auto vb = wrapper::vdup_n(static_cast<float>(act_info.b()), ExactTagType{});
- const auto a = static_cast<float>(act_info.a());
- const auto b = static_cast<float>(act_info.b());
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const float *>(input.ptr());
- const auto output_ptr = reinterpret_cast<float *>(output.ptr());
-
- wrapper::traits::neon_bitvector_t<float, wrapper::traits::BitWidth::W128> tmp;
-
-        // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto vin = wrapper::vloadq(input_ptr + x);
- switch(act)
- {
- case ActivationLayerInfo::ActivationFunction::ABS:
- tmp = wrapper::vabs(vin);
- break;
- case ActivationLayerInfo::ActivationFunction::LINEAR:
- tmp = wrapper::vmla(vb, va, vin);
- break;
- case ActivationLayerInfo::ActivationFunction::LOGISTIC:
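- // logistic(x) = 1 / (1 + exp(-x))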
- tmp = wrapper::vinv(wrapper::vadd(const_1, wrapper::vexpq(wrapper::vneg(vin))));
- break;
- case ActivationLayerInfo::ActivationFunction::RELU:
- tmp = wrapper::vmax(const_0, vin);
- break;
- case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
- tmp = wrapper::vmin(va, wrapper::vmax(const_0, vin));
- break;
- case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
- tmp = wrapper::vmin(va, wrapper::vmax(vb, vin));
- break;
- case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
- tmp = wrapper::vbsl(wrapper::vcgt(vin, const_0), vin, wrapper::vmul(va, vin));
- break;
- case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
- tmp = wrapper::vbsl(wrapper::vcgt(vin, vsoft_relu_thresh), vin, wrapper::vlog(wrapper::vadd(const_1, wrapper::vexpq(vin))));
- break;
- case ActivationLayerInfo::ActivationFunction::ELU:
- tmp = wrapper::vbsl(wrapper::vcge(vin, const_0), vin, wrapper::vmul(va, wrapper::vsub(wrapper::vexpq(vin), const_1)));
- break;
- case ActivationLayerInfo::ActivationFunction::SQRT:
-#ifdef __aarch64__
- tmp = wrapper::vsqrt(vin);
-#else /* __aarch64__ */
- {
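- // bitmask selects the lanes where the input is exactly zero; delta is
- // added only to those lanes so the reciprocal-sqrt never sees a zero,
- // and the final mask forces those lanes back to an exact 0.f result.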
- const auto bitmask = wrapper::vceq(vin, wrapper::vdup_n(0.f, ExactTagType{}));
- tmp = wrapper::vinv(wrapper::vinvsqrt(wrapper::vadd(vin, mask_float_vector(delta, bitmask))));
- tmp = mask_float_vector(tmp, wrapper::vnot(bitmask));
- }
-#endif /* __aarch64__ */
- break;
- case ActivationLayerInfo::ActivationFunction::SQUARE:
- tmp = wrapper::vmul(vin, vin);
- break;
- case ActivationLayerInfo::ActivationFunction::TANH:
- tmp = wrapper::vmul(va, wrapper::vtanh(wrapper::vmul(vb, vin)));
- break;
- case ActivationLayerInfo::ActivationFunction::IDENTITY:
- tmp = vin;
- break;
- case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
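- // hard_swish(x) = x * relu6(x + 3) / 6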
- tmp = wrapper::vmul(vin, wrapper::vmul(const_inv_6, wrapper::vmin(const_6, wrapper::vmax(const_0, wrapper::vadd(vin, const_3)))));
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported activation function");
- }
- wrapper::vstore(output_ptr + x, tmp);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const float in = *(reinterpret_cast<const float *>(input_ptr + x));
- float tmp;
- switch(act)
- {
- case ActivationLayerInfo::ActivationFunction::ABS:
- tmp = std::abs(in);
- break;
- case ActivationLayerInfo::ActivationFunction::LINEAR:
- tmp = a * in + b;
- break;
- case ActivationLayerInfo::ActivationFunction::LOGISTIC:
- tmp = static_cast<float>(1) / (static_cast<float>(1) + std::exp(-in));
- break;
- case ActivationLayerInfo::ActivationFunction::RELU:
- tmp = std::max<float>(static_cast<float>(0), in);
- break;
- case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
- tmp = std::min<float>(a, std::max(static_cast<float>(0), in));
- break;
- case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
- tmp = std::min<float>(a, std::max<float>(b, in));
- break;
- case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
- tmp = (in > 0) ? in : a * in;
- break;
- case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
- tmp = (in > soft_relu_thresh) ? in : std::log(static_cast<float>(1) + std::exp(in));
- break;
- case ActivationLayerInfo::ActivationFunction::ELU:
- tmp = (in >= 0) ? in : a * (std::exp(in) - 1);
- break;
- case ActivationLayerInfo::ActivationFunction::SQRT:
- tmp = std::sqrt(in);
- break;
- case ActivationLayerInfo::ActivationFunction::SQUARE:
- tmp = in * in;
- break;
- case ActivationLayerInfo::ActivationFunction::TANH:
- tmp = a * std::tanh(b * in);
- break;
- case ActivationLayerInfo::ActivationFunction::IDENTITY:
- tmp = in;
- break;
- case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
- tmp = in * ((std::min(std::max((in + 3), 0.0f), 6.0f)) * 0.166666667f);
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported activation function");
- }
- *(output_ptr + x) = tmp;
- }
- },
- input, output);
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/core/cpu/kernels/activation/neon/qasymm8.cpp b/src/core/cpu/kernels/activation/neon/qasymm8.cpp
deleted file mode 100644
index a1217435b6..0000000000
--- a/src/core/cpu/kernels/activation/neon/qasymm8.cpp
+++ /dev/null
@@ -1,262 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/NEON/NEAsymm.h"
-#include "src/core/NEON/NEMath.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-
-#include <arm_neon.h>
-#include <cmath>
-#include <cstddef>
-
-namespace arm_compute
-{
-namespace cpu
-{
-void qasymm8_neon_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
-{
- constexpr int window_step_x = 16;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const ActivationLayerInfo::ActivationFunction act = act_info.activation();
-
- Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
- win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input(src, win_collapsed);
- Iterator output(dst, win_collapsed);
-
- const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform();
- const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform();
- const qasymm8x16_t va = vdupq_n_u8(quantize_qasymm8(act_info.a(), qi_in));
- const qasymm8x16_t vb = vdupq_n_u8(quantize_qasymm8(act_info.b(), qi_in));
- const qasymm8_t a = quantize_qasymm8(act_info.a(), qi_in);
- const qasymm8_t b = quantize_qasymm8(act_info.b(), qi_in);
- const qasymm8_t const_0 = quantize_qasymm8(0.f, qi_in);
- const qasymm8x16_t vconst_0 = vdupq_n_u8(const_0);
- const auto vconst_1 = vdupq_n_f32(1.f);
-#ifndef __aarch64__
- const auto vconst_0_f32 = vdupq_n_f32(0.f);
-#endif // __aarch64__
- const float32x4_t va_f32 = vdupq_n_f32(act_info.a());
- const float32x4_t vb_f32 = vdupq_n_f32(act_info.b());
- const float a_f32 = act_info.a();
- const float b_f32 = act_info.b();
- const auto const_6_f32 = vdupq_n_f32(6.f);
- const auto const_0_f32 = vdupq_n_f32(0.f);
- const auto const_3_f32 = vdupq_n_f32(3.f);
- const auto const_inv_6_f32 = vdupq_n_f32(0.166666667f);
-
- // Initialise scale/offset for re-quantization
- float s = qi_in.scale / qi_out.scale;
- float o = -qi_in.offset * s + qi_out.offset;
- float32x4_t vs = vdupq_n_f32(s);
- float32x4_t vo = vdupq_n_f32(o);
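- // Derivation: x = (q_in - z_in) * scale_in and q_out = x / scale_out + z_out,
- // so q_out = q_in * s + o with s = scale_in / scale_out and o = z_out - z_in * s.
- // E.g. (hypothetical values) scale_in = 0.5, z_in = 10, scale_out = 0.25, z_out = 5
- // gives s = 2 and o = -15.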
-
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const qasymm8_t *>(input.ptr());
- const auto output_ptr = reinterpret_cast<qasymm8_t *>(output.ptr());
-
- wrapper::traits::neon_bitvector_t<qasymm8_t, wrapper::traits::BitWidth::W128> tmp;
-
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto vin = wrapper::vloadq(input_ptr + x);
- if(act == ActivationLayerInfo::ActivationFunction::RELU)
- {
- // Perform activation
- tmp = vmaxq_u8(vconst_0, vin);
- // Re-quantize to new output space
- tmp = vmlaq_qasymm8(tmp, vs, vo);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
- {
- // Perform activation
- tmp = vminq_u8(va, vmaxq_u8(vconst_0, vin));
- // Re-quantize to new output space
- tmp = vmlaq_qasymm8(tmp, vs, vo);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
- {
- // Perform activation
- tmp = vminq_u8(va, vmaxq_u8(vb, vin));
- // Re-quantize to new output space
- tmp = vmlaq_qasymm8(tmp, vs, vo);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
- {
- // De-quantize
- const auto vin_deq = vdequantize(vin, qi_in);
- // Perform activation
- const float32x4x4_t tmp_dep =
- {
- {
- wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[0])))),
- wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[1])))),
- wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[2])))),
- wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[3])))),
- }
- };
- // Re-quantize to new output space
- tmp = vquantize(tmp_dep, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::TANH)
- {
- // De-quantize
- const auto vin_deq = vdequantize(vin, qi_in);
- // Perform activation
- const float32x4x4_t tmp_dep =
- {
- {
- wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))),
- wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))),
- wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[2], vb_f32))),
- wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[3], vb_f32))),
- }
- };
- // Re-quantize to new output space
- tmp = vquantize(tmp_dep, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
- {
- // De-quantize
- const auto vin_deq = vdequantize(vin, qi_in);
- // Perform activation
- const float32x4x4_t tmp_dep =
- {
- {
- wrapper::vmul(vin_deq.val[0], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[0], const_3_f32))))),
- wrapper::vmul(vin_deq.val[1], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[1], const_3_f32))))),
- wrapper::vmul(vin_deq.val[2], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[2], const_3_f32))))),
- wrapper::vmul(vin_deq.val[3], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[3], const_3_f32))))),
- }
- };
- // Re-quantize to new output space
- tmp = vquantize(tmp_dep, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
- {
- const auto vin_deq = vdequantize(vin, qi_in);
-
-#ifdef __aarch64__
- const uint32x4x4_t pos_mask =
- {
- {
- wrapper::vcgtz(vin_deq.val[0]),
- wrapper::vcgtz(vin_deq.val[1]),
- wrapper::vcgtz(vin_deq.val[2]),
- wrapper::vcgtz(vin_deq.val[3]),
- }
- };
-#else // __aarch64__
- const uint32x4x4_t pos_mask =
- {
- {
- wrapper::vcgt(vin_deq.val[0], vconst_0_f32),
- wrapper::vcgt(vin_deq.val[1], vconst_0_f32),
- wrapper::vcgt(vin_deq.val[2], vconst_0_f32),
- wrapper::vcgt(vin_deq.val[3], vconst_0_f32),
- }
- };
-#endif // __aarch64__
-
- const float32x4x4_t tmp_dep =
- {
- {
- wrapper::vbsl(pos_mask.val[0], vin_deq.val[0], wrapper::vmul(va_f32, vin_deq.val[0])),
- wrapper::vbsl(pos_mask.val[1], vin_deq.val[1], wrapper::vmul(va_f32, vin_deq.val[1])),
- wrapper::vbsl(pos_mask.val[2], vin_deq.val[2], wrapper::vmul(va_f32, vin_deq.val[2])),
- wrapper::vbsl(pos_mask.val[3], vin_deq.val[3], wrapper::vmul(va_f32, vin_deq.val[3])),
- }
- };
-
- tmp = vquantize(tmp_dep, qi_out);
- }
- else
- {
- ARM_COMPUTE_ERROR("Unsupported activation function");
- }
- wrapper::vstore(output_ptr + x, tmp);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- qasymm8_t in = *(reinterpret_cast<const qasymm8_t *>(input_ptr + x));
- qasymm8_t tmp = 0;
- if(act == ActivationLayerInfo::ActivationFunction::RELU)
- {
- tmp = std::max(const_0, in);
- tmp = utility::clamp<int32_t, qasymm8_t>(tmp * s + o);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
- {
- tmp = std::min(a, std::max(const_0, in));
- tmp = utility::clamp<int32_t, qasymm8_t>(tmp * s + o);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
- {
- tmp = std::min(a, std::max(b, in));
- tmp = utility::clamp<int32_t, qasymm8_t>(tmp * s + o);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
- {
- float tmp_f = dequantize_qasymm8(in, qi_in);
- tmp_f = 1.f / (1.f + std::exp(-tmp_f));
- tmp = quantize_qasymm8(tmp_f, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::TANH)
- {
- float tmp_f = dequantize_qasymm8(in, qi_in);
- tmp_f = a_f32 * std::tanh(b_f32 * tmp_f);
- tmp = quantize_qasymm8(tmp_f, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
- {
- float tmp_f = dequantize_qasymm8(in, qi_in);
- tmp_f = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f);
- tmp = quantize_qasymm8(tmp_f, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
- {
- float tmp_f = dequantize_qasymm8(in, qi_in);
- tmp_f = tmp_f > 0 ? tmp_f : tmp_f * a_f32;
- tmp = quantize_qasymm8(tmp_f, qi_out);
- }
- else
- {
- ARM_COMPUTE_ERROR("Unsupported activation function");
- }
- *(output_ptr + x) = tmp;
- }
- },
- input, output);
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/core/cpu/kernels/activation/neon/qasymm8_signed.cpp b/src/core/cpu/kernels/activation/neon/qasymm8_signed.cpp
deleted file mode 100644
index 8b40bf8e72..0000000000
--- a/src/core/cpu/kernels/activation/neon/qasymm8_signed.cpp
+++ /dev/null
@@ -1,261 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/NEON/NEAsymm.h"
-#include "src/core/NEON/NEMath.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-
-#include <arm_neon.h>
-#include <cmath>
-#include <cstddef>
-
-namespace arm_compute
-{
-namespace cpu
-{
-void qasymm8_signed_neon_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
-{
- constexpr int window_step_x = 16;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const ActivationLayerInfo::ActivationFunction act = act_info.activation();
-
- Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
- win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input(src, win_collapsed);
- Iterator output(dst, win_collapsed);
-
- const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform();
- const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform();
- const qasymm8x16_signed_t va = vdupq_n_s8(quantize_qasymm8_signed(act_info.a(), qi_in));
- const qasymm8x16_signed_t vb = vdupq_n_s8(quantize_qasymm8_signed(act_info.b(), qi_in));
- const qasymm8_signed_t a = quantize_qasymm8_signed(act_info.a(), qi_in);
- const qasymm8_signed_t b = quantize_qasymm8_signed(act_info.b(), qi_in);
- const qasymm8_signed_t const_0 = quantize_qasymm8_signed(0.f, qi_in);
- const qasymm8x16_signed_t vconst_0 = vdupq_n_s8(const_0);
- const auto vconst_1 = vdupq_n_f32(1.f);
-#ifndef __aarch64__
- const auto vconst_0_f32 = vdupq_n_f32(0.f);
-#endif // __aarch64__
- const float32x4_t va_f32 = vdupq_n_f32(act_info.a());
- const float32x4_t vb_f32 = vdupq_n_f32(act_info.b());
- const float a_f32 = act_info.a();
- const float b_f32 = act_info.b();
- const auto const_6_f32 = vdupq_n_f32(6.f);
- const auto const_0_f32 = vdupq_n_f32(0.f);
- const auto const_3_f32 = vdupq_n_f32(3.f);
- const auto const_inv_6_f32 = vdupq_n_f32(0.166666667f);
-
- // Initialise scale/offset for re-quantization
- float s = qi_in.scale / qi_out.scale;
- float o = -qi_in.offset * s + qi_out.offset;
- float32x4_t vs = vdupq_n_f32(s);
- float32x4_t vo = vdupq_n_f32(o);
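- // Same requantization identity as the unsigned kernel: q_out = q_in * s + o.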
-
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const qasymm8_signed_t *>(input.ptr());
- const auto output_ptr = reinterpret_cast<qasymm8_signed_t *>(output.ptr());
-
- wrapper::traits::neon_bitvector_t<qasymm8_signed_t, wrapper::traits::BitWidth::W128> tmp;
-
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto vin = wrapper::vloadq(input_ptr + x);
- if(act == ActivationLayerInfo::ActivationFunction::RELU)
- {
- // Perform activation
- tmp = vmaxq_s8(vconst_0, vin);
- // Re-quantize to new output space
- tmp = vmlaq_qasymm8_signed(tmp, vs, vo);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
- {
- // Perform activation
- tmp = vminq_s8(va, vmaxq_s8(vconst_0, vin));
- // Re-quantize to new output space
- tmp = vmlaq_qasymm8_signed(tmp, vs, vo);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
- {
- // Perform activation
- tmp = vminq_s8(va, vmaxq_s8(vb, vin));
- // Re-quantize to new output space
- tmp = vmlaq_qasymm8_signed(tmp, vs, vo);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
- {
- // De-quantize
- const auto vin_deq = vdequantize(vin, qi_in);
- // Perform activation
- const float32x4x4_t tmp_dep =
- {
- {
- wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[0])))),
- wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[1])))),
- wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[2])))),
- wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[3])))),
- }
- };
- // Re-quantize to new output space
- tmp = vquantize_signed(tmp_dep, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::TANH)
- {
- // De-quantize
- const auto vin_deq = vdequantize(vin, qi_in);
- // Perform activation
- const float32x4x4_t tmp_dep =
- {
- {
- wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))),
- wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))),
- wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[2], vb_f32))),
- wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[3], vb_f32))),
- }
- };
- // Re-quantize to new output space
- tmp = vquantize_signed(tmp_dep, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
- {
- // De-quantize
- const auto vin_deq = vdequantize(vin, qi_in);
- // Perform activation
- const float32x4x4_t tmp_dep =
- {
- {
- wrapper::vmul(vin_deq.val[0], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[0], const_3_f32))))),
- wrapper::vmul(vin_deq.val[1], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[1], const_3_f32))))),
- wrapper::vmul(vin_deq.val[2], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[2], const_3_f32))))),
- wrapper::vmul(vin_deq.val[3], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[3], const_3_f32))))),
- }
- };
- // Re-quantize to new output space
- tmp = vquantize_signed(tmp_dep, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
- {
- const auto vin_deq = vdequantize(vin, qi_in);
-
-#ifdef __aarch64__
- const uint32x4x4_t pos_mask =
- {
- {
- wrapper::vcgtz(vin_deq.val[0]),
- wrapper::vcgtz(vin_deq.val[1]),
- wrapper::vcgtz(vin_deq.val[2]),
- wrapper::vcgtz(vin_deq.val[3]),
- }
- };
-#else // __aarch64__
- const uint32x4x4_t pos_mask =
- {
- {
- wrapper::vcgt(vin_deq.val[0], vconst_0_f32),
- wrapper::vcgt(vin_deq.val[1], vconst_0_f32),
- wrapper::vcgt(vin_deq.val[2], vconst_0_f32),
- wrapper::vcgt(vin_deq.val[3], vconst_0_f32),
- }
- };
-#endif // __aarch64__
-
- const float32x4x4_t tmp_dep =
- {
- {
- wrapper::vbsl(pos_mask.val[0], vin_deq.val[0], wrapper::vmul(va_f32, vin_deq.val[0])),
- wrapper::vbsl(pos_mask.val[1], vin_deq.val[1], wrapper::vmul(va_f32, vin_deq.val[1])),
- wrapper::vbsl(pos_mask.val[2], vin_deq.val[2], wrapper::vmul(va_f32, vin_deq.val[2])),
- wrapper::vbsl(pos_mask.val[3], vin_deq.val[3], wrapper::vmul(va_f32, vin_deq.val[3])),
- }
- };
-
- tmp = vquantize_signed(tmp_dep, qi_out);
- }
- else
- {
- ARM_COMPUTE_ERROR("Unsupported activation function");
- }
- wrapper::vstore(output_ptr + x, tmp);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- qasymm8_signed_t in = *(reinterpret_cast<const qasymm8_signed_t *>(input_ptr + x));
- qasymm8_signed_t tmp = 0;
- if(act == ActivationLayerInfo::ActivationFunction::RELU)
- {
- tmp = std::max(const_0, in);
- tmp = utility::clamp<int32_t, qasymm8_signed_t>(tmp * s + o);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
- {
- tmp = std::min(a, std::max(const_0, in));
- tmp = utility::clamp<int32_t, qasymm8_signed_t>(tmp * s + o);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
- {
- tmp = std::min(a, std::max(b, in));
- tmp = utility::clamp<int32_t, qasymm8_signed_t>(tmp * s + o);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
- {
- float tmp_f = dequantize_qasymm8_signed(in, qi_in);
- tmp_f = 1.f / (1.f + std::exp(-tmp_f));
- tmp = quantize_qasymm8_signed(tmp_f, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::TANH)
- {
- float tmp_f = dequantize_qasymm8_signed(in, qi_in);
- tmp_f = a_f32 * std::tanh(b_f32 * tmp_f);
- tmp = quantize_qasymm8_signed(tmp_f, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
- {
- float tmp_f = dequantize_qasymm8_signed(in, qi_in);
- tmp_f = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f);
- tmp = quantize_qasymm8_signed(tmp_f, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
- {
- float tmp_f = dequantize_qasymm8_signed(in, qi_in);
- tmp_f = tmp_f > 0 ? tmp_f : tmp_f * a_f32;
- tmp = quantize_qasymm8_signed(tmp_f, qi_out);
- }
- else
- {
- ARM_COMPUTE_ERROR("Unsupported activation function");
- }
- *(output_ptr + x) = tmp;
- }
- },
- input, output);
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/core/cpu/kernels/activation/neon/qsymm16.cpp b/src/core/cpu/kernels/activation/neon/qsymm16.cpp
deleted file mode 100644
index 54b41820f2..0000000000
--- a/src/core/cpu/kernels/activation/neon/qsymm16.cpp
+++ /dev/null
@@ -1,138 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensorPack.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/experimental/Types.h"
-#include "src/core/NEON/NEMath.h"
-#include "src/core/NEON/NESymm.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-
-#include <arm_neon.h>
-#include <cmath>
-#include <cstddef>
-
-namespace arm_compute
-{
-namespace cpu
-{
-void qsymm16_neon_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
-{
- constexpr int window_step_x = 8;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const ActivationLayerInfo::ActivationFunction act = act_info.activation();
-
- Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
- win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input(src, win_collapsed);
- Iterator output(dst, win_collapsed);
-
- const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform();
- const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform();
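- // QSYMM16 is a symmetric format (zero offset), so (de)quantization below
- // uses only the scale.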
- const auto vconst_1 = vdupq_n_f32(1.f);
- const float32x4_t va_f32 = vdupq_n_f32(act_info.a());
- const float32x4_t vb_f32 = vdupq_n_f32(act_info.b());
- const float a_f32 = act_info.a();
- const float b_f32 = act_info.b();
-
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const qsymm16_t *>(input.ptr());
- const auto output_ptr = reinterpret_cast<qsymm16_t *>(output.ptr());
-
- wrapper::traits::neon_bitvector_t<qsymm16_t, wrapper::traits::BitWidth::W128> tmp;
-
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto vin = wrapper::vloadq(input_ptr + x);
- if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
- {
- // De-quantize
- const auto vin_deq = vdequantize_int16(vin, qi_in.scale);
- // Perform activation
- const float32x4x2_t tmp_dep =
- {
- {
- wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[0])))),
- wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[1])))),
- }
- };
- // Re-quantize to new output space
- tmp = vquantize_int16(tmp_dep, qi_out.scale);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::TANH)
- {
- // De-quantize
- const auto vin_deq = vdequantize_int16(vin, qi_in.scale);
- // Perform activation
- const float32x4x2_t tmp_dep =
- {
- {
- wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))),
- wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))),
- }
- };
- // Re-quantize to new output space
- tmp = vquantize_int16(tmp_dep, qi_out.scale);
- }
- else
- {
- ARM_COMPUTE_ERROR("Unsupported activation function");
- }
- wrapper::vstore(output_ptr + x, tmp);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- qsymm16_t in = *(reinterpret_cast<const qsymm16_t *>(input_ptr + x));
- qsymm16_t tmp = 0;
- if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
- {
- float tmp_f = dequantize_qsymm16(in, qi_in.scale);
- tmp_f = 1.f / (1.f + std::exp(-tmp_f));
- tmp = quantize_qsymm16(tmp_f, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::TANH)
- {
- float tmp_f = dequantize_qsymm16(in, qi_in.scale);
- tmp_f = a_f32 * std::tanh(b_f32 * tmp_f);
- tmp = quantize_qsymm16(tmp_f, qi_out);
- }
- else
- {
- ARM_COMPUTE_ERROR("Unsupported activation function");
- }
- *(output_ptr + x) = tmp;
- }
- },
- input, output);
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/core/cpu/kernels/activation/sve/fp16.cpp b/src/core/cpu/kernels/activation/sve/fp16.cpp
deleted file mode 100644
index 5e76e82c52..0000000000
--- a/src/core/cpu/kernels/activation/sve/fp16.cpp
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#if defined(__ARM_FEATURE_SVE)
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensorPack.h"
-#include "arm_compute/core/Window.h"
-
-#include <cmath>
-#include <cstddef>
-
-#include "src/core/NEON/SVEMath.h"
-#include <arm_sve.h>
-
-namespace arm_compute
-{
-namespace cpu
-{
-void fp16_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
-{
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const ActivationLayerInfo::ActivationFunction act = act_info.activation();
-
- Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
- win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input(src, win_collapsed);
- Iterator output(dst, win_collapsed);
-
- const auto const_1 = svdup_n_f16(1.f);
- const auto const_0 = svdup_n_f16(0.f);
- const auto const_6 = svdup_n_f16(6.f);
- const auto const_3 = svdup_n_f16(3.f);
- const auto const_inv_6 = svdup_n_f16(0.166666667f);
-
- const auto va = svdup_n_f16(act_info.a());
- const auto vb = svdup_n_f16(act_info.b());
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const float16_t *>(input.ptr());
- const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
-
- svfloat16_t tmp;
-
- int x = window_start_x;
- svbool_t pg = svwhilelt_b16(x, window_end_x);
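- // Vector-length agnostic loop: pg enables lanes [x, window_end_x), so the
- // final partial vector is handled by predication rather than a scalar tail.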
- do
- {
- const auto vin = svld1_f16(pg, input_ptr + x);
- switch(act)
- {
- case ActivationLayerInfo::ActivationFunction::ABS:
- tmp = svabs_f16_z(pg, vin);
- break;
- case ActivationLayerInfo::ActivationFunction::LINEAR:
- tmp = svmla_f16_z(pg, vb, va, vin);
- break;
- case ActivationLayerInfo::ActivationFunction::LOGISTIC:
- tmp = svinv_f16_z(pg, svadd_f16_z(pg, const_1, svexp_f16_z(pg, svneg_f16_z(pg, vin))));
- break;
- case ActivationLayerInfo::ActivationFunction::RELU:
- tmp = svmax_f16_z(pg, const_0, vin);
- break;
- case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
- tmp = svmin_f16_z(pg, va, svmax_f16_z(pg, const_0, vin));
- break;
- case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
- tmp = svmin_f16_z(pg, va, svmax_f16_z(pg, vb, vin));
- break;
- case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
- tmp = svadd_f16_z(pg, svmul_f16_z(pg, svmin_f16_z(pg, vin, const_0), va), svmax_f16_z(pg, vin, const_0));
- break;
- case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
- tmp = svlog_f16_z(pg, svadd_f16_z(pg, const_1, svexp_f16_z(pg, vin)));
- break;
- case ActivationLayerInfo::ActivationFunction::ELU:
- tmp = svsel_f16(svcmpgt_f16(pg, vin, const_0), vin, svmul_f16_z(pg, va, svsub_f16_z(pg, svexp_f16_z(pg, vin), const_1)));
- break;
- case ActivationLayerInfo::ActivationFunction::SQRT:
- tmp = svsqrt_f16_z(pg, vin);
- break;
- case ActivationLayerInfo::ActivationFunction::SQUARE:
- tmp = svmul_f16_z(pg, vin, vin);
- break;
- case ActivationLayerInfo::ActivationFunction::TANH:
- tmp = svmul_f16_z(pg, va, svtanh_f16_z(pg, svmul_f16_z(pg, vb, vin)));
- break;
- case ActivationLayerInfo::ActivationFunction::IDENTITY:
- tmp = vin;
- break;
- case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
- tmp = svmul_f16_z(pg, vin, svmul_f16_z(pg, const_inv_6, svmin_f16_z(pg, const_6, svmax_f16_z(pg, const_0, svadd_f16_z(pg, vin, const_3)))));
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported activation function");
- }
- svst1_f16(pg, output_ptr + x, tmp);
-
- x += svcnth();
- pg = svwhilelt_b16(x, window_end_x);
-
- }
- while(svptest_any(svptrue_b16(), pg));
- },
- input, output);
-}
-} // namespace cpu
-} // namespace arm_compute
-#endif /* defined(__ARM_FEATURE_SVE) */ \ No newline at end of file
diff --git a/src/core/cpu/kernels/activation/sve/fp32.cpp b/src/core/cpu/kernels/activation/sve/fp32.cpp
deleted file mode 100644
index cb9f82eb39..0000000000
--- a/src/core/cpu/kernels/activation/sve/fp32.cpp
+++ /dev/null
@@ -1,131 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#if defined(__ARM_FEATURE_SVE)
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensorPack.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/NEON/SVEMath.h"
-
-#include <cmath>
-#include <cstddef>
-
-#include <arm_sve.h>
-
-namespace arm_compute
-{
-namespace cpu
-{
-void fp32_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
-{
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const ActivationLayerInfo::ActivationFunction act = act_info.activation();
-
- Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
- win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input(src, win_collapsed);
- Iterator output(dst, win_collapsed);
-
- const auto const_1 = svdup_n_f32(1.f);
- const auto const_0 = svdup_n_f32(0.f);
- const auto const_6 = svdup_n_f32(6.f);
- const auto const_3 = svdup_n_f32(3.f);
- const auto const_inv_6 = svdup_n_f32(0.166666667f);
-
- const auto va = svdup_n_f32(act_info.a());
- const auto vb = svdup_n_f32(act_info.b());
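- // The _z intrinsics zero inactive lanes; correctness relies only on the
- // active lanes, which are stored through the same predicate below.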
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const float *>(input.ptr());
- const auto output_ptr = reinterpret_cast<float *>(output.ptr());
-
- svfloat32_t tmp;
-
- // Process one predicated vector (svcntw() lanes) per iteration
- int x = window_start_x;
- svbool_t pg = svwhilelt_b32(x, window_end_x);
- do
- {
- const auto vin = svld1_f32(pg, input_ptr + x);
- switch(act)
- {
- case ActivationLayerInfo::ActivationFunction::ABS:
- tmp = svabs_f32_z(pg, vin);
- break;
- case ActivationLayerInfo::ActivationFunction::LINEAR:
- tmp = svmla_f32_z(pg, vb, va, vin);
- break;
- case ActivationLayerInfo::ActivationFunction::LOGISTIC:
- tmp = svinv_f32_z(pg, svadd_f32_z(pg, const_1, svexp_f32_z(pg, svneg_f32_z(pg, vin))));
- break;
- case ActivationLayerInfo::ActivationFunction::RELU:
- tmp = svmax_f32_z(pg, const_0, vin);
- break;
- case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
- tmp = svmin_f32_z(pg, va, svmax_f32_z(pg, const_0, vin));
- break;
- case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
- tmp = svmin_f32_z(pg, va, svmax_f32_z(pg, vb, vin));
- break;
- case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
- tmp = svadd_f32_z(pg, svmul_f32_z(pg, svmin_f32_z(pg, vin, const_0), va), svmax_f32_z(pg, vin, const_0));
- break;
- case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
- tmp = svlog_f32_z(pg, svadd_f32_z(pg, const_1, svexp_f32_z(pg, vin)));
- break;
- case ActivationLayerInfo::ActivationFunction::ELU:
- tmp = svsel_f32(svcmpgt_f32(pg, vin, const_0), vin, svmul_f32_z(pg, va, svsub_f32_z(pg, svexp_f32_z(pg, vin), const_1)));
- break;
- case ActivationLayerInfo::ActivationFunction::SQRT:
- tmp = svsqrt_f32_z(pg, vin);
- break;
- case ActivationLayerInfo::ActivationFunction::SQUARE:
- tmp = svmul_f32_z(pg, vin, vin);
- break;
- case ActivationLayerInfo::ActivationFunction::TANH:
- tmp = svmul_f32_z(pg, va, svtanh_f32_z(pg, svmul_f32_z(pg, vb, vin)));
- break;
- case ActivationLayerInfo::ActivationFunction::IDENTITY:
- tmp = vin;
- break;
- case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
- tmp = svmul_f32_z(pg, vin, svmul_f32_z(pg, const_inv_6, svmin_f32_z(pg, const_6, svmax_f32_z(pg, const_0, svadd_f32_z(pg, vin, const_3)))));
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported activation function");
- }
- svst1_f32(pg, output_ptr + x, tmp);
-
- x += svcntw();
- pg = svwhilelt_b32(x, window_end_x);
-
- }
- while(svptest_any(svptrue_b32(), pg));
- },
- input, output);
-}
-} // namespace cpu
-} // namespace arm_compute
-#endif /* defined(__ARM_FEATURE_SVE) */ \ No newline at end of file
diff --git a/src/core/cpu/kernels/activation/sve/qasymm8.cpp b/src/core/cpu/kernels/activation/sve/qasymm8.cpp
deleted file mode 100644
index 228b4ae530..0000000000
--- a/src/core/cpu/kernels/activation/sve/qasymm8.cpp
+++ /dev/null
@@ -1,254 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Window.h"
-
-#include <cmath>
-#include <cstddef>
-
-#if defined(__ARM_FEATURE_SVE2)
-#include "src/core/NEON/SVEAsymm.h"
-#include "src/core/NEON/SVEMath.h"
-#include <arm_sve.h>
-
-namespace arm_compute
-{
-namespace cpu
-{
-void qasymm8_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
-{
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const ActivationLayerInfo::ActivationFunction act = act_info.activation();
-
- Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
- win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input(src, win_collapsed);
- Iterator output(dst, win_collapsed);
-
- const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform();
- const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform();
- const auto va = svdup_n_u8(quantize_qasymm8(act_info.a(), qi_in));
- const auto vb = svdup_n_u8(quantize_qasymm8(act_info.b(), qi_in));
- const auto const_0 = quantize_qasymm8(0.f, qi_in);
- const auto vconst_0 = svdup_n_u8(const_0);
- const auto vconst_1 = svdup_n_f32(1.f);
- const auto va_f32 = svdup_n_f32(act_info.a());
- const auto vb_f32 = svdup_n_f32(act_info.b());
- const auto const_6_f32 = svdup_n_f32(6.f);
- const auto const_0_f32 = svdup_n_f32(0.f);
- const auto const_3_f32 = svdup_n_f32(3.f);
- const auto const_inv_6_f32 = svdup_n_f32(0.166666667f);
-
- // Initialise scale/offset for re-quantization
- bool requant = true;
- if(qi_in.scale == qi_out.scale && qi_in.offset == qi_out.offset)
- {
- requant = false;
- }
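- // Matching input/output quantization makes s == 1 and o == 0, so the
- // requantization multiply-add can be skipped entirely.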
- float s = qi_in.scale / qi_out.scale;
- float o = -qi_in.offset * s + qi_out.offset;
- auto vs = svdup_n_f32(s);
- auto vo = svdup_n_f32(o);
-
- // Initialise scale/offset for re-quantization with int32_t
- const auto voffset_in = svdup_n_s32(qi_in.offset);
- int32_t s_s32 = round(s * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
- int32_t o_s32 = round(o * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
- const auto vs_s32 = svdup_n_s32(s_s32);
- const auto vo_s32 = svdup_n_s32(o_s32);
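- // s and o are held in Q24.8 fixed point (scaled by 1 << 8); the products
- // below are rebased with an arithmetic shift right by 8.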
-
- // Initialise scale/offset for re-quantization for leaky relu
- int32_t s_leaky_s32 = round(s * act_info.a() * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
- int32_t o_leaky_s32 = round((-qi_in.offset * s * act_info.a() + qi_out.offset) * (1 << 8),
- arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
- const auto vs_leaky_s32 = svdup_n_s32(s_leaky_s32);
- const auto vo_leaky_s32 = svdup_n_s32(o_leaky_s32);
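- // For negative lanes LEAKY_RELU multiplies the real value by a, which folds
- // into the requantization constants: s_leaky = s * a and o_leaky = z_out - z_in * s * a.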
-
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
- const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
-
- svuint8_t tmp;
-
- int x = window_start_x;
- svbool_t pg = svwhilelt_b8(x, window_end_x);
- do
- {
- const auto vin = svld1_u8(pg, input_ptr + x);
- if(act == ActivationLayerInfo::ActivationFunction::RELU)
- {
- // Perform activation
- tmp = svmax_u8_z(pg, vconst_0, vin);
- // Re-quantize to new output space
- tmp = requant ? svmla_qasymm8_z(pg, tmp, vs, vo) : tmp;
- }
- else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
- {
- // Perform activation
- tmp = svmin_u8_z(pg, va, svmax_u8_z(pg, vconst_0, vin));
- // Re-quantize to new output space
- tmp = requant ? svmla_qasymm8_z(pg, tmp, vs, vo) : tmp;
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
- {
- // Perform activation
- tmp = svmin_u8_z(pg, va, svmax_u8_z(pg, vb, vin));
- // Re-quantize to new output space
- tmp = svmla_qasymm8_z(pg, tmp, vs, vo);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
- {
- // De-quantize
- const auto vin_deq = svdequantize_z(pg, vin, qi_in);
- // Perform activation
- const svfloat32x4_t tmp_dep =
- {
- { {
- svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 0))))),
- svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 1))))),
- svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 2))))),
- svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 3))))),
- }
- }
- };
- // Re-quantize to new output space
- tmp = svquantize_z(pg, tmp_dep, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::TANH)
- {
- // De-quantize
- const auto vin_deq = svdequantize_z(pg, vin, qi_in);
- // Perform activation
- const svfloat32x4_t tmp_dep =
- {
- { {
- svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 0), vb_f32))),
- svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 1), vb_f32))),
- svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 2), vb_f32))),
- svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 3), vb_f32))),
- }
- }
- };
- // Re-quantize to new output space
- tmp = svquantize_z(pg, tmp_dep, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
- {
- // De-quantize
- const auto vin_deq = svdequantize_z(pg, vin, qi_in);
- // Perform activation
- const svfloat32x4_t tmp_dep =
- {
- { {
- svmul_f32_z(pg, svget4_f32(vin_deq, 0), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 0), const_3_f32))))),
- svmul_f32_z(pg, svget4_f32(vin_deq, 1), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 1), const_3_f32))))),
- svmul_f32_z(pg, svget4_f32(vin_deq, 2), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 2), const_3_f32))))),
- svmul_f32_z(pg, svget4_f32(vin_deq, 3), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 3), const_3_f32))))),
- }
- }
- };
- // Re-quantize to new output space
- tmp = svquantize_z(pg, tmp_dep, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
- {
- svbool_t p0, p1, p2, p3;
- svint32x4_t tmp_dep;
-
- // Expand to int32
- const svint32x4_t vin_s32 =
- {
- { {
- svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(vin))),
- svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(vin))),
- svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(vin))),
- svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(vin))),
- }
- }
- };
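- // svmovlb/svmovlt widen the even/odd lanes respectively, so the four
- // int32 vectors hold de-interleaved lanes; the matching bottom/top
- // narrowing at the end of this branch restores the original order.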
-
- // Compare elements to input offset
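- // A quantized value below the input offset dequantizes to a negative real
- // value when the scale is positive, so these predicates mark the lanes
- // that take the leaky slope.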
- if(qi_in.scale >= 0)
- {
- p0 = svcmplt_s32(pg, svget4_s32(vin_s32, 0), voffset_in);
- p1 = svcmplt_s32(pg, svget4_s32(vin_s32, 1), voffset_in);
- p2 = svcmplt_s32(pg, svget4_s32(vin_s32, 2), voffset_in);
- p3 = svcmplt_s32(pg, svget4_s32(vin_s32, 3), voffset_in);
- }
- else
- {
- p0 = svcmpgt_s32(pg, svget4_s32(vin_s32, 0), voffset_in);
- p1 = svcmpgt_s32(pg, svget4_s32(vin_s32, 1), voffset_in);
- p2 = svcmpgt_s32(pg, svget4_s32(vin_s32, 2), voffset_in);
- p3 = svcmpgt_s32(pg, svget4_s32(vin_s32, 3), voffset_in);
- }
-
- // Multiply negative elements and requantize if necessary
- if(requant)
- {
- tmp_dep = svcreate4_s32(
- svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p0, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 0), svsel(p0, vs_leaky_s32, vs_s32)), 8),
- svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p1, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 1), svsel(p1, vs_leaky_s32, vs_s32)), 8),
- svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p2, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 2), svsel(p2, vs_leaky_s32, vs_s32)), 8),
- svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p3, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 3), svsel(p3, vs_leaky_s32, vs_s32)), 8));
- }
- else
- {
- tmp_dep = svcreate4_s32(
- svasr_n_s32_m(p0, svmad_s32_m(p0, svget4_s32(vin_s32, 0), vs_leaky_s32, vo_leaky_s32), 8),
- svasr_n_s32_m(p1, svmad_s32_m(p1, svget4_s32(vin_s32, 1), vs_leaky_s32, vo_leaky_s32), 8),
- svasr_n_s32_m(p2, svmad_s32_m(p2, svget4_s32(vin_s32, 2), vs_leaky_s32, vo_leaky_s32), 8),
- svasr_n_s32_m(p3, svmad_s32_m(p3, svget4_s32(vin_s32, 3), vs_leaky_s32, vo_leaky_s32), 8));
- }
-
- // Convert the int32 vectors to uint16 vectors (with unsigned saturation)
- const auto v_low_u16 = svqxtunt_s32(svqxtunb_s32(svget4_s32(tmp_dep, 0)), svget4_s32(tmp_dep, 1));
- const auto v_high_u16 = svqxtunt_s32(svqxtunb_s32(svget4_s32(tmp_dep, 2)), svget4_s32(tmp_dep, 3));
-
- // Convert the uint16 vectors to uint8 vectors (with saturation)
- tmp = svqxtnt_u16(svqxtnb_u16(v_low_u16), v_high_u16);
- }
- else
- {
- ARM_COMPUTE_ERROR("Unsupported activation function");
- }
-
- svst1_u8(pg, output_ptr + x, tmp);
-
- x += svcntb();
- pg = svwhilelt_b8(x, window_end_x);
-
- }
- while(svptest_any(svptrue_b8(), pg));
-
- },
- input, output);
-}
-} // namespace cpu
-} // namespace arm_compute
-#endif /* defined(__ARM_FEATURE_SVE2) */ \ No newline at end of file
diff --git a/src/core/cpu/kernels/activation/sve/qasymm8_signed.cpp b/src/core/cpu/kernels/activation/sve/qasymm8_signed.cpp
deleted file mode 100644
index 989f825eb9..0000000000
--- a/src/core/cpu/kernels/activation/sve/qasymm8_signed.cpp
+++ /dev/null
@@ -1,253 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-
-#include <cmath>
-#include <cstddef>
-
-#if defined(__ARM_FEATURE_SVE2)
-#include "src/core/NEON/SVEAsymm.h"
-#include "src/core/NEON/SVEMath.h"
-#include <arm_sve.h>
-
-namespace arm_compute
-{
-namespace cpu
-{
-void qasymm8_signed_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
-{
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const ActivationLayerInfo::ActivationFunction act = act_info.activation();
-
- Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
- win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input(src, win_collapsed);
- Iterator output(dst, win_collapsed);
-
- const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform();
- const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform();
- const auto va = svdup_n_s8(quantize_qasymm8_signed(act_info.a(), qi_in));
- const auto vb = svdup_n_s8(quantize_qasymm8_signed(act_info.b(), qi_in));
- const auto const_0 = quantize_qasymm8_signed(0.f, qi_in);
- const auto vconst_0 = svdup_n_s8(const_0);
- const auto vconst_1 = svdup_n_f32(1.f);
- const auto va_f32 = svdup_n_f32(act_info.a());
- const auto vb_f32 = svdup_n_f32(act_info.b());
- const auto const_6_f32 = svdup_n_f32(6.f);
- const auto const_0_f32 = svdup_n_f32(0.f);
- const auto const_3_f32 = svdup_n_f32(3.f);
- const auto const_inv_6_f32 = svdup_n_f32(0.166666667f);
-
- // Initialise scale/offset for re-quantization
- bool requant = true;
- if(qi_in.scale == qi_out.scale && qi_in.offset == qi_out.offset)
- {
- requant = false;
- }
- float s = qi_in.scale / qi_out.scale;
- float o = -qi_in.offset * s + qi_out.offset;
- auto vs = svdup_n_f32(s);
- auto vo = svdup_n_f32(o);
-
- // Initialise scale/offset for re-quantization with int32_t
- const auto voffset_in = svdup_n_s32(qi_in.offset);
- int32_t s_s32 = round(s * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
- int32_t o_s32 = round(o * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
- const auto vs_s32 = svdup_n_s32(s_s32);
- const auto vo_s32 = svdup_n_s32(o_s32);
-
- // Initialise scale/offset for re-quantization for leaky relu
- int32_t s_leaky_s32 = round(s * act_info.a() * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
- int32_t o_leaky_s32 = round((-qi_in.offset * s * act_info.a() + qi_out.offset) * (1 << 8),
- arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
- const auto vs_leaky_s32 = svdup_n_s32(s_leaky_s32);
- const auto vo_leaky_s32 = svdup_n_s32(o_leaky_s32);
-
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const int8_t *>(input.ptr());
- const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
-
- svint8_t tmp;
-
- int x = window_start_x;
- svbool_t pg = svwhilelt_b8(x, window_end_x);
- do
- {
- const auto vin = svld1_s8(pg, input_ptr + x);
- if(act == ActivationLayerInfo::ActivationFunction::RELU)
- {
- // Perform activation
- tmp = svmax_s8_z(pg, vconst_0, vin);
- // Re-quantize to new output space
- tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp;
- }
- else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
- {
- // Perform activation
- tmp = svmin_s8_z(pg, va, svmax_s8_z(pg, vconst_0, vin));
- // Re-quantize to new output space
- tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp;
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
- {
- // Perform activation
- tmp = svmin_s8_z(pg, va, svmax_s8_z(pg, vb, vin));
- // Re-quantize to new output space
- tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp;
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
- {
- // De-quantize
- const auto vin_deq = svdequantize_z(pg, vin, qi_in);
- // Perform activation
- const svfloat32x4_t tmp_dep =
- {
- { {
- svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 0))))),
- svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 1))))),
- svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 2))))),
- svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 3))))),
- }
- }
- };
- // Re-quantize to new output space
- tmp = svquantize_signed_z(pg, tmp_dep, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::TANH)
- {
- // De-quantize
- const auto vin_deq = svdequantize_z(pg, vin, qi_in);
- // Perform activation
- const svfloat32x4_t tmp_dep =
- {
- { {
- svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 0), vb_f32))),
- svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 1), vb_f32))),
- svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 2), vb_f32))),
- svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 3), vb_f32))),
- }
- }
- };
- // Re-quantize to new output space
- tmp = svquantize_signed_z(pg, tmp_dep, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
- {
- // De-quantize
- const auto vin_deq = svdequantize_z(pg, vin, qi_in);
- // Perform activation
- const svfloat32x4_t tmp_dep =
- {
- { {
- svmul_f32_z(pg, svget4_f32(vin_deq, 0), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 0), const_3_f32))))),
- svmul_f32_z(pg, svget4_f32(vin_deq, 1), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 1), const_3_f32))))),
- svmul_f32_z(pg, svget4_f32(vin_deq, 2), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 2), const_3_f32))))),
- svmul_f32_z(pg, svget4_f32(vin_deq, 3), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 3), const_3_f32))))),
- }
- }
- };
- // Re-quantize to new output space
- tmp = svquantize_signed_z(pg, tmp_dep, qi_out);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
- {
- svbool_t p0, p1, p2, p3;
- svint32x4_t tmp_dep;
-
- // Expand to int32
- const svint32x4_t vin_s32 =
- {
- { {
- svmovlb_s32(svmovlb_s16(vin)),
- svmovlt_s32(svmovlb_s16(vin)),
- svmovlb_s32(svmovlt_s16(vin)),
- svmovlt_s32(svmovlt_s16(vin)),
- }
- }
- };
-
- // Compare elements to input offset
- if(qi_in.scale >= 0)
- {
- p0 = svcmplt_s32(pg, svget4_s32(vin_s32, 0), voffset_in);
- p1 = svcmplt_s32(pg, svget4_s32(vin_s32, 1), voffset_in);
- p2 = svcmplt_s32(pg, svget4_s32(vin_s32, 2), voffset_in);
- p3 = svcmplt_s32(pg, svget4_s32(vin_s32, 3), voffset_in);
- }
- else
- {
- p0 = svcmpgt_s32(pg, svget4_s32(vin_s32, 0), voffset_in);
- p1 = svcmpgt_s32(pg, svget4_s32(vin_s32, 1), voffset_in);
- p2 = svcmpgt_s32(pg, svget4_s32(vin_s32, 2), voffset_in);
- p3 = svcmpgt_s32(pg, svget4_s32(vin_s32, 3), voffset_in);
- }
-
- // Multiply negative elements and requantize if necessary
- if(requant)
- {
- tmp_dep = svcreate4_s32(
- svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p0, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 0), svsel(p0, vs_leaky_s32, vs_s32)), 8),
- svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p1, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 1), svsel(p1, vs_leaky_s32, vs_s32)), 8),
- svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p2, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 2), svsel(p2, vs_leaky_s32, vs_s32)), 8),
- svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p3, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 3), svsel(p3, vs_leaky_s32, vs_s32)), 8));
- }
- else
- {
- tmp_dep = svcreate4_s32(
- svasr_n_s32_m(p0, svmad_s32_m(p0, svget4_s32(vin_s32, 0), vs_leaky_s32, vo_leaky_s32), 8),
- svasr_n_s32_m(p1, svmad_s32_m(p1, svget4_s32(vin_s32, 1), vs_leaky_s32, vo_leaky_s32), 8),
- svasr_n_s32_m(p2, svmad_s32_m(p2, svget4_s32(vin_s32, 2), vs_leaky_s32, vo_leaky_s32), 8),
- svasr_n_s32_m(p3, svmad_s32_m(p3, svget4_s32(vin_s32, 3), vs_leaky_s32, vo_leaky_s32), 8));
- }
-
- // Convert int32 vectors to int16 vectors (with saturation)
- const auto v_low_s16 = svqxtnt_s32(svqxtnb_s32(svget4_s32(tmp_dep, 0)), svget4_s32(tmp_dep, 1));
- const auto v_high_s16 = svqxtnt_s32(svqxtnb_s32(svget4_s32(tmp_dep, 2)), svget4_s32(tmp_dep, 3));
-
- // Convert int16 vectors to int8 vectors (with saturation)
- tmp = svqxtnt_s16(svqxtnb_s16(v_low_s16), v_high_s16);
- }
- else
- {
- ARM_COMPUTE_ERROR("Unsupported activation function");
- }
-
- svst1_s8(pg, output_ptr + x, tmp);
-
- x += svcntb();
- pg = svwhilelt_b8(x, window_end_x);
-
- }
- while(svptest_any(svptrue_b8(), pg));
- },
- input, output);
-}
-} // namespace cpu
-} // namespace arm_compute
-#endif /* defined(__ARM_FEATURE_SVE2) */
diff --git a/src/core/cpu/kernels/activation/sve/qsymm16.cpp b/src/core/cpu/kernels/activation/sve/qsymm16.cpp
deleted file mode 100644
index 66974875da..0000000000
--- a/src/core/cpu/kernels/activation/sve/qsymm16.cpp
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensorPack.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/experimental/Types.h"
-
-#include <cmath>
-#include <cstddef>
-
-#if defined(__ARM_FEATURE_SVE2)
-#include "src/core/NEON/SVEMath.h"
-#include "src/core/NEON/SVESymm.h"
-#include <arm_sve.h>
-
-namespace arm_compute
-{
-namespace cpu
-{
-void qsymm16_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
-{
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const ActivationLayerInfo::ActivationFunction act = act_info.activation();
-
- Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
- win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input(src, win_collapsed);
- Iterator output(dst, win_collapsed);
-
- const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform();
- const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform();
- const auto vconst_1 = svdup_n_f32(1.f);
- const auto va_f32 = svdup_n_f32(act_info.a());
- const auto vb_f32 = svdup_n_f32(act_info.b());
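- // QSYMM16 is symmetric quantization (zero offset), so only the scales are needed
- // to move between the int16 and float domains.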
-
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const int16_t *>(input.ptr());
- const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
-
- svint16_t tmp;
-
- int x = window_start_x;
- svbool_t pg = svwhilelt_b16(x, window_end_x);
- do
- {
- const auto vin = svld1_s16(pg, input_ptr + x);
- if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
- {
- // De-quantize
- auto vin_deq = svdequantize_qsymm16_z(pg, vin, qi_in.scale);
- // Perform activation
- const svfloat32x2_t tmp_dep =
- {
- { {
- svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget2_f32(vin_deq, 0))))),
- svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget2_f32(vin_deq, 1))))),
- }
- }
- };
- // Re-quantize to new output space
- tmp = svquantize_qsymm16_z(pg, tmp_dep, qi_out.scale);
- }
- else if(act == ActivationLayerInfo::ActivationFunction::TANH)
- {
- // De-quantize
- auto vin_deq = svdequantize_qsymm16_z(pg, vin, qi_in.scale);
- // Perform activation
- const svfloat32x2_t tmp_dep =
- {
- { {
- svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget2_f32(vin_deq, 0), vb_f32))),
- svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget2_f32(vin_deq, 1), vb_f32))),
- }
- }
- };
- // Re-quantize to new output space
- tmp = svquantize_qsymm16_z(pg, tmp_dep, qi_out.scale);
- }
- else
- {
- ARM_COMPUTE_ERROR("Unsupported activation function");
- }
-
- svst1_s16(pg, output_ptr + x, tmp);
-
- x += svcnth();
- pg = svwhilelt_b16(x, window_end_x);
-
- }
- while(svptest_any(svptrue_b16(), pg));
- },
- input, output);
-}
-} // namespace cpu
-} // namespace arm_compute
-#endif /* defined(__ARM_FEATURE_SVE2) */
diff --git a/src/core/cpu/kernels/add/neon/integer.cpp b/src/core/cpu/kernels/add/neon/integer.cpp
deleted file mode 100644
index 24a0ac3b7c..0000000000
--- a/src/core/cpu/kernels/add/neon/integer.cpp
+++ /dev/null
@@ -1,170 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void add_u8_u8_s16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
- // Create input windows
- Window win = window;
- Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle manually
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(src0, input1_win);
- Iterator input2(src1, input2_win);
- Iterator output(dst, win);
-
- const int window_step_x = 8;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
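- // Both u8 inputs are widened to s16 before adding; since 255 + 255 = 510 always fits
- // in int16_t, the WRAP and SATURATE paths produce identical results for this kernel.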
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
-
- if(policy == ConvertPolicy::WRAP)
- {
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto vin1 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input1_ptr + x)));
- const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
- wrapper::vstore(output_ptr + x, wrapper::vadd(vin1, vin2));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(output_ptr + x) = static_cast<int16_t>(*(input1_ptr + x)) + static_cast<int16_t>(*(input2_ptr + x));
- }
- }
- else
- {
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto vin1 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input1_ptr + x)));
- const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
- wrapper::vstore(output_ptr + x, wrapper::vqadd(vin1, vin2));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(output_ptr + x) = wrapper::add_sat(static_cast<int16_t>(*(input1_ptr + x)),
- static_cast<int16_t>(*(input2_ptr + x)));
- }
- }
- },
- input1, input2, output);
-}
-
-void add_s16_u8_s16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
- // Create input windows
- Window win = window;
- Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle manually
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(src0, input1_win);
- Iterator input2(src1, input2_win);
- Iterator output(dst, win);
-
- const int window_step_x = 8;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
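- // Unlike the u8 + u8 case, an s16 input near INT16_MAX plus a widened u8 value can
- // overflow int16_t, so the SATURATE branch below uses vqadd/add_sat to clamp.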
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
-
- if(policy == ConvertPolicy::WRAP)
- {
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto vin1 = wrapper::vloadq(input1_ptr + x);
- const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
- wrapper::vstore(output_ptr + x, wrapper::vadd(vin1, vin2));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(output_ptr + x) = *(input1_ptr + x) + static_cast<int16_t>(*(input2_ptr + x));
- }
- }
- else
- {
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto vin1 = wrapper::vloadq(input1_ptr + x);
- const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
- wrapper::vstore(output_ptr + x, wrapper::vqadd(vin1, vin2));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(output_ptr + x) = wrapper::add_sat(*(input1_ptr + x), static_cast<int16_t>(*(input2_ptr + x)));
- }
- }
- },
- input1, input2, output);
-}
-
-void add_u8_s16_s16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
- // Simply swap the two input buffers:
- add_s16_u8_s16_neon(src1, src0, dst, policy, window);
-}
-} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/cpu/kernels/add/neon/list.h b/src/core/cpu/kernels/add/neon/list.h
deleted file mode 100644
index 3ab03dd40e..0000000000
--- a/src/core/cpu/kernels/add/neon/list.h
+++ /dev/null
@@ -1,146 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_CORE_NEON_KERNELS_ADD_LIST_H
-#define SRC_CORE_NEON_KERNELS_ADD_LIST_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-#define DECLARE_ADD_KERNEL(func_name) \
- void func_name(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-
-DECLARE_ADD_KERNEL(add_qasymm8_neon);
-DECLARE_ADD_KERNEL(add_qasymm8_signed_neon);
-DECLARE_ADD_KERNEL(add_qsymm16_neon);
-DECLARE_ADD_KERNEL(add_s16_u8_s16_neon);
-DECLARE_ADD_KERNEL(add_u8_s16_s16_neon);
-DECLARE_ADD_KERNEL(add_u8_u8_s16_neon);
-
-#undef DECLARE_ADD_KERNEL
-
-template <typename ScalarType>
-void add_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
- /** SIMD vector tag type. */
- using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<ScalarType, wrapper::traits::BitWidth::W128>;
-
- // Create input windows
- Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle manually
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- constexpr int window_step_x = 16 / sizeof(ScalarType);
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
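- // window_step_x is the lane count of a 128-bit vector (4 for float, 8 for int16_t);
- // the explicit broadcast path below only handles broadcasting along x, the innermost dimension.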
-
- if(is_broadcast_across_x)
- {
- const bool is_broadcast_input_2 = input2_win.x().step() == 0;
- Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
- Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
- const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
- const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
-
- // Clear X Dimension on execution window as we handle manually
- non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator broadcast_input(broadcast_tensor, broadcast_win);
- Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
- Iterator output(dst, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const ScalarType *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
-
- const ScalarType broadcast_value = *reinterpret_cast<const ScalarType *>(broadcast_input.ptr());
- const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
-
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
- const auto res = (policy == ConvertPolicy::SATURATE) ? wrapper::vqadd(broadcast_value_vec, non_broadcast_v) : wrapper::vadd(broadcast_value_vec, non_broadcast_v);
- wrapper::vstore(output_ptr + x, res);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
- *(output_ptr + x) = (policy == ConvertPolicy::SATURATE) ? wrapper::add_sat(broadcast_value, non_broadcast_v) : broadcast_value + non_broadcast_v;
- }
- },
- broadcast_input, non_broadcast_input, output);
- }
- else
- {
- // Clear X Dimension on execution window as we handle manually
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(src0, input1_win);
- Iterator input2(src1, input2_win);
- Iterator output(dst, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
-
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto val1 = wrapper::vloadq(input1_ptr + x);
- const auto val2 = wrapper::vloadq(input2_ptr + x);
- const auto res = (policy == ConvertPolicy::SATURATE) ? wrapper::vqadd(val1, val2) : wrapper::vadd(val1, val2);
- wrapper::vstore(output_ptr + x, res);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const auto val1 = *(input1_ptr + x);
- const auto val2 = *(input2_ptr + x);
- *(output_ptr + x) = (policy == ConvertPolicy::SATURATE) ? wrapper::add_sat(val1, val2) : val1 + val2;
- }
- },
- input1, input2, output);
- }
-}
-} // namespace cpu
-} // namespace arm_compute
-#endif // SRC_CORE_NEON_KERNELS_ADD_LIST_H
diff --git a/src/core/cpu/kernels/add/neon/qasymm8.cpp b/src/core/cpu/kernels/add/neon/qasymm8.cpp
deleted file mode 100644
index e357a7ef7f..0000000000
--- a/src/core/cpu/kernels/add/neon/qasymm8.cpp
+++ /dev/null
@@ -1,209 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void add_qasymm8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
- ARM_COMPUTE_UNUSED(policy);
-
- // Create input windows
- Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle manually
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- const int window_step_x = 16;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
-
- const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform();
- const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform();
-
- const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale);
- const float32x4_t voffseto = vdupq_n_f32(oq_info.offset);
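- // Both inputs are dequantized to float and added, then the sum is requantized:
- // q_out = round(((a - offset1) * scale1 + (b - offset2) * scale2) / scale_out + offset_out)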
-
- if(is_broadcast_across_x)
- {
- const bool is_broadcast_input_2 = input2_win.x().step() == 0;
- Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
- Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
- const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
- const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
- const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();
- const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
-
- const float32x4_t vscale1 = is_broadcast_input_2 ? vdupq_n_f32(iq1_info.scale) : vdupq_n_f32(iq2_info.scale);
- const float32x4_t vscale2 = is_broadcast_input_2 ? vdupq_n_f32(iq2_info.scale) : vdupq_n_f32(iq1_info.scale);
- const int32x4_t voffset1 = is_broadcast_input_2 ? vdupq_n_s32(iq1_info.offset) : vdupq_n_s32(iq2_info.offset);
- const int32x4_t voffset2 = is_broadcast_input_2 ? vdupq_n_s32(iq2_info.offset) : vdupq_n_s32(iq1_info.offset);
-
- // Clear X Dimension on execution window as we handle manually
- non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator broadcast_input(broadcast_tensor, broadcast_win);
- Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
- Iterator output(dst, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
-
- const uint8_t broadcast_value = *reinterpret_cast<const uint8_t *>(broadcast_input.ptr());
- const uint8x16_t broadcast_value_vec = vdupq_n_u8(broadcast_value);
-
- const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(broadcast_value_vec))))), voffset2)), vscale2);
- const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(broadcast_value_vec))))), voffset2)), vscale2);
- const auto bf_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(broadcast_value_vec))))), voffset2)), vscale2);
- const auto bf_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(broadcast_value_vec))))), voffset2)), vscale2);
-
- const float bfs = static_cast<int32_t>(broadcast_value - broadcast_qinfo.offset) * broadcast_qinfo.scale;
-
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const uint8x16_t a = vld1q_u8(non_broadcast_input_ptr + x);
- const auto af_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1);
- const auto af_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1);
- const auto af_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1);
- const auto af_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1);
-
- int32x4_t rf_0{};
- int32x4_t rf_1{};
- int32x4_t rf_2{};
- int32x4_t rf_3{};
-
-#ifdef __aarch64__
- rf_0 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo));
- rf_1 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo));
- rf_2 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo));
- rf_3 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo));
-#else //__aarch64__
- rf_0 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo));
- rf_1 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo));
- rf_2 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo));
- rf_3 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo));
-#endif //__aarch64__
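- // vcvtnq_s32_f32 rounds to nearest (AArch64 only); the AArch32 fallback
- // vcvtq_s32_f32 truncates toward zero, so the two paths can differ by one step.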
-
- const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)));
- const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3)));
- vst1q_u8(output_ptr + x, vcombine_u8(pa, pb));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const float afs = static_cast<int32_t>(*(non_broadcast_input_ptr + x) - non_broadcast_qinfo.offset) * non_broadcast_qinfo.scale;
- *(output_ptr + x) = quantize_qasymm8((afs + bfs), oq_info);
- }
- },
- broadcast_input, non_broadcast_input, output);
- }
- else
- {
- // Clear X Dimension on execution window as we handle manually
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(src0, input1_win);
- Iterator input2(src1, input2_win);
- Iterator output(dst, win);
-
- const float32x4_t vscale1 = vdupq_n_f32(iq1_info.scale);
- const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale);
- const int32x4_t voffset1 = vdupq_n_s32(iq1_info.offset);
- const int32x4_t voffset2 = vdupq_n_s32(iq2_info.offset);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
-
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const uint8x16_t a = vld1q_u8(input1_ptr + x);
- const uint8x16_t b = vld1q_u8(input2_ptr + x);
-
- const auto af_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1);
- const auto af_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1);
- const auto af_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1);
- const auto af_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1);
-
- const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(b))))), voffset2)), vscale2);
- const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(b))))), voffset2)), vscale2);
- const auto bf_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(b))))), voffset2)), vscale2);
- const auto bf_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(b))))), voffset2)), vscale2);
-
- int32x4_t rf_0{};
- int32x4_t rf_1{};
- int32x4_t rf_2{};
- int32x4_t rf_3{};
-
-#ifdef __aarch64__
- rf_0 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo));
- rf_1 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo));
- rf_2 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo));
- rf_3 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo));
-#else //__aarch64__
- rf_0 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo));
- rf_1 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo));
- rf_2 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo));
- rf_3 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo));
-#endif //__aarch64__
-
- const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)));
- const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3)));
- vst1q_u8(output_ptr + x, vcombine_u8(pa, pb));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const float afs = static_cast<int32_t>((*(input1_ptr + x)) - iq1_info.offset) * iq1_info.scale;
- const float bfs = static_cast<int32_t>((*(input2_ptr + x)) - iq2_info.offset) * iq2_info.scale;
- *(output_ptr + x) = quantize_qasymm8((afs + bfs), oq_info);
- }
- },
- input1, input2, output);
- }
-}
-} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/cpu/kernels/add/neon/qasymm8_signed.cpp b/src/core/cpu/kernels/add/neon/qasymm8_signed.cpp
deleted file mode 100644
index d62d0739f5..0000000000
--- a/src/core/cpu/kernels/add/neon/qasymm8_signed.cpp
+++ /dev/null
@@ -1,208 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void add_qasymm8_signed_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
- ARM_COMPUTE_UNUSED(policy);
-
- // Create input windows
- Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle manually
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- const int window_step_x = 16;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
-
- const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform();
- const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform();
-
- const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale);
- const float32x4_t voffseto = vdupq_n_f32(oq_info.offset);
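- // Same dequantize-add-requantize scheme as the unsigned kernel, but using s8
- // widening (vmovl_s8) and signed saturating narrowing (vqmovn_s16) instead.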
-
- if(is_broadcast_across_x)
- {
- const bool is_broadcast_input_2 = input2_win.x().step() == 0;
- Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
- Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
- const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
- const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
- const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();
- const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
-
- const float32x4_t vscale1 = is_broadcast_input_2 ? vdupq_n_f32(iq1_info.scale) : vdupq_n_f32(iq2_info.scale);
- const float32x4_t vscale2 = is_broadcast_input_2 ? vdupq_n_f32(iq2_info.scale) : vdupq_n_f32(iq1_info.scale);
- const int32x4_t voffset1 = is_broadcast_input_2 ? vdupq_n_s32(iq1_info.offset) : vdupq_n_s32(iq2_info.offset);
- const int32x4_t voffset2 = is_broadcast_input_2 ? vdupq_n_s32(iq2_info.offset) : vdupq_n_s32(iq1_info.offset);
-
- // Clear X Dimension on execution window as we handle manually
- non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator broadcast_input(broadcast_tensor, broadcast_win);
- Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
- Iterator output(dst, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
-
- const int8_t broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr());
- const int8x16_t broadcast_value_vec = vdupq_n_s8(broadcast_value);
-
- const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(broadcast_value_vec)))), voffset2)), vscale2);
- const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(broadcast_value_vec)))), voffset2)), vscale2);
- const auto bf_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(broadcast_value_vec)))), voffset2)), vscale2);
- const auto bf_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(broadcast_value_vec)))), voffset2)), vscale2);
- const float bfs = static_cast<int32_t>(broadcast_value - broadcast_qinfo.offset) * broadcast_qinfo.scale;
-
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const int8x16_t a = vld1q_s8(non_broadcast_input_ptr + x);
-
- const auto af_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(a)))), voffset1)), vscale1);
- const auto af_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(a)))), voffset1)), vscale1);
- const auto af_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(a)))), voffset1)), vscale1);
- const auto af_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(a)))), voffset1)), vscale1);
-
- int32x4_t rf_0{};
- int32x4_t rf_1{};
- int32x4_t rf_2{};
- int32x4_t rf_3{};
-
-#ifdef __aarch64__
- rf_0 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo));
- rf_1 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo));
- rf_2 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo));
- rf_3 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo));
-#else //__aarch64__
- rf_0 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo));
- rf_1 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo));
- rf_2 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo));
- rf_3 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo));
-#endif //__aarch64__
-
- const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)));
- const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3)));
- vst1q_s8(output_ptr + x, vcombine_s8(pa, pb));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const float afs = static_cast<int32_t>(*(non_broadcast_input_ptr + x) - non_broadcast_qinfo.offset) * non_broadcast_qinfo.scale;
- *(output_ptr + x) = quantize_qasymm8_signed((afs + bfs), oq_info);
- }
- },
- broadcast_input, non_broadcast_input, output);
- }
- else
- {
- // Clear X Dimension on execution window as we handle manually
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(src0, input1_win);
- Iterator input2(src1, input2_win);
- Iterator output(dst, win);
-
- const float32x4_t vscale1 = vdupq_n_f32(iq1_info.scale);
- const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale);
- const int32x4_t voffset1 = vdupq_n_s32(iq1_info.offset);
- const int32x4_t voffset2 = vdupq_n_s32(iq2_info.offset);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
-
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const int8x16_t a = vld1q_s8(input1_ptr + x);
- const int8x16_t b = vld1q_s8(input2_ptr + x);
-
- const auto af_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(a)))), voffset1)), vscale1);
- const auto af_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(a)))), voffset1)), vscale1);
- const auto af_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(a)))), voffset1)), vscale1);
- const auto af_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(a)))), voffset1)), vscale1);
-
- const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(b)))), voffset2)), vscale2);
- const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(b)))), voffset2)), vscale2);
- const auto bf_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(b)))), voffset2)), vscale2);
- const auto bf_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(b)))), voffset2)), vscale2);
-
- int32x4_t rf_0{};
- int32x4_t rf_1{};
- int32x4_t rf_2{};
- int32x4_t rf_3{};
-
-#ifdef __aarch64__
- rf_0 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo));
- rf_1 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo));
- rf_2 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo));
- rf_3 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo));
-#else //__aarch64__
- rf_0 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo));
- rf_1 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo));
- rf_2 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo));
- rf_3 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo));
-#endif //__aarch64__
-
- const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)));
- const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3)));
- vst1q_s8(output_ptr + x, vcombine_s8(pa, pb));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const float afs = static_cast<int32_t>((*(input1_ptr + x)) - iq1_info.offset) * iq1_info.scale;
- const float bfs = static_cast<int32_t>((*(input2_ptr + x)) - iq2_info.offset) * iq2_info.scale;
- *(output_ptr + x) = quantize_qasymm8_signed((afs + bfs), oq_info);
- }
- },
- input1, input2, output);
- }
-}
-} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/cpu/kernels/add/neon/qsymm16.cpp b/src/core/cpu/kernels/add/neon/qsymm16.cpp
deleted file mode 100644
index e76e408d6e..0000000000
--- a/src/core/cpu/kernels/add/neon/qsymm16.cpp
+++ /dev/null
@@ -1,174 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void add_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
- ARM_COMPUTE_UNUSED(policy);
-
- // Create input windows
- Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle manually
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- const int window_step_x = 8;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
-
- const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform();
- const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform();
-
- const float32x4_t vscale1 = vdupq_n_f32(iq1_info.scale);
- const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale);
- const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale);
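- // Symmetric quantization has no offsets, so requantization reduces to a pure
- // rescale: q_out = round((a * scale1 + b * scale2) / scale_out)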
-
- if(is_broadcast_across_x)
- {
- const bool is_broadcast_input_2 = input2_win.x().step() == 0;
- Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
- Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
- const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
- const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
- const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();
- const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
-
- // Clear X Dimension on execution window as we handle manually
- non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator broadcast_input(broadcast_tensor, broadcast_win);
- Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
- Iterator output(dst, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const int16_t *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
-
- const int16_t broadcast_value = *reinterpret_cast<const int16_t *>(broadcast_input.ptr());
- const int16x8_t broadcast_value_vec = vdupq_n_s16(broadcast_value);
-
- const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(broadcast_value_vec))), vscale2);
- const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(broadcast_value_vec))), vscale2);
- const float bfs = static_cast<int32_t>(broadcast_value) * broadcast_qinfo.scale;
-
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const int16x8_t a = vld1q_s16(non_broadcast_input_ptr + x);
- const auto af_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1);
- const auto af_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1);
-
- int32x4_t rf_0{};
- int32x4_t rf_1{};
-#ifdef __aarch64__
- rf_0 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo));
- rf_1 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo));
-#else //__aarch64__
- rf_0 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo));
- rf_1 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo));
-#endif //__aarch64__
-
- const int16x8_t pa = vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1));
- vst1q_s16(output_ptr + x, pa);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const float afs = static_cast<int32_t>(*(non_broadcast_input_ptr + x)) * non_broadcast_qinfo.scale;
- *(output_ptr + x) = quantize_qsymm16((afs + bfs), oq_info);
- }
- },
- broadcast_input, non_broadcast_input, output);
- }
- else
- {
- // Clear X Dimension on execution window as we handle manually
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(src0, input1_win);
- Iterator input2(src1, input2_win);
- Iterator output(dst, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
-
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const int16x8_t a = vld1q_s16(input1_ptr + x);
- const int16x8_t b = vld1q_s16(input2_ptr + x);
-
- const auto af_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1);
- const auto af_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1);
- const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(b))), vscale2);
- const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(b))), vscale2);
-
- int32x4_t rf_0{};
- int32x4_t rf_1{};
-#ifdef __aarch64__
- rf_0 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo));
- rf_1 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo));
-#else //__aarch64__
- rf_0 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo));
- rf_1 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo));
-#endif //__aarch64__
-
- const int16x8_t pa = vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1));
- vst1q_s16(output_ptr + x, pa);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const float afs = static_cast<int32_t>((*(input1_ptr + x))) * iq1_info.scale;
- const float bfs = static_cast<int32_t>((*(input2_ptr + x))) * iq2_info.scale;
- *(output_ptr + x) = quantize_qsymm16((afs + bfs), oq_info);
- }
- },
- input1, input2, output);
- }
-}
-} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/cpu/kernels/add/sve/impl.cpp b/src/core/cpu/kernels/add/sve/impl.cpp
deleted file mode 100644
index cf9e301c29..0000000000
--- a/src/core/cpu/kernels/add/sve/impl.cpp
+++ /dev/null
@@ -1,139 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#if defined(__ARM_FEATURE_SVE)
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
-
-#include "src/core/NEON/SVEMath.h"
-#include "src/core/cpu/kernels/add/sve/impl.h"
-#include <arm_sve.h>
-
-namespace arm_compute
-{
-namespace cpu
-{
-template <typename ScalarType>
-void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
- const auto all_true_pg = wrapper::svptrue<ScalarType>();
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
- const bool is_sat = (policy == ConvertPolicy::SATURATE);
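- // SVE loops are predicated: svwhilelt builds a lane mask covering [x, window_end_x),
- // so the vector loop also handles the tail and no scalar remainder loop is needed.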
-
- // Clear X Dimension on execution window as we handle manually
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- // Create input windows
- Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
-
- Iterator input1(src0, window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()));
- Iterator input2(src1, window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()));
- Iterator output(dst, window);
-
- if(is_broadcast_across_x)
- {
- const bool is_broadcast_input_2 = input2_win.x().step() == 0;
- Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
- Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
- const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
- const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
-
- // Clear X Dimension on execution window as we handle manually
- non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator broadcast_input(broadcast_tensor, broadcast_win);
- Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
- Iterator output(dst, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const ScalarType *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
-
- const ScalarType broadcast_value = *reinterpret_cast<const ScalarType *>(broadcast_input.ptr());
- const auto broadcast_value_vec = wrapper::svdup_n(broadcast_value);
-
- int x = window_start_x;
- svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
- do
- {
- const auto non_broadcast_v = svld1(pg, non_broadcast_input_ptr + x);
- auto res = is_sat ? wrapper::svqadd(broadcast_value_vec, non_broadcast_v) : svadd_z(pg, broadcast_value_vec, non_broadcast_v);
- svst1(pg, output_ptr + x, res);
-
- x += wrapper::svcnt<ScalarType>();
- pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
- }
- while(svptest_any(all_true_pg, pg));
- },
- broadcast_input, non_broadcast_input, output);
- }
- else
- {
- // Clear X Dimension on execution window as we handle manually
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(src0, input1_win);
- Iterator input2(src1, input2_win);
- Iterator output(dst, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
-
- int x = window_start_x;
- svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
- do
- {
- const auto val1 = svld1(pg, input1_ptr + x);
- const auto val2 = svld1(pg, input2_ptr + x);
- const auto res = is_sat ? wrapper::svqadd(val1, val2) : svadd_z(pg, val1, val2);
- svst1(pg, output_ptr + x, res);
-
- x += wrapper::svcnt<ScalarType>();
- pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
- }
- while(svptest_any(all_true_pg, pg));
- },
- input1, input2, output);
- }
-}
-
-template void add_same_sve<float>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
-template void add_same_sve<float16_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
-template void add_same_sve<uint8_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
-template void add_same_sve<int16_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
-template void add_same_sve<int32_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
-} // namespace cpu
-} // namespace arm_compute
-#endif /* defined(__ARM_FEATURE_SVE) */ \ No newline at end of file
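
The kernels in this file all share one SVE idiom: process a full vector per iteration and let svwhilelt/svptest_any mask and terminate the tail, so no scalar epilogue is needed. A minimal standalone sketch of that idiom (assuming an SVE-enabled toolchain, e.g. -march=armv8-a+sve; plain intrinsics rather than the ACL wrapper layer used above):

    #include <arm_sve.h>

    // Predicated-loop sketch: the predicate masks the final partial vector,
    // and an all-false predicate terminates the loop.
    void add_f32(const float *a, const float *b, float *dst, int n)
    {
        int      x  = 0;
        svbool_t pg = svwhilelt_b32(x, n); // lane i is active while x + i < n
        do
        {
            const svfloat32_t va = svld1_f32(pg, a + x);
            const svfloat32_t vb = svld1_f32(pg, b + x);
            svst1_f32(pg, dst + x, svadd_f32_z(pg, va, vb)); // inactive lanes are not stored
            x += static_cast<int>(svcntw());                 // advance by the vector length in words
            pg = svwhilelt_b32(x, n);
        }
        while(svptest_any(svptrue_b32(), pg));
    }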
diff --git a/src/core/cpu/kernels/add/sve/impl.h b/src/core/cpu/kernels/add/sve/impl.h
deleted file mode 100644
index c38b1d47e0..0000000000
--- a/src/core/cpu/kernels/add/sve/impl.h
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_CORE_SVE_KERNELS_ADD_IMPL_H
-#define SRC_CORE_SVE_KERNELS_ADD_IMPL_H
-
-#if defined(ENABLE_SVE)
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-template <typename ScalarType>
-void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
-} // namespace cpu
-} // namespace arm_compute
-#endif // defined(ENABLE_SVE)
-#endif // SRC_CORE_SVE_KERNELS_ADD_IMPL_H \ No newline at end of file
diff --git a/src/core/cpu/kernels/add/sve/integer.cpp b/src/core/cpu/kernels/add/sve/integer.cpp
deleted file mode 100644
index bd8179205b..0000000000
--- a/src/core/cpu/kernels/add/sve/integer.cpp
+++ /dev/null
@@ -1,201 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#if defined(__ARM_FEATURE_SVE)
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/SVEMath.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
-#include <arm_sve.h>
-
-namespace arm_compute
-{
-namespace cpu
-{
-void add_u8_u8_s16_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
- // Create input windows
- Window win = window;
- Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle manually
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(src0, input1_win);
- Iterator input2(src1, input2_win);
- Iterator output(dst, win);
-
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const auto all_true_pg = svptrue_b8();
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
-
-        // ConvertPolicy makes no difference here: both operands are widened from
-        // u8 to s16 before the add, so the sum (at most 510) cannot overflow s16
-        // and wrapping and saturating addition produce identical results.
-        ARM_COMPUTE_UNUSED(policy);
-
-        int      x    = window_start_x;
-        svbool_t pg_u = svwhilelt_b8(x, window_end_x);
-        svbool_t pg_0 = svwhilelt_b16(x, window_end_x);
-        svbool_t pg_1 = svwhilelt_b16(x + static_cast<int>(svcnth()), window_end_x);
-        do
-        {
-            const auto vsrc0 = svld1(pg_u, input1_ptr + x);
-            const auto vsrc1 = svld1(pg_u, input2_ptr + x);
-
-            const auto vsrc0_lo = svreinterpret_s16_u16(svunpklo(vsrc0));
-            const auto vsrc0_hi = svreinterpret_s16_u16(svunpkhi(vsrc0));
-            const auto vsrc1_lo = svreinterpret_s16_u16(svunpklo(vsrc1));
-            const auto vsrc1_hi = svreinterpret_s16_u16(svunpkhi(vsrc1));
-            svst1(pg_0, output_ptr + x, svqadd(vsrc0_lo, vsrc1_lo));
-            svst1(pg_1, output_ptr + x + svcnth(), svqadd(vsrc0_hi, vsrc1_hi));
-
-            x += svcntb();
-            pg_u = svwhilelt_b8(x, window_end_x);
-            pg_0 = svwhilelt_b16(x, window_end_x);
-            pg_1 = svwhilelt_b16(x + static_cast<int>(svcnth()), window_end_x);
-        }
-        while(svptest_any(all_true_pg, pg_u));
- },
- input1, input2, output);
-}
-
-void add_s16_u8_s16_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
- // Create input windows
- Window win = window;
- Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle manually
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(src0, input1_win);
- Iterator input2(src1, input2_win);
- Iterator output(dst, win);
-
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const auto all_true_pg = svptrue_b8();
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
-
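-        // Three predicates are maintained: pg_u guards the u8 loads, while pg_0 and
-        // pg_1 guard the low and high s16 half-vectors (svcnth() elements each)
-        // produced by widening, since one u8 vector unpacks to two s16 vectors.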
- if(policy == ConvertPolicy::WRAP)
- {
- int x = window_start_x;
- svbool_t pg_u = svwhilelt_b8(x, window_end_x);
- svbool_t pg_0 = svwhilelt_b16(x, window_end_x);
- svbool_t pg_1 = svwhilelt_b16(x + static_cast<int>(svcnth()), window_end_x);
- do
- {
- const auto vsrc0_0 = svld1_s16(pg_0, input1_ptr + x);
- const auto vsrc0_1 = svld1_s16(pg_1, input1_ptr + x + svcnth());
- const auto vsrc1_u8 = svld1_u8(pg_u, input2_ptr + x);
- const auto vsrc1_0 = svreinterpret_s16_u16(svunpklo(vsrc1_u8));
- const auto vsrc1_1 = svreinterpret_s16_u16(svunpkhi(vsrc1_u8));
- svst1_s16(pg_0, output_ptr + x, svadd_s16_z(pg_0, vsrc0_0, vsrc1_0));
- svst1_s16(pg_1, output_ptr + x + svcnth(), svadd_s16_z(pg_1, vsrc0_1, vsrc1_1));
-
- x += svcntb();
- pg_u = svwhilelt_b8(x, window_end_x);
- pg_0 = svwhilelt_b16(x, window_end_x);
- pg_1 = svwhilelt_b16(x + static_cast<int>(svcnth()), window_end_x);
- }
- while(svptest_any(all_true_pg, pg_u));
- }
- else
- {
- int x = window_start_x;
- svbool_t pg_u = svwhilelt_b8(x, window_end_x);
- svbool_t pg_0 = svwhilelt_b16(x, window_end_x);
- svbool_t pg_1 = svwhilelt_b16(x + static_cast<int>(svcnth()), window_end_x);
- do
- {
- const auto vsrc0_0 = svld1_s16(pg_0, input1_ptr + x);
- const auto vsrc0_1 = svld1_s16(pg_1, input1_ptr + x + svcnth());
- const auto vsrc1_u8 = svld1_u8(pg_u, input2_ptr + x);
- const auto vsrc1_0 = svreinterpret_s16_u16(svunpklo(vsrc1_u8));
- const auto vsrc1_1 = svreinterpret_s16_u16(svunpkhi(vsrc1_u8));
-
- svst1_s16(pg_0, output_ptr + x, svqadd(vsrc0_0, vsrc1_0));
- svst1_s16(pg_1, output_ptr + x + svcnth(), svqadd(vsrc0_1, vsrc1_1));
-
- x += svcntb();
- pg_u = svwhilelt_b8(x, window_end_x);
- pg_0 = svwhilelt_b16(x, window_end_x);
- pg_1 = svwhilelt_b16(x + static_cast<int>(svcnth()), window_end_x);
- }
- while(svptest_any(all_true_pg, pg_u));
- }
- },
- input1, input2, output);
-}
-
-void add_u8_s16_s16_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
-    // Addition is commutative, so simply swap the two inputs and reuse the s16 + u8 kernel:
- add_s16_u8_s16_sve(src1, src0, dst, policy, window);
-}
-} // namespace cpu
-} // namespace arm_compute
-#endif /* defined(__ARM_FEATURE_SVE) */ \ No newline at end of file
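
As a semantic reference for the mixed-type kernels above, a scalar sketch of the s16 + u8 -> s16 case (plain C++, illustrative only; the SVE code computes the same thing one vector at a time):

    #include <algorithm>
    #include <cstdint>

    // Scalar reference for add_s16_u8_s16_sve (sketch).
    int16_t add_s16_u8_ref(int16_t a, uint8_t b, bool saturate)
    {
        const int32_t sum = static_cast<int32_t>(a) + static_cast<int32_t>(b);
        if(saturate) // ConvertPolicy::SATURATE clamps to the s16 range (svqadd above)
        {
            return static_cast<int16_t>(std::min<int32_t>(std::max<int32_t>(sum, INT16_MIN), INT16_MAX));
        }
        return static_cast<int16_t>(sum); // ConvertPolicy::WRAP keeps the low 16 bits (svadd above)
    }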
diff --git a/src/core/cpu/kernels/add/sve/list.h b/src/core/cpu/kernels/add/sve/list.h
deleted file mode 100644
index aebb43bb60..0000000000
--- a/src/core/cpu/kernels/add/sve/list.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_CORE_SVE_KERNELS_ADD_LIST_H
-#define SRC_CORE_SVE_KERNELS_ADD_LIST_H
-
-#if defined(ENABLE_SVE)
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/SVEMath.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
-#include "src/core/cpu/kernels/add/sve/impl.h"
-#include <arm_sve.h>
-
-namespace arm_compute
-{
-namespace cpu
-{
-#define DECLARE_ADD_KERNEL(func_name) \
- void func_name(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-
-DECLARE_ADD_KERNEL(add_qasymm8_sve);
-DECLARE_ADD_KERNEL(add_qasymm8_signed_sve);
-DECLARE_ADD_KERNEL(add_qsymm16_sve);
-DECLARE_ADD_KERNEL(add_s16_u8_s16_sve);
-DECLARE_ADD_KERNEL(add_u8_s16_s16_sve);
-DECLARE_ADD_KERNEL(add_u8_u8_s16_sve);
-
-#undef DECLARE_ADD_KERNEL
-
-} // namespace cpu
-} // namespace arm_compute
-#endif // defined(ENABLE_SVE)
-#endif // SRC_CORE_SVE_KERNELS_ADD_LIST_H \ No newline at end of file
diff --git a/src/core/cpu/kernels/add/sve/qasymm8.cpp b/src/core/cpu/kernels/add/sve/qasymm8.cpp
deleted file mode 100644
index f6d1485e61..0000000000
--- a/src/core/cpu/kernels/add/sve/qasymm8.cpp
+++ /dev/null
@@ -1,182 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#if defined(__ARM_FEATURE_SVE2)
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
-#include "src/core/NEON/SVEMath.h"
-#include <arm_sve.h>
-
-namespace arm_compute
-{
-namespace cpu
-{
-void add_qasymm8_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
- ARM_COMPUTE_UNUSED(policy);
-
- // Create input windows
- Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle manually
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
- const auto all_true_pg = svptrue_b8();
-
- const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform();
- const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform();
-
- const auto invvscaleo = svdup_n_f32(1.f / oq_info.scale);
- const auto voffseto = svdup_n_f32(oq_info.offset);
-
- if(is_broadcast_across_x)
- {
- const bool is_broadcast_input_2 = input2_win.x().step() == 0;
- Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
- Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
- const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
- const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
-
- const svfloat32_t vscale1 = is_broadcast_input_2 ? svdup_n_f32(iq1_info.scale) : svdup_n_f32(iq2_info.scale);
- const svfloat32_t vscale2 = is_broadcast_input_2 ? svdup_n_f32(iq2_info.scale) : svdup_n_f32(iq1_info.scale);
- const svint32_t voffset1 = is_broadcast_input_2 ? svdup_n_s32(iq1_info.offset) : svdup_n_s32(iq2_info.offset);
- const svint32_t voffset2 = is_broadcast_input_2 ? svdup_n_s32(iq2_info.offset) : svdup_n_s32(iq1_info.offset);
-
- // Clear X Dimension on execution window as we handle manually
- non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator broadcast_input(broadcast_tensor, broadcast_win);
- Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
- Iterator output(dst, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
-
- const uint8_t broadcast_value = *reinterpret_cast<const uint8_t *>(broadcast_input.ptr());
- const svuint8_t broadcast_value_vec = svdup_n_u8(broadcast_value);
-
- int x = window_start_x;
- svbool_t pg = svwhilelt_b8(x, window_end_x);
-
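-            // svmovlb/svmovlt widen the even/odd lanes respectively, so bf_0..bf_3
-            // (and af_0..af_3 in the loop) hold interleaved rather than contiguous
-            // lanes; the svqxtnb/svqxtnt pairs below restore the original lane order.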
- const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(broadcast_value_vec))), voffset2)), vscale2);
- const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(broadcast_value_vec))), voffset2)), vscale2);
- const auto bf_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(broadcast_value_vec))), voffset2)), vscale2);
- const auto bf_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(broadcast_value_vec))), voffset2)), vscale2);
-
- do
- {
- const svuint8_t a = svld1_u8(pg, non_broadcast_input_ptr + x);
-
- const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(a))), voffset1)), vscale1);
- const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(a))), voffset1)), vscale1);
- const auto af_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(a))), voffset1)), vscale1);
- const auto af_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(a))), voffset1)), vscale1);
-
- const auto rf_0 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
- const auto rf_1 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
- const auto rf_2 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo));
- const auto rf_3 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo));
-
- const auto pa = svqxtnt_u32(svqxtnb_u32(rf_0), rf_1);
- const auto pb = svqxtnt_u32(svqxtnb_u32(rf_2), rf_3);
-
- const auto res = svqxtnt_u16(svqxtnb_u16(pa), pb);
- svst1_u8(pg, output_ptr + x, res);
-
- x += svcntb();
- pg = svwhilelt_b8(x, window_end_x);
- }
- while(svptest_any(all_true_pg, pg));
- },
- broadcast_input, non_broadcast_input, output);
- }
- else
- {
- // Clear X Dimension on execution window as we handle manually
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(src0, input1_win);
- Iterator input2(src1, input2_win);
- Iterator output(dst, win);
-
- const auto vscale1 = svdup_n_f32(iq1_info.scale);
- const auto vscale2 = svdup_n_f32(iq2_info.scale);
- const auto voffset1 = svdup_n_s32(iq1_info.offset);
- const auto voffset2 = svdup_n_s32(iq2_info.offset);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
-
- int x = window_start_x;
- svbool_t pg = svwhilelt_b8(x, window_end_x);
- do
- {
- const auto a = svld1_u8(pg, input1_ptr + x);
- const auto b = svld1_u8(pg, input2_ptr + x);
- const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(a))), voffset1)), vscale1);
- const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(a))), voffset1)), vscale1);
- const auto af_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(a))), voffset1)), vscale1);
- const auto af_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(a))), voffset1)), vscale1);
-
- const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(b))), voffset2)), vscale2);
- const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(b))), voffset2)), vscale2);
- const auto bf_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(b))), voffset2)), vscale2);
- const auto bf_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(b))), voffset2)), vscale2);
-
- const auto rf_0 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
- const auto rf_1 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
- const auto rf_2 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo));
- const auto rf_3 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo));
-
- const auto pa = svqxtnt_u32(svqxtnb_u32(rf_0), rf_1);
- const auto pb = svqxtnt_u32(svqxtnb_u32(rf_2), rf_3);
- const auto res = svqxtnt_u16(svqxtnb_u16(pa), pb);
-
- svst1_u8(pg, output_ptr + x, res);
-
- x += svcntb();
- pg = svwhilelt_b8(x, window_end_x);
- }
- while(svptest_any(all_true_pg, pg));
- },
- input1, input2, output);
- }
-}
-} // namespace cpu
-} // namespace arm_compute
-#endif /* defined(__ARM_FEATURE_SVE2) */ \ No newline at end of file
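
In scalar terms, add_qasymm8_sve dequantizes both operands, adds them, and requantizes into the destination's quantization space. A reference sketch (plain C++; the vector code above rounds toward zero in svcvt and saturates via the svqxtn* chain, which the clamp below approximates):

    #include <algorithm>
    #include <cstdint>

    // Scalar view of the per-element computation in add_qasymm8_sve (sketch).
    uint8_t add_qasymm8_ref(uint8_t a, uint8_t b,
                            float scale1, int32_t offset1,   // src0 quantization info
                            float scale2, int32_t offset2,   // src1 quantization info
                            float scale_o, int32_t offset_o) // dst quantization info
    {
        const float af = (static_cast<int32_t>(a) - offset1) * scale1; // dequantize a
        const float bf = (static_cast<int32_t>(b) - offset2) * scale2; // dequantize b
        // Requantize: r = (af + bf) * (1 / scale_o) + offset_o, clamped to the u8 range.
        const int32_t r = static_cast<int32_t>((af + bf) * (1.f / scale_o) + offset_o);
        return static_cast<uint8_t>(std::min(std::max(r, 0), 255));
    }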
diff --git a/src/core/cpu/kernels/add/sve/qasymm8_signed.cpp b/src/core/cpu/kernels/add/sve/qasymm8_signed.cpp
deleted file mode 100644
index 8102aa5c65..0000000000
--- a/src/core/cpu/kernels/add/sve/qasymm8_signed.cpp
+++ /dev/null
@@ -1,181 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#if defined(__ARM_FEATURE_SVE2)
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
-#include "src/core/NEON/SVEMath.h"
-#include <arm_sve.h>
-
-namespace arm_compute
-{
-namespace cpu
-{
-void add_qasymm8_signed_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
- ARM_COMPUTE_UNUSED(policy);
-
- // Create input windows
- Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle manually
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
-
- const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform();
- const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform();
-
- const auto invvscaleo = svdup_n_f32(1.f / oq_info.scale);
- const auto voffseto = svdup_n_f32(oq_info.offset);
-
- if(is_broadcast_across_x)
- {
- const bool is_broadcast_input_2 = input2_win.x().step() == 0;
- Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
- Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
- const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
- const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
- const auto all_true_pg = svptrue_b8();
-
- const auto vscale1 = is_broadcast_input_2 ? svdup_n_f32(iq1_info.scale) : svdup_n_f32(iq2_info.scale);
- const auto vscale2 = is_broadcast_input_2 ? svdup_n_f32(iq2_info.scale) : svdup_n_f32(iq1_info.scale);
- const auto voffset1 = is_broadcast_input_2 ? svdup_n_s32(iq1_info.offset) : svdup_n_s32(iq2_info.offset);
- const auto voffset2 = is_broadcast_input_2 ? svdup_n_s32(iq2_info.offset) : svdup_n_s32(iq1_info.offset);
-
- // Clear X Dimension on execution window as we handle manually
- non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator broadcast_input(broadcast_tensor, broadcast_win);
- Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
- Iterator output(dst, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
-
- const int8_t broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr());
- const auto broadcast_value_vec = svdup_n_s8(broadcast_value);
-
- int x = window_start_x;
- svbool_t pg = svwhilelt_b8(x, window_end_x);
- const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(broadcast_value_vec)), voffset2)), vscale2);
- const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(broadcast_value_vec)), voffset2)), vscale2);
- const auto bf_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(broadcast_value_vec)), voffset2)), vscale2);
- const auto bf_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(broadcast_value_vec)), voffset2)), vscale2);
-
- do
- {
- const auto a = svld1_s8(pg, non_broadcast_input_ptr + x);
- const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(a)), voffset1)), vscale1);
- const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(a)), voffset1)), vscale1);
- const auto af_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(a)), voffset1)), vscale1);
- const auto af_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(a)), voffset1)), vscale1);
-
- const auto rf_0 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
- const auto rf_1 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
- const auto rf_2 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo));
- const auto rf_3 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo));
-
- const auto pa = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1);
- const auto pb = svqxtnt_s32(svqxtnb_s32(rf_2), rf_3);
- const auto res = svqxtnt_s16(svqxtnb_s16(pa), pb);
-
- svst1_s8(pg, output_ptr + x, res);
-
- x += svcntb();
- pg = svwhilelt_b8(x, window_end_x);
- }
- while(svptest_any(all_true_pg, pg));
- },
- broadcast_input, non_broadcast_input, output);
- }
- else
- {
- // Clear X Dimension on execution window as we handle manually
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(src0, input1_win);
- Iterator input2(src1, input2_win);
- Iterator output(dst, win);
-
- const auto vscale1 = svdup_n_f32(iq1_info.scale);
- const auto vscale2 = svdup_n_f32(iq2_info.scale);
- const auto voffset1 = svdup_n_s32(iq1_info.offset);
- const auto voffset2 = svdup_n_s32(iq2_info.offset);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
-
- int x = window_start_x;
- svbool_t pg = svwhilelt_b8(x, window_end_x);
- do
- {
- const auto a = svld1_s8(pg, input1_ptr + x);
- const auto b = svld1_s8(pg, input2_ptr + x);
-
- const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(a)), voffset1)), vscale1);
- const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(a)), voffset1)), vscale1);
- const auto af_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(a)), voffset1)), vscale1);
- const auto af_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(a)), voffset1)), vscale1);
-
- const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(b)), voffset2)), vscale2);
- const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(b)), voffset2)), vscale2);
- const auto bf_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(b)), voffset2)), vscale2);
- const auto bf_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(b)), voffset2)), vscale2);
-
- const auto rf_0 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
- const auto rf_1 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
- const auto rf_2 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo));
- const auto rf_3 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo));
-
- const auto pa = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1);
- const auto pb = svqxtnt_s32(svqxtnb_s32(rf_2), rf_3);
- const auto res = svqxtnt_s16(svqxtnb_s16(pa), pb);
-
- svst1_s8(pg, output_ptr + x, res);
-
- x += svcntb();
- pg = svwhilelt_b8(x, window_end_x);
- }
- while(svptest_any(svptrue_b8(), pg));
- },
- input1, input2, output);
- }
-}
-} // namespace cpu
-} // namespace arm_compute
-#endif /* defined(__ARM_FEATURE_SVE2) */ \ No newline at end of file
diff --git a/src/core/cpu/kernels/add/sve/qsymm16.cpp b/src/core/cpu/kernels/add/sve/qsymm16.cpp
deleted file mode 100644
index fb62257b0a..0000000000
--- a/src/core/cpu/kernels/add/sve/qsymm16.cpp
+++ /dev/null
@@ -1,156 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#if defined(__ARM_FEATURE_SVE2)
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
-#include "src/core/NEON/SVEMath.h"
-#include <arm_sve.h>
-
-namespace arm_compute
-{
-namespace cpu
-{
-void add_qsymm16_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
- ARM_COMPUTE_UNUSED(policy);
-
- // Create input windows
- Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle manually
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
-
- const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform();
- const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform();
-
- const auto vscale1 = svdup_n_f32(iq1_info.scale);
- const auto vscale2 = svdup_n_f32(iq2_info.scale);
- const auto invvscaleo = svdup_n_f32(1.f / oq_info.scale);
- const auto all_true_pg = svptrue_b16();
-
- if(is_broadcast_across_x)
- {
- const bool is_broadcast_input_2 = input2_win.x().step() == 0;
- Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
- Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
- const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
- const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
-
- // Clear X Dimension on execution window as we handle manually
- non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator broadcast_input(broadcast_tensor, broadcast_win);
- Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
- Iterator output(dst, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const int16_t *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
-
- const int16_t broadcast_value = *reinterpret_cast<const int16_t *>(broadcast_input.ptr());
- const auto broadcast_value_vec = svdup_n_s16(broadcast_value);
-
- int x = window_start_x;
- svbool_t pg = svwhilelt_b16(x, window_end_x);
-
- const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(broadcast_value_vec)), vscale2);
- const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(broadcast_value_vec)), vscale2);
-
- do
- {
- const auto a = svld1_s16(pg, non_broadcast_input_ptr + x);
- const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(a)), vscale1);
- const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(a)), vscale1);
-
- const auto rf_0 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
- const auto rf_1 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
-
- const auto res = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1);
-
- svst1_s16(pg, output_ptr + x, res);
-
- x += svcnth();
- pg = svwhilelt_b16(x, window_end_x);
- }
- while(svptest_any(all_true_pg, pg));
- },
- broadcast_input, non_broadcast_input, output);
- }
- else
- {
- // Clear X Dimension on execution window as we handle manually
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(src0, input1_win);
- Iterator input2(src1, input2_win);
- Iterator output(dst, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
-
- int x = window_start_x;
- svbool_t pg = svwhilelt_b16(x, window_end_x);
- do
- {
- auto a = svld1_s16(pg, input1_ptr + x);
- auto b = svld1_s16(pg, input2_ptr + x);
-
- const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(a)), vscale1);
- const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(a)), vscale1);
-
- const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(b)), vscale2);
- const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(b)), vscale2);
-
- const auto rf_0 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
- const auto rf_1 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
-
- const auto res = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1);
- svst1_s16(pg, output_ptr + x, res);
-
- x += svcnth();
- pg = svwhilelt_b16(x, window_end_x);
- }
- while(svptest_any(all_true_pg, pg));
- },
- input1, input2, output);
- }
-}
-} // namespace cpu
-} // namespace arm_compute
-#endif /* defined(__ARM_FEATURE_SVE2) */ \ No newline at end of file
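
The qsymm16 variant is the same dequantize-add-requantize pattern with zero offsets, since symmetric quantization has none. Per element it computes (sketch):

    dst = saturate_s16( (a * scale1 + b * scale2) / scale_o )

where the division is implemented as a multiply by the precomputed 1/scale_o (invvscaleo), svcvt_s32_f32 converts with round-toward-zero, and svqxtnb/svqxtnt perform the final saturating narrow back to s16.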
diff --git a/src/core/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h b/src/core/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h
deleted file mode 100644
index 4b7b092d01..0000000000
--- a/src/core/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h
+++ /dev/null
@@ -1,126 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_ASSEMBLY_GEMM_KERNEL_WRAPPER_KERNEL_H
-#define ARM_COMPUTE_ASSEMBLY_GEMM_KERNEL_WRAPPER_KERNEL_H
-
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/NEON/INEKernel.h"
-#include "src/core/cpu/kernels/assembly/arm_gemm_compute_iface.hpp"
-
-#include "gemm_common.hpp"
-
-namespace arm_compute
-{
-class ITensor;
-
-namespace cpu
-{
-namespace kernel
-{
-/** This class is a wrapper for the assembly kernels.
- *
- * Some kernels were written in assembly and highly optimised for specific CPUs like A53 or A55.
- * This class works as a wrapper for these assembly kernels. The Arm Compute Library creates an instance
- * of CpuGemmAssemblyWrapperKernel, together with other auxiliary data structures, to execute a single
- * assembly kernel in the context of an NEFunction.
- *
- * The template parameters TypeInput and TypeOutput correspond to the input and output types of the
- * wrapped assembly kernel, which derives from
- * template<typename To, typename Tr> class GemmCommon
- *
- */
-template <typename TypeInput, typename TypeOutput>
-class CpuGemmAssemblyWrapperKernel final : public INEKernel
-{
-public:
- /** Constructor
- */
- CpuGemmAssemblyWrapperKernel()
- : _kernel(nullptr), _name("CpuGemmAssemblyWrapperKernel")
- {
- }
-
- CpuGemmAssemblyWrapperKernel(CpuGemmAssemblyWrapperKernel &) = delete;
- CpuGemmAssemblyWrapperKernel(CpuGemmAssemblyWrapperKernel &&) = default;
- CpuGemmAssemblyWrapperKernel &operator=(CpuGemmAssemblyWrapperKernel &) = delete;
-
- const char *name() const override
- {
- return _name.c_str();
- }
-
- void run(const Window &window, const ThreadInfo &info) override
- {
- ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast<void *>(_kernel)));
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-
- auto win = arm_gemm::to_ndcoord(window);
-
- arm_gemm::ndcoord_t thread_locator{};
-
- _kernel->execute(win, thread_locator, info.thread_id);
- }
-
- // Inherited methods overridden:
- void run_nd(const Window &window, const ThreadInfo &info, const Window &thread_locator) override
- {
- ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast<void *>(_kernel)));
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-
- //convert between arm_compute and arm_gemm types
- auto ndc_win = arm_gemm::to_ndcoord(window);
- auto ndc_tlc = arm_gemm::to_ndcoord(thread_locator);
-
- _kernel->execute(ndc_win, ndc_tlc, info.thread_id);
- }
-
-    /** Initialise the kernel with the assembly kernel to wrap.
-     *
-     * @param[in] kernel          Pointer to an assembly kernel implementation.
-     * @param[in] kernel_name_tag Tag to be attached to the kernel's name.
- */
- void configure(arm_gemm::GemmCommon<TypeInput, TypeOutput> *kernel, std::string kernel_name_tag)
- {
- ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast<void *>(kernel)));
- _kernel = kernel;
-
- Window win = to_window(kernel->get_window_size());
-
- INEKernel::configure(win);
-
- if(!kernel_name_tag.empty())
- {
- _name += "/" + kernel_name_tag;
- }
- }
-
-private:
- arm_gemm::GemmCommon<TypeInput, TypeOutput> *_kernel;
- std::string _name;
-};
-} // namespace kernel
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_ASSEMBLY_GEMM_KERNEL_WRAPPER_KERNEL_H */
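
Typical wiring of this wrapper, sketched with hypothetical names (the kernel pointer would come from arm_gemm::gemm<>(), declared in arm_gemm.hpp below; this is not a verbatim ACL call sequence):

    // Hypothetical usage sketch, for illustration only.
    // arm_gemm::GemmCommon<float, float> *gemm_kernel = ...; // obtained via arm_gemm::gemm<>()
    // arm_compute::cpu::kernel::CpuGemmAssemblyWrapperKernel<float, float> wrapper;
    // wrapper.configure(gemm_kernel, "a64_sgemm_8x12");      // window taken from get_window_size()
    // // A scheduler then calls wrapper.run(window, info) (or run_nd) on each thread.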
diff --git a/src/core/cpu/kernels/assembly/arm_gemm.hpp b/src/core/cpu/kernels/assembly/arm_gemm.hpp
deleted file mode 100644
index 81e355d6b3..0000000000
--- a/src/core/cpu/kernels/assembly/arm_gemm.hpp
+++ /dev/null
@@ -1,190 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#include <cstring>
-#include <memory>
-#include <vector>
-
-#include "arm_gemm_local.hpp"
-#include "gemm_common.hpp"
-
-namespace arm_gemm
-{
-enum class GemmMethod
-{
- DEFAULT,
- GEMV_BATCHED,
- GEMV_PRETRANSPOSED,
- GEMV_NATIVE_TRANSPOSED,
- GEMM_NATIVE,
- GEMM_HYBRID,
- GEMM_INTERLEAVED,
- GEMM_INTERLEAVED_2D,
- QUANTIZE_WRAPPER,
- QUANTIZE_WRAPPER_2D,
- GEMM_HYBRID_QUANTIZED,
- INDIRECT_GEMM,
- CONVOLUTION_GEMM
-};
-
-struct KernelDescription
-{
- GemmMethod method = GemmMethod::DEFAULT;
- std::string name = "";
- bool is_default = false;
- uint64_t cycle_estimate = 0;
-
- KernelDescription(GemmMethod m, std::string n, bool d = false, uint64_t c = 0)
- : method(m), name(n), is_default(d), cycle_estimate(c)
- {
- }
- KernelDescription() noexcept
- {
- }
-};
-
-struct GemmConfig
-{
- GemmMethod method = GemmMethod::DEFAULT;
- std::string filter = "";
- unsigned int inner_block_size = 0;
- unsigned int outer_block_size = 0;
-
- GemmConfig(GemmMethod method)
- : method(method)
- {
- }
- GemmConfig()
- {
- }
-};
-
-struct Activation
-{
- enum class Type
- {
- None,
- ReLU,
- BoundedReLU
- };
-
- Type type;
- float param1;
- float param2;
-
- Activation(Type type = Type::None, float p1 = 0.0f, float p2 = 0.0f)
- : type(type), param1(p1), param2(p2)
- {
- }
-};
-
-struct GemmArgs
-{
-public:
- const CPUInfo *_ci;
- unsigned int _Msize;
- unsigned int _Nsize;
- unsigned int _Ksize;
- unsigned int _Ksections;
- unsigned int _nbatches;
- unsigned int _nmulti;
- bool _indirect_input;
- Activation _act;
- int _maxthreads;
- const GemmConfig *_cfg;
-
- GemmArgs(const CPUInfo *ci, unsigned int M, unsigned int N,
- unsigned int K, unsigned int Ksections, unsigned int nbatches,
- unsigned int nmulti, bool indirect_input, Activation act, const int maxthreads,
- const GemmConfig *cfg = nullptr)
- : _ci(ci), _Msize(M), _Nsize(N), _Ksize(K), _Ksections(Ksections), _nbatches(nbatches), _nmulti(nmulti), _indirect_input(indirect_input), _act(act), _maxthreads(maxthreads), _cfg(cfg)
- {
- }
-};
-
-struct Requantize32
-{
-public:
- const int32_t *bias = nullptr;
- size_t bias_multi_stride = 0;
- int32_t a_offset = 0;
- int32_t b_offset = 0;
- int32_t c_offset = 0;
- bool per_channel_requant = false;
- int32_t per_layer_left_shift = 0;
- int32_t per_layer_right_shift = 0;
- int32_t per_layer_mul = 0;
- const int32_t *per_channel_left_shifts = nullptr;
- const int32_t *per_channel_right_shifts = nullptr;
- const int32_t *per_channel_muls = nullptr;
- int32_t minval = 0;
- int32_t maxval = 0;
-
- Requantize32() = default;
-
- // Constructor for per-tensor quantization
- Requantize32(const int32_t *bias, size_t bias_multi_stride,
- int32_t a_offset, int32_t b_offset, int32_t c_offset,
- int32_t requant_shift, int32_t requant_mul, int32_t minv, int32_t maxv)
- : bias(bias), bias_multi_stride(bias_multi_stride), a_offset(a_offset), b_offset(b_offset), c_offset(c_offset), per_channel_requant(false), per_layer_left_shift(std::max<int32_t>(requant_shift, 0)),
- per_layer_right_shift(std::min<int32_t>(requant_shift, 0)), per_layer_mul(requant_mul), minval(minv), maxval(maxv)
- {
- }
-
- // Constructor for per-channel quantization
- Requantize32(const int32_t *bias, size_t bias_multi_stride,
- int32_t a_offset, int32_t b_offset, int32_t c_offset,
- const int32_t *requant_left_shifts,
- const int32_t *requant_right_shifts,
- const int32_t *requant_muls,
- int32_t minv, int32_t maxv)
- : bias(bias), bias_multi_stride(bias_multi_stride), a_offset(a_offset), b_offset(b_offset), c_offset(c_offset), per_channel_requant(true), per_channel_left_shifts(requant_left_shifts),
- per_channel_right_shifts(requant_right_shifts), per_channel_muls(requant_muls), minval(minv), maxval(maxv)
- {
- }
-};
-
-struct Nothing
-{
-};
-
-template <typename Top, typename Tret>
-using UniqueGemmCommon = std::unique_ptr<GemmCommon<Top, Tret>>;
-
-/* Low level API calls.
- * These are implemented as 'GemmArgs' versions, or with the arguments explicitly listed. */
-
-/* get_gemm_method(): Given the templated types and provided parameters,
- * which is the preferred method to implement this GEMM? */
-template <typename Top, typename Tret, class OutputStage = Nothing>
-KernelDescription get_gemm_method(const GemmArgs &args, const OutputStage & = {});
-
-template <typename Top, typename Tret, class OutputStage = Nothing>
-UniqueGemmCommon<Top, Tret> gemm(const GemmArgs &args, const OutputStage & = {});
-
-template <typename Top, typename Tret, class OutputStage = Nothing>
-std::vector<KernelDescription> get_compatible_kernels(const GemmArgs &args, const OutputStage & = {});
-
-} // namespace arm_gemm
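
A sketch of how these entry points fit together (hypothetical sizes; the CPUInfo pointer comes from the integrating framework):

    // Hypothetical call sequence, for illustration only.
    // arm_gemm::GemmArgs args(ci, /*M=*/128, /*N=*/64, /*K=*/256, /*Ksections=*/1,
    //                         /*nbatches=*/1, /*nmulti=*/1, /*indirect_input=*/false,
    //                         arm_gemm::Activation(arm_gemm::Activation::Type::ReLU),
    //                         /*maxthreads=*/4);
    // auto desc = arm_gemm::get_gemm_method<float, float>(args); // preferred kernel, no instantiation
    // auto gemm = arm_gemm::gemm<float, float>(args);            // UniqueGemmCommon<float, float>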
diff --git a/src/core/cpu/kernels/assembly/arm_gemm_compute_iface.hpp b/src/core/cpu/kernels/assembly/arm_gemm_compute_iface.hpp
deleted file mode 100644
index 718fcd1fb4..0000000000
--- a/src/core/cpu/kernels/assembly/arm_gemm_compute_iface.hpp
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#include "arm_compute/core/Dimensions.h"
-#include "arm_compute/core/Window.h"
-
-#include "ndrange.hpp"
-
-#include <cassert>
-
-/* This file contains the mapping between the integral types used in arm_compute and arm_gemm.
- * These two codebases both require a degree of separation for the sake of modularity,
- * so each maintains its own types which represent similar information.
- */
-
-namespace arm_gemm
-{
-// We want to unify the maximum number of dimensions used between arm_gemm and the Arm Compute Library.
-constexpr std::size_t ndrange_max =
- arm_compute::Dimensions<unsigned int>::num_max_dimensions;
-
-using ndrange_t = NDRange<ndrange_max>;
-using ndcoord_t = NDCoordinate<ndrange_max>;
-
-/* Converts an `arm_gemm::ndrange_t` to an `arm_compute::Window`
- *
- * As `NDRange<T>` does not encode start positions, we specify
- * the start to be zero in the produced `arm_compute::Window`
- *
- * @param [ndr] the `arm_gemm::ndrange_t` we wish to convert into a `arm_compute::Window`
- * @returns an `arm_compute::Window` representing the same dimensional ranges as `ndr`
- */
-inline arm_compute::Window to_window(const ndrange_t &ndr)
-{
- arm_compute::Window win;
-
- for(unsigned int i = 0; i != ndrange_max; ++i)
- {
- //populate the window with the dimensions of the NDRange
- win.set(i, arm_compute::Window::Dimension(0, ndr.get_size(i)));
- }
-
- return win;
-}
-
-/*
- * Converts an `arm_gemm::ndcoord_t` to an `arm_compute::Window`
- *
- * @param [ndc] the `arm_gemm::ndcoord_t` we wish to convert into a `arm_compute::Window`
- * @returns an `arm_compute::Window` representing the same dimensional ranges as `ndc`
- */
-inline arm_compute::Window to_window(const ndcoord_t &ndc)
-{
- arm_compute::Window win;
-
- for(unsigned int i = 0; i != ndrange_max; ++i)
- {
- const auto start = ndc.get_position(i);
- const auto size = ndc.get_size(i);
- const auto stop = start + size;
-
-        //populate the window with the dimensions of the NDCoordinate
- win.set(i, arm_compute::Window::Dimension(start, stop));
- }
-
- return win;
-}
-
-/** Convert an `arm_compute::Window` to an `arm_gemm::NDRange` of the same max dimensions
- *
- * It should be noted that `arm_compute::Window` specifies a `start()` and an `end()`,
- * whereas `arm_gemm::ndrange_t` only has a size; as a result we store the difference between `end()` and `start()`
- *
- * @param [win] the `arm_compute::Window` we want to convert to `arm_gemm::ndrange_t`
- * @return the resultant ndrange_t
- */
-inline ndrange_t to_ndrange(const arm_compute::Window &win)
-{
- return
- {
- static_cast<unsigned int>(win[0].end() - win[0].start()),
- static_cast<unsigned int>(win[1].end() - win[1].start()),
- static_cast<unsigned int>(win[2].end() - win[2].start()),
- static_cast<unsigned int>(win[3].end() - win[3].start()),
- static_cast<unsigned int>(win[4].end() - win[4].start()),
- static_cast<unsigned int>(win[5].end() - win[5].start())
- };
-}
-
-/** Convert an `arm_compute::Window` to an `arm_gemm::NDCoord` of the same max dimensions
- *
- * @param [win] the `arm_compute::Window` we want to convert to `arm_gemm::ndcoord_t`
- * @return the resultant ndcoord_t
- */
-inline ndcoord_t to_ndcoord(const arm_compute::Window &win)
-{
- return
- {
- { static_cast<unsigned int>(win[0].start()), static_cast<unsigned int>(win[0].end() - win[0].start()) },
- { static_cast<unsigned int>(win[1].start()), static_cast<unsigned int>(win[1].end() - win[1].start()) },
- { static_cast<unsigned int>(win[2].start()), static_cast<unsigned int>(win[2].end() - win[2].start()) },
- { static_cast<unsigned int>(win[3].start()), static_cast<unsigned int>(win[3].end() - win[3].start()) },
- { static_cast<unsigned int>(win[4].start()), static_cast<unsigned int>(win[4].end() - win[4].start()) },
- { static_cast<unsigned int>(win[5].start()), static_cast<unsigned int>(win[5].end() - win[5].start()) }
- };
-}
-
-} //namespace arm_gemm
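
Note the round-trip property of these helpers: to_ndcoord preserves both start and size, so converting a Window to an ndcoord_t and back reproduces the same ranges (sketch):

    // arm_compute::Window win;
    // win.set(0, arm_compute::Window::Dimension(8, 24));    // start = 8, end = 24
    // arm_gemm::ndcoord_t ndc  = arm_gemm::to_ndcoord(win); // position = 8, size = 16
    // arm_compute::Window back = arm_gemm::to_window(ndc);  // [8, 24) again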
diff --git a/src/core/cpu/kernels/assembly/arm_gemm_local.hpp b/src/core/cpu/kernels/assembly/arm_gemm_local.hpp
deleted file mode 100644
index 78e0adf31f..0000000000
--- a/src/core/cpu/kernels/assembly/arm_gemm_local.hpp
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-/* This file is used to configure integration-specific aspects of arm_gemm into ACL */
-
-#include "arm_compute/core/CPP/CPPTypes.h"
-
-using CPUModel = arm_compute::CPUModel;
-using CPUInfo = arm_compute::CPUInfo;
diff --git a/src/core/cpu/kernels/assembly/convolution_parameters.hpp b/src/core/cpu/kernels/assembly/convolution_parameters.hpp
deleted file mode 100644
index 0c1ae58902..0000000000
--- a/src/core/cpu/kernels/assembly/convolution_parameters.hpp
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#include <cstdint>
-
-namespace arm_gemm
-{
-/*
- * Parameter set for "convolution" type GEMM.
- *
- * For a "convolution" GEMM, the GEMM parameters (M, K) are specified as if
- * an im2row had been performed on the input tensor to generate the operand
- * matrix, but instead this structure describes the convolution parameters
- * such that this can be done on the fly.
- *
- * The parameters describe the convolution details - the notional shape of
- * the input and output tensors, whether padding is to be applied, the size
- * of the kernel and a constant value to be used for padding (needed for
- * quantized tensors).
- *
- * The second part describes the layout of the input tensor in memory, which
- * is assumed to be in NHWC format. This consists of a base pointer and
- * strides for columns, rows and batches. 'multis' are not supported for
- * convolution type GEMMs.
- */
-struct ConvolutionParameters
-{
- int64_t input_width;
- int64_t input_height;
- int64_t input_channels;
- int64_t kernel_width;
- int64_t kernel_height;
- int64_t output_width;
- int64_t output_height;
- int64_t output_stride_w;
- int64_t output_stride_h;
- // output_channels not included as they do not affect the input.
- int64_t padding_top;
- int64_t padding_left;
- float padding_value;
-};
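-
-// Illustrative sketch (hypothetical values): filling the structure for a 3x3
-// convolution with unit strides and one pixel of padding over a 224x224x64
-// NHWC input, which preserves the 224x224 spatial output size.
-inline ConvolutionParameters example_conv_parameters()
-{
- ConvolutionParameters params{};
- params.input_width = 224;
- params.input_height = 224;
- params.input_channels = 64;
- params.kernel_width = 3;
- params.kernel_height = 3;
- params.output_width = 224;
- params.output_height = 224;
- params.output_stride_w = 1; // convolution strides
- params.output_stride_h = 1;
- params.padding_top = 1;
- params.padding_left = 1;
- params.padding_value = 0.f; // e.g. the zero-point for quantized inputs
- return params;
-}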
-
-} // namespace arm_gemm
diff --git a/src/core/cpu/kernels/assembly/gemm_common.hpp b/src/core/cpu/kernels/assembly/gemm_common.hpp
deleted file mode 100644
index 4af85ed663..0000000000
--- a/src/core/cpu/kernels/assembly/gemm_common.hpp
+++ /dev/null
@@ -1,229 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#include "convolution_parameters.hpp"
-#include "ndrange.hpp"
-
-#include <cstddef>
-
-namespace arm_gemm
-{
-// Abstract class for the GEMM/GEMV functions.
-//
-// GEMM implementations may be "native" (never require any input
-// permutation), "pretransposed" (require permutation up-front) or require
-// working space (permute as they go along). This interface should support
-// all of them.
-
-// The real GemmCommon class is templated based on the operand and return
-// type. This is an interface class which is independent of those types.
-class IGemmCommon
-{
-public:
- /* Pass in the pointers to the arrays to be operated on and their
- * strides. This "generic" version uses void *s, the preferred version
- * is the one provided by templated GemmCommon (below) which takes
- * appropriately typed pointers. If B is pretransposed (see below) then
- * the settings for B here are ignored.
- */
- virtual void set_arrays_generic(const void *A, const int lda, const int A_batch_stride, const int A_multi_stride,
- const void *B, const int ldb, /* batches share B */ const int B_multi_stride,
- void *C, const int ldc, const int C_batch_stride, const int C_multi_stride,
- const void *bias, /* no row or batch stride needed */ const int bias_multi_stride) = 0;
-
- /** @returns an ndrange containing ranges of the compute space which can be
- * broken up and parallelised over
- */
- virtual ndrange_t get_window_size() const = 0;
-
- /* The maximum thread count is specified when the GEMM is created. Some
- * implementations need to know how many threads will actually run in
- * order to work properly.
- *
- * In some cases, after creating the GEMM the number of threads needs to
- * be reduced (e.g. not enough work to split across threads). This
- * method allows the actual number of threads that will run to be set
- * (which must be equal to or lower than the maximum).
- *
- * This has an empty default implementation, as GEMMs which don't care
- * about thread count can safely ignore this.
- */
- virtual void set_nthreads(int) {};
-
- /* Whether this GEMM can be dynamically scheduled or not. */
- virtual bool supports_dynamic_scheduling() const
- {
- return false;
- }
-
- /** Main execute member function
- * @param [in] work_range specifies the range of work to be computed; the total range is defined by get_window_size()
- * @param [in] thread_locator the position of this thread within the overall thread space
- * @param [in] threadid a unique thread id
- */
- virtual void execute(const ndcoord_t &work_range, const ndcoord_t &thread_locator, int threadid) = 0;
-
- /*** Working space interface (optional) ***/
- /* Total number of bytes of temporary working space needed. If zero, it's not necessary to call set_working_space(). */
- virtual size_t get_working_size() const
- {
- return 0;
- }
- /* Provide working space buffer - the void * passed in must remain allocated for the duration of any execute calls. */
- virtual void set_working_space(void *) {};
-
- /*** "Pretransposed" interface (optional) ***/
- /* Is this object set up for pretranspose? If so, pretranspose_B_array() needs to be called before execute(). */
- virtual bool B_is_pretransposed() const
- {
- return false;
- }
- /* Does pretranspose still need to be done? */
- virtual bool B_pretranspose_required() const
- {
- return false;
- }
- /* Total number of bytes of space needed for pretransposed arrays. */
- virtual size_t get_B_pretransposed_array_size() const
- {
- return 0;
- }
- /* Perform pretranspose - arguments are output, input, input row stride and input multi stride. */
- /* The "real" version of this depends on the templated operand type (see below). */
- virtual void pretranspose_B_array_generic(void *, const void *, const int, const int) = 0;
- /* Set pretransposed data - the void * passed in must previously have been passed to pretranspose_B_array() for the same or a similar GEMM. */
- virtual void set_pretransposed_B_data(void *)
- {
- }
-
- /*** "Quantized bias" interface (optional) ***/
- /* Set the bias vector for quantized GEMMs */
- virtual void set_quantized_bias(const int32_t *, size_t)
- {
- }
-
- /*** Indirect interface (optional) ***/
- /* Set the indirect table. This comprises a number of values per kernel point, and a densely packed
- * array of pointers of size multis * batches * kernel_points. */
- virtual void set_indirect_parameters_generic(size_t, const void *const *const *)
- {
- }
-
- /*** Convolution interface (optional) ***/
- /* Set the convolution parameters. */
- virtual void set_convolution_parameters(ConvolutionParameters)
- {
- }
-
- // Destructor
- virtual ~IGemmCommon()
- {
- }
-};
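-
-// Illustrative call sequence (hypothetical driver, single-threaded): the life
-// cycle implied by the interface above. The caller owns `pretranspose_buffer`
-// and `working_space`, sized via the query methods, and has already called
-// set_arrays_generic().
-inline void run_gemm_example(IGemmCommon &gemm, const void *B, int ldb, int B_multi_stride, void *pretranspose_buffer, void *working_space)
-{
- gemm.set_nthreads(1);
- if(gemm.B_is_pretransposed() && gemm.B_pretranspose_required())
- {
- gemm.pretranspose_B_array_generic(pretranspose_buffer, B, ldb, B_multi_stride);
- }
- if(gemm.get_working_size() > 0)
- {
- gemm.set_working_space(working_space);
- }
- // A single thread computes the whole window in one call
- const ndrange_t window = gemm.get_window_size();
- const ndcoord_t work_range({ { 0, window.get_size(0) }, { 0, window.get_size(1) }, { 0, window.get_size(2) }, { 0, window.get_size(3) }, { 0, window.get_size(4) }, { 0, window.get_size(5) } });
- gemm.execute(work_range, work_range, 0);
-}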
-
-/* "Real" GemmCommon class which is templated on the operand and return types.
- *
- * In addition to correctly typed versions of the functions that operate on
- * operand and return data, this class provides a default implementation of
- * 'set_arrays' to capture the provided arguments in protected class
- * members, as essentially any implementation will need these.
- */
-template <typename To, typename Tr>
-class GemmCommon : public IGemmCommon
-{
-protected:
- const To *_Aptr = nullptr;
- int _lda = 0;
- int _A_batch_stride = 0;
- int _A_multi_stride = 0;
- const To *_Bptr = nullptr;
- int _ldb = 0;
- int _B_multi_stride = 0;
- Tr *_Cptr = nullptr;
- int _ldc = 0;
- int _C_batch_stride = 0;
- int _C_multi_stride = 0;
- const Tr *_bias = nullptr;
- int _bias_multi_stride = 0;
-
-public:
- /* Pass in the pointers to the arrays to be operated on and their
- * strides (templated version with appropriate types). */
- virtual void set_arrays(const To *A, const int lda, const int A_batch_stride, const int A_multi_stride,
- const To *B, const int ldb, /* batches share B */ const int B_multi_stride,
- Tr *C, const int ldc, const int C_batch_stride, const int C_multi_stride,
- const Tr *bias, /* no row or batch stride needed */ const int bias_multi_stride)
- {
- _Aptr = A;
- _lda = lda;
- _A_batch_stride = A_batch_stride;
- _A_multi_stride = A_multi_stride;
- _Bptr = B;
- _ldb = ldb;
- _B_multi_stride = B_multi_stride;
- _Cptr = C;
- _ldc = ldc;
- _C_batch_stride = C_batch_stride;
- _C_multi_stride = C_multi_stride;
- _bias = bias;
- _bias_multi_stride = bias_multi_stride;
- }
-
- /* Implementation of the void * overload which casts its arguments to the appropriate type. */
- void set_arrays_generic(const void *A, const int lda, const int A_batch_stride, const int A_multi_stride,
- const void *B, const int ldb, /* batches share B */ const int B_multi_stride,
- void *C, const int ldc, const int C_batch_stride, const int C_multi_stride,
- const void *bias, /* no row or batch stride needed */ const int bias_multi_stride) override
- {
- set_arrays(static_cast<const To *>(A), lda, A_batch_stride, A_multi_stride,
- static_cast<const To *>(B), ldb, B_multi_stride,
- static_cast<Tr *>(C), ldc, C_batch_stride, C_multi_stride,
- static_cast<const Tr *>(bias), bias_multi_stride);
- }
-
- /*** "Pretransposed" interface ***/
-
- /* Perform pretranspose - the void * passed in must remain allocated for the duration of any execute calls. */
- /* Arguments are: output buffer pointer, source pointer, source row stride, source multi stride */
- virtual void pretranspose_B_array(void *, const To *, const int, const int) {};
-
- /* Implementation of the void * overload which casts its arguments to the appropriate type. */
- void pretranspose_B_array_generic(void *out, const void *in, const int row_stride, const int multi_stride) override
- {
- pretranspose_B_array(out, static_cast<const To *>(in), row_stride, multi_stride);
- }
-
- /*** Indirect interface ***/
- virtual void set_indirect_parameters(size_t, const To *const *const *)
- {
- }
-
- void set_indirect_parameters_generic(size_t sz, const void *const *const *ptr) override
- {
- set_indirect_parameters(sz, reinterpret_cast<const To *const *const *>(ptr));
- }
-};
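-
-// Illustrative sketch (hypothetical): a minimal concrete GEMM over the captured
-// members above. Real implementations add blocking, threading and the optional
-// interfaces; this only shows where the protected state gets used.
-class TrivialGemm : public GemmCommon<float, float>
-{
-public:
- ndrange_t get_window_size() const override
- {
- return ndrange_t{ 1u }; // a single work item
- }
- void execute(const ndcoord_t &, const ndcoord_t &, int) override
- {
- // A 1x1x1 "GEMM": C[0] = A[0] * B[0], purely to illustrate the members
- _Cptr[0] = _Aptr[0] * _Bptr[0];
- }
-};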
-
-} // namespace arm_gemm
diff --git a/src/core/cpu/kernels/assembly/ndrange.hpp b/src/core/cpu/kernels/assembly/ndrange.hpp
deleted file mode 100644
index 1c8261aef7..0000000000
--- a/src/core/cpu/kernels/assembly/ndrange.hpp
+++ /dev/null
@@ -1,199 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#include <algorithm>
-#include <array>
-#include <cassert>
-#include <initializer_list>
-
-namespace arm_gemm
-{
-template <unsigned int D>
-class NDRange
-{
-private:
- std::array<unsigned int, D> m_sizes{};
- std::array<unsigned int, D> m_totalsizes{};
-
- class NDRangeIterator
- {
- private:
- const NDRange &m_parent;
- unsigned int m_pos = 0;
- unsigned int m_end = 0;
-
- public:
- NDRangeIterator(const NDRange &p, unsigned int s, unsigned int e)
- : m_parent(p), m_pos(s), m_end(e)
- {
- }
-
- bool done() const
- {
- return (m_pos >= m_end);
- }
-
- unsigned int dim(unsigned int d) const
- {
- unsigned int r = m_pos;
-
- if(d < (D - 1))
- {
- r %= m_parent.m_totalsizes[d];
- }
-
- if(d > 0)
- {
- r /= m_parent.m_totalsizes[d - 1];
- }
-
- return r;
- }
-
- bool next_dim0()
- {
- m_pos++;
-
- return !done();
- }
-
- bool next_dim1()
- {
- m_pos += m_parent.m_sizes[0] - dim(0);
-
- return !done();
- }
-
- unsigned int dim0_max() const
- {
- unsigned int offset = std::min(m_end - m_pos, m_parent.m_sizes[0] - dim(0));
-
- return dim(0) + offset;
- }
- };
-
- void set_totalsizes()
- {
- unsigned int t = 1;
-
- for(unsigned int i = 0; i < D; i++)
- {
- if(m_sizes[i] == 0)
- {
- m_sizes[i] = 1;
- }
-
- t *= m_sizes[i];
-
- m_totalsizes[i] = t;
- }
- }
-
-public:
- NDRange &operator=(const NDRange &rhs) = default;
- NDRange(const NDRange &rhs) = default;
-
- template <typename... T>
- NDRange(T... ts)
- : m_sizes{ ts... }
- {
- set_totalsizes();
- }
-
- NDRange(const std::array<unsigned int, D> &n)
- : m_sizes(n)
- {
- set_totalsizes();
- }
-
- NDRangeIterator iterator(unsigned int start, unsigned int end) const
- {
- return NDRangeIterator(*this, start, end);
- }
-
- unsigned int total_size() const
- {
- return m_totalsizes[D - 1];
- }
-
- unsigned int get_size(unsigned int v) const
- {
- return m_sizes[v];
- }
-};
-
-/** NDCoordinate builds upon a range, but specifies a starting position
- * in addition to the sizes it inherits from NDRange
- */
-template <unsigned int N>
-class NDCoordinate : public NDRange<N>
-{
- using int_t = unsigned int;
- using ndrange_t = NDRange<N>;
-
- std::array<int_t, N> m_positions{};
-
-public:
- NDCoordinate &operator=(const NDCoordinate &rhs) = default;
- NDCoordinate(const NDCoordinate &rhs) = default;
- NDCoordinate(const std::initializer_list<std::pair<int_t, int_t>> &list)
- {
- std::array<int_t, N> sizes{};
-
- std::size_t i = 0;
- for(auto &p : list)
- {
- m_positions[i] = p.first;
- sizes[i++] = p.second;
- }
-
- // Update the parent's sizes
- static_cast<ndrange_t &>(*this) = ndrange_t(sizes);
- }
-
- int_t get_position(int_t d) const
- {
- assert(d < N);
-
- return m_positions[d];
- }
-
- void set_position(int_t d, int_t v)
- {
- assert(d < N);
-
- m_positions[d] = v;
- }
-
- int_t get_position_end(int_t d) const
- {
- return get_position(d) + ndrange_t::get_size(d);
- }
-}; //class NDCoordinate
-
-using ndrange_t = NDRange<6>;
-using ndcoord_t = NDCoordinate<6>;
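-
-// Illustrative sketch (hypothetical): walking a 2D range with the iterator,
-// the same pattern used to linearise the GEMM compute space.
-inline unsigned int count_points_example()
-{
- const ndrange_t range{ 4u, 3u }; // 4 x 3 x 1 x 1 x 1 x 1, total_size() == 12
- unsigned int visited = 0;
- for(auto it = range.iterator(0, range.total_size()); !it.done(); it.next_dim0())
- {
- visited++; // it.dim(0) / it.dim(1) give the current X / Y position
- }
- return visited; // 12
-}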
-
-} // namespace arm_gemm
diff --git a/src/core/cpu/kernels/elementwise/neon/elementwise_list.h b/src/core/cpu/kernels/elementwise/neon/elementwise_list.h
deleted file mode 100644
index 43e44be5e2..0000000000
--- a/src/core/cpu/kernels/elementwise/neon/elementwise_list.h
+++ /dev/null
@@ -1,486 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_CORE_NEON_KERNELS_ELEMENTWISE_LIST_H
-#define SRC_CORE_NEON_KERNELS_ELEMENTWISE_LIST_H
-
-#include "src/core/NEON/NEAsymm.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-template <typename InputScalarType, typename OutputScalarType, typename InputVectorType>
-void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
- OutputScalarType (*scalar_func)(const InputScalarType &, const InputScalarType &),
- int (*broadcast_func)(int, int, int, const InputScalarType *, const InputScalarType &, OutputScalarType *, const bool),
- int (*neon_func)(int, int, int, const InputScalarType *, const InputScalarType *, OutputScalarType *))
-{
- // Create input windows
- Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle manually
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- const int window_step_x = std::min(16 / static_cast<int>(sizeof(OutputScalarType)), 8);
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();
-
- if(is_broadcast_across_x)
- {
- const bool is_broadcast_input_2 = input2_win.x().step() == 0;
- Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
- Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
- const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
- const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
-
- // Clear X Dimension on execution window as we handle manually
- non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator broadcast_input(broadcast_tensor, broadcast_win);
- Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
- Iterator output(out, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
- const auto non_broadcast_input_ptr = reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr());
- const InputScalarType broadcast_value = *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr());
-
- int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_value, output_ptr, !is_broadcast_input_2);
- for(; x < window_end_x; ++x)
- {
- const auto a = *(non_broadcast_input_ptr + x);
- *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? broadcast_value : a, !is_broadcast_input_2 ? a : broadcast_value);
- }
- },
- broadcast_input, non_broadcast_input, output);
- }
- else
- {
- // Clear X Dimension on execution window as we handle manually
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(in1, input1_win);
- Iterator input2(in2, input2_win);
- Iterator output(out, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
- const auto input1_ptr = reinterpret_cast<const InputScalarType *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const InputScalarType *>(input2.ptr());
-
- int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr);
- for(; x < window_end_x; ++x)
- {
- const auto a = *(input1_ptr + x);
- const auto b = *(input2_ptr + x);
- *(output_ptr + x) = (*scalar_func)(a, b);
- }
- },
- input1, input2, output);
- }
-}
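-
-// Illustrative sketch (hypothetical, float only): the shape of the loop helpers
-// below. The vector body advances in steps of window_step_x and returns how far
-// it got; elementwise_op above then finishes the tail with the scalar function.
-inline int example_add_loop(int window_start_x, int window_end_x, int window_step_x, const float *input1_ptr, const float *input2_ptr, float *output_ptr)
-{
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto a = wrapper::vloadq(input1_ptr + x);
- const auto b = wrapper::vloadq(input2_ptr + x);
- wrapper::vstore(output_ptr + x, wrapper::vadd(a, b));
- }
- return x; // index of the first element left for the scalar tail
-}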
-
-template <ArithmeticOperation op, typename ScalarType>
-inline ScalarType elementwise_arithm_op_scalar(const ScalarType &a, const ScalarType &b)
-{
- auto res = ScalarType(0);
-
- switch(op)
- {
- case ArithmeticOperation::MAX:
- res = std::max(a, b);
- break;
- case ArithmeticOperation::MIN:
- res = std::min(a, b);
- break;
- case ArithmeticOperation::SQUARED_DIFF:
- {
- res = (a - b) * (a - b);
- break;
- }
- case ArithmeticOperation::PRELU:
- {
- res = (a > 0 ? a : a * b);
- break;
- }
- case ArithmeticOperation::DIV:
- {
- if(std::is_integral<ScalarType>::value)
- {
- // Guard before dividing: integer division (and modulo) by zero is undefined behaviour
- if(b == ScalarType(0))
- {
- res = ScalarType(0);
- break;
- }
- res = a / b;
- if(static_cast<int32_t>(a) % static_cast<int32_t>(b) != 0 && ((a < 0) != (b < 0)))
- {
- --res; // truncation rounds toward zero; step down to floor division
- }
- }
- else
- {
- res = a / b;
- }
- break;
- }
- case ArithmeticOperation::POWER:
- {
- res = std::pow(a, b);
- break;
- }
- default:
- ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
- }
- return res;
-}
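-
-// Worked example of the integer DIV handling above (floor division):
-// a = -7, b = 2 -> res = a / b = -3 (C++ truncates toward zero); the remainder
-// is non-zero and the signs differ, so --res yields -4 == floor(-7 / 2.0).
-// a = 7, b = 2 -> res = 3; signs match, so no adjustment is made.
-// b = 0 -> res is forced to 0 before any division takes place.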
-
-template <ArithmeticOperation op, typename VectorType>
-inline typename VectorType::type elementwise_arithm_op(const typename VectorType::type &a, const typename VectorType::type &b)
-{
- using vec_type = typename VectorType::type;
- using scalar_type = typename VectorType::scalar_type;
- using tag_type = typename VectorType::tag_type;
-
- vec_type res = wrapper::vdup_n(static_cast<scalar_type>(0), tag_type{});
-
- switch(op)
- {
- case ArithmeticOperation::MAX:
- res = wrapper::vmax(a, b);
- break;
- case ArithmeticOperation::MIN:
- res = wrapper::vmin(a, b);
- break;
- case ArithmeticOperation::SQUARED_DIFF:
- {
- const vec_type tmp = wrapper::vsub(a, b);
- res = wrapper::vmul(tmp, tmp);
- break;
- }
- case ArithmeticOperation::PRELU:
- {
- const vec_type zero = wrapper::vdup_n(static_cast<scalar_type>(0), tag_type{});
- const vec_type tmp = wrapper::vmul(a, b);
- const auto gt = wrapper::vcgt(a, zero);
-
- res = wrapper::vbsl(gt, a, tmp);
- break;
- }
-
- default:
- ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
- }
-
- return res;
-}
-
-template <>
-inline int32x4_t elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<int32_t, 4>>(const int32x4_t &a, const int32x4_t &b)
-{
- return vcvtq_s32_f32(vfloorq_f32(wrapper::vdiv(vcvtq_f32_s32(a), vcvtq_f32_s32(b))));
-}
-
-template <>
-inline float32x4_t elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<float, 4>>(const float32x4_t &a, const float32x4_t &b)
-{
- return wrapper::vdiv(a, b);
-}
-
-template <>
-inline float32x4_t elementwise_arithm_op<ArithmeticOperation::POWER, typename wrapper::traits::neon_vector<float, 4>>(const float32x4_t &a, const float32x4_t &b)
-{
- return wrapper::vpow(a, b);
-}
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template <>
-inline float16x8_t elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<float16_t, 8>>(const float16x8_t &a, const float16x8_t &b)
-{
- return wrapper::vdiv(a, b);
-}
-
-template <>
-inline float16x8_t elementwise_arithm_op<ArithmeticOperation::POWER, typename wrapper::traits::neon_vector<float16_t, 8>>(const float16x8_t &a, const float16x8_t &b)
-{
- return wrapper::vpow(a, b);
-}
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-template <ArithmeticOperation op, typename ScalarType, typename VectorType>
-inline typename VectorType::type elementwise_arithm_op_broadcast(const typename VectorType::type &a, const ScalarType &broadcast_value, const bool reorder)
-{
- using tag_type = typename VectorType::tag_type;
- using vec_type = typename VectorType::type;
-
- vec_type broadcast_vector = wrapper::vdup_n(broadcast_value, tag_type{});
- return elementwise_arithm_op<op, VectorType>(reorder ? broadcast_vector : a, reorder ? a : broadcast_vector);
-}
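-
-// Note on `reorder` (shared by all broadcast helpers): it is true when the
-// broadcast operand is input1 rather than input2, in which case the operands
-// are swapped back into their original order so that non-commutative
-// operations (DIV, POWER, PRELU and the comparisons) stay correct.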
-
-template <ArithmeticOperation op, typename ScalarType, typename VectorType>
-inline int elementwise_arithm_op_loop(int window_start_x, int window_end_x, int window_step_x,
- const ScalarType *input1_ptr, const ScalarType *input2_ptr, ScalarType *output_ptr)
-{
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto a = wrapper::vloadq(input1_ptr + x);
- const auto b = wrapper::vloadq(input2_ptr + x);
- wrapper::vstore(output_ptr + x, elementwise_arithm_op<op, VectorType>(a, b));
- }
- return x;
-}
-
-template <ArithmeticOperation op, typename ScalarType, typename VectorType>
-inline int elementwise_arithm_op_broadcast_loop(int window_start_x, int window_end_x, int window_step_x,
- const ScalarType *non_broadcast_input_ptr, const ScalarType &broadcast_value, ScalarType *output_ptr, const bool reorder)
-{
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto a = wrapper::vloadq((non_broadcast_input_ptr + x));
- wrapper::vstore(output_ptr + x, elementwise_arithm_op_broadcast<op, ScalarType, VectorType>(a, broadcast_value, reorder));
- }
- return x;
-}
-
-template <ArithmeticOperation op, typename VectorType>
-void elementwise_arithm_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
- using scalar_type = typename VectorType::scalar_type;
-
- elementwise_op<scalar_type, scalar_type, VectorType>(in1, in2, out, window,
- &elementwise_arithm_op_scalar<op, scalar_type>,
- &elementwise_arithm_op_broadcast_loop<op, scalar_type, VectorType>,
- &elementwise_arithm_op_loop<op, scalar_type, VectorType>);
-}
-
-template <ComparisonOperation op, typename InputScalarType>
-inline uint8_t elementwise_comp_op_scalar(const InputScalarType &a, const InputScalarType &b)
-{
- bool res = false;
-
- switch(op)
- {
- case ComparisonOperation::Equal:
- res = (a == b);
- break;
- case ComparisonOperation::NotEqual:
- res = (a != b);
- break;
- case ComparisonOperation::Greater:
- res = (a > b);
- break;
- case ComparisonOperation::GreaterEqual:
- res = (a >= b);
- break;
- case ComparisonOperation::Less:
- res = (a < b);
- break;
- case ComparisonOperation::LessEqual:
- res = (a <= b);
- break;
- default:
- ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
- }
- return res ? ~static_cast<uint8_t>(0) : static_cast<uint8_t>(0);
-}
-
-template <ComparisonOperation op, typename InputVectorType, typename OutputVectorType>
-inline OutputVectorType elementwise_comp_op(const InputVectorType &a, const InputVectorType &b)
-{
- OutputVectorType res = { 0, 0, 0, 0 };
-
- switch(op)
- {
- case ComparisonOperation::Equal:
- res = wrapper::vceq(a, b);
- break;
- case ComparisonOperation::NotEqual:
- res = wrapper::vnot(wrapper::vceq(a, b));
- break;
- case ComparisonOperation::Greater:
- res = wrapper::vcgt(a, b);
- break;
- case ComparisonOperation::GreaterEqual:
- res = wrapper::vcge(a, b);
- break;
- case ComparisonOperation::Less:
- res = wrapper::vcgt(b, a);
- break;
- case ComparisonOperation::LessEqual:
- res = wrapper::vcge(b, a);
- break;
- default:
- ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
- }
-
- return res;
-}
-
-template <ComparisonOperation op, typename InputScalarType, typename InputVectorType, typename OutputVectorType>
-inline OutputVectorType elementwise_comp_op_broadcast(const InputVectorType &a, const InputScalarType &broadcast_value, const bool reorder)
-{
- InputVectorType broadcast_vector = wrapper::vdup_n(broadcast_value, wrapper::traits::vector_128_tag());
- return elementwise_comp_op<op, InputVectorType, OutputVectorType>(reorder ? broadcast_vector : a, reorder ? a : broadcast_vector);
-}
-
-template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
-inline int elementwise_comp_op_broadcast_8_loop(int window_start_x, int window_end_x, int window_step_x,
- const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder)
-{
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint8x16_t>(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder);
- wrapper::vstore(output_ptr + x, a);
- }
- return x;
-}
-
-template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
-inline int elementwise_comp_op_broadcast_16_loop(int window_start_x, int window_end_x, int window_step_x,
- const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder)
-{
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint16x8_t>(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder);
- wrapper::vstore(output_ptr + x, wrapper::vmovn(a));
- }
- return x;
-}
-
-template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
-inline int elementwise_comp_op_broadcast_32_loop(int window_start_x, int window_end_x, int window_step_x,
- const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder)
-{
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(wrapper::vloadq(non_broadcast_input_ptr + x), broadcast_value, reorder);
- const auto b = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(wrapper::vloadq(non_broadcast_input_ptr + x + 4), broadcast_value, reorder);
- wrapper::vstore(output_ptr + x, wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(a), wrapper::vmovn(b))));
- }
- if(x <= window_end_x - 4)
- {
- const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder);
- for(int i = 0; i < 4; i++)
- {
- *(output_ptr + x + i) = wrapper::vgetlane(a, i);
- }
- x += 4;
- }
- return x;
-}
-
-template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
-inline int elementwise_comp_op_8_loop(int window_start_x, int window_end_x, int window_step_x,
- const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr)
-{
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto a = wrapper::vloadq(input1_ptr + x);
- const auto b = wrapper::vloadq(input2_ptr + x);
- const auto res = elementwise_comp_op<op, InputVectorType, uint8x16_t>(a, b);
- wrapper::vstore(output_ptr + x, res);
- }
- return x;
-}
-
-template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
-inline int elementwise_comp_op_16_loop(int window_start_x, int window_end_x, int window_step_x,
- const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr)
-{
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto a = wrapper::vloadq(input1_ptr + x);
- const auto b = wrapper::vloadq(input2_ptr + x);
- const auto res = elementwise_comp_op<op, InputVectorType, uint16x8_t>(a, b);
- wrapper::vstore(output_ptr + x, wrapper::vmovn(res));
- }
- return x;
-}
-
-template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
-inline int elementwise_comp_op_32_loop(int window_start_x, int window_end_x, int window_step_x,
- const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr)
-{
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- auto a = wrapper::vloadq(input1_ptr + x);
- auto b = wrapper::vloadq(input2_ptr + x);
- const auto res = elementwise_comp_op<op, InputVectorType, uint32x4_t>(a, b);
- a = wrapper::vloadq(input1_ptr + x + 4);
- b = wrapper::vloadq(input2_ptr + x + 4);
- const auto res2 = elementwise_comp_op<op, InputVectorType, uint32x4_t>(a, b);
- wrapper::vstore(output_ptr + x, wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(res), wrapper::vmovn(res2))));
- }
- if(x <= window_end_x - 4)
- {
- const auto a = wrapper::vloadq(input1_ptr + x);
- const auto b = wrapper::vloadq(input2_ptr + x);
- const auto res = elementwise_comp_op<op, InputVectorType, uint32x4_t>(a, b);
- for(int i = 0; i < 4; i++)
- {
- *(output_ptr + x + i) = wrapper::vgetlane(res, i);
- }
- x += 4;
- }
- return x;
-}
-
-template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
-void elementwise_comp_op_8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
- elementwise_op<InputScalarType, uint8_t, InputVectorType>(in1, in2, out, window,
- &elementwise_comp_op_scalar<op, InputScalarType>,
- &elementwise_comp_op_broadcast_8_loop<op, InputScalarType, InputVectorType>,
- &elementwise_comp_op_8_loop<op, InputScalarType, InputVectorType>);
-}
-
-template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
-void elementwise_comp_op_16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
- elementwise_op<InputScalarType, uint8_t, InputVectorType>(in1, in2, out, window,
- &elementwise_comp_op_scalar<op, InputScalarType>,
- &elementwise_comp_op_broadcast_16_loop<op, InputScalarType, InputVectorType>,
- &elementwise_comp_op_16_loop<op, InputScalarType, InputVectorType>);
-}
-
-template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
-void elementwise_comp_op_32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
- elementwise_op<InputScalarType, uint8_t, InputVectorType>(in1, in2, out, window,
- &elementwise_comp_op_scalar<op, InputScalarType>,
- &elementwise_comp_op_broadcast_32_loop<op, InputScalarType, InputVectorType>,
- &elementwise_comp_op_32_loop<op, InputScalarType, InputVectorType>);
-}
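-
-// The suffix on the three dispatchers above is the input element width in
-// bits; the comparison result is always written as uint8_t (0x00 for false,
-// 0xFF for true), with wider vector results narrowed via vmovn before the
-// store.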
-} // namespace cpu
-} // namespace arm_compute
-
-#endif /* SRC_CORE_NEON_KERNELS_ELEMENTWISE_LIST_H */ \ No newline at end of file
diff --git a/src/core/cpu/kernels/elementwise/neon/elementwise_quantized_list.h b/src/core/cpu/kernels/elementwise/neon/elementwise_quantized_list.h
deleted file mode 100644
index 1ff4632f5c..0000000000
--- a/src/core/cpu/kernels/elementwise/neon/elementwise_quantized_list.h
+++ /dev/null
@@ -1,654 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_CORE_NEON_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H
-#define SRC_CORE_NEON_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H
-
-#include "src/core/cpu/kernels/elementwise/neon/elementwise_list.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-float32x4x4_t load_quantized(const uint8_t *input1_ptr, const int32x4_t &offset, const float32x4_t &scale)
-{
- qasymm8x16_t x = vld1q_u8(input1_ptr);
- const float32x4x4_t out =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(x))))), offset)), scale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(x))))), offset)), scale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(x))))), offset)), scale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(x))))), offset)), scale),
- }
- };
- return out;
-}
-
-float32x4x4_t load_quantized_signed(const int8_t *input1_ptr, const int32x4_t &offset, const float32x4_t &scale)
-{
- qasymm8x16_signed_t x = vld1q_s8(input1_ptr);
- const float32x4x4_t out =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(x)))), offset)), scale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(x)))), offset)), scale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(x)))), offset)), scale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(x)))), offset)), scale),
- }
- };
- return out;
-}
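-
-// Scalar equivalent of the two loaders above (hypothetical, for illustration):
-// every quantized lane is dequantized as (q - offset) * scale before the
-// floating-point arithmetic runs.
-inline float dequantize_lane_example(uint8_t q, int32_t offset, float scale)
-{
- return static_cast<float>(static_cast<int32_t>(q) - offset) * scale;
-}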
-
-void store_quantized(uint8_t *output_ptr, const uint32x4x4_t &out)
-{
- const uint8x8_t pa = vqmovn_u16(vcombine_u16(vqmovn_u32(out.val[0]), vqmovn_u32(out.val[1])));
- const uint8x8_t pb = vqmovn_u16(vcombine_u16(vqmovn_u32(out.val[2]), vqmovn_u32(out.val[3])));
- vst1q_u8(output_ptr, vcombine_u8(pa, pb));
-}
-
-void store_quantized(uint8_t *output_ptr, const int32x4x4_t &out)
-{
- const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(out.val[0]), vqmovn_s32(out.val[1])));
- const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(out.val[2]), vqmovn_s32(out.val[3])));
- vst1q_u8(output_ptr, vcombine_u8(pa, pb));
-}
-
-void store_quantized(uint8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale)
-{
- int32x4x4_t out =
- {
- {
- vcvtq_s32_f32(vmlaq_f32(offset, rf.val[0], invscale)),
- vcvtq_s32_f32(vmlaq_f32(offset, rf.val[1], invscale)),
- vcvtq_s32_f32(vmlaq_f32(offset, rf.val[2], invscale)),
- vcvtq_s32_f32(vmlaq_f32(offset, rf.val[3], invscale)),
- }
- };
- store_quantized(output_ptr, out);
-}
-
-void store_quantized_signed(int8_t *output_ptr, const int32x4x4_t &out)
-{
- const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(out.val[0]), vqmovn_s32(out.val[1])));
- const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(out.val[2]), vqmovn_s32(out.val[3])));
- vst1q_s8(output_ptr, vcombine_s8(pa, pb));
-}
-
-void store_quantized_signed(int8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale)
-{
- int32x4x4_t out =
- {
- {
- vcvtq_s32_f32(vmlaq_f32(offset, rf.val[0], invscale)),
- vcvtq_s32_f32(vmlaq_f32(offset, rf.val[1], invscale)),
- vcvtq_s32_f32(vmlaq_f32(offset, rf.val[2], invscale)),
- vcvtq_s32_f32(vmlaq_f32(offset, rf.val[3], invscale)),
- }
- };
- store_quantized_signed(output_ptr, out);
-}
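-
-// Scalar equivalent of the float-input stores above (hypothetical, for
-// illustration): results are requantized as q = v / scale + offset, with
-// saturation handled by the narrowing vector moves.
-inline int32_t requantize_lane_example(float v, float offset, float invscale)
-{
- return static_cast<int32_t>(v * invscale + offset); // truncates like vcvtq_s32_f32
-}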
-
-template <ArithmeticOperation op>
-inline uint8_t elementwise_arithm_op_quantized_scalar(const float &a, const float &b, UniformQuantizationInfo qinfo)
-{
- return quantize_qasymm8(elementwise_arithm_op_scalar<op>(a, b), qinfo);
-}
-
-template <ArithmeticOperation op>
-inline int8_t elementwise_arithm_op_quantized_signed_scalar(const float &a, const float &b, UniformQuantizationInfo qinfo)
-{
- return quantize_qasymm8_signed(elementwise_arithm_op_scalar<op>(a, b), qinfo);
-}
-
-template <ArithmeticOperation op>
-inline float32x4x4_t elementwise_arithm_op(const float32x4x4_t &a, const float32x4x4_t &b)
-{
- using neon_vector_float = wrapper::traits::neon_vector<float, 4>;
- float32x4x4_t out =
- {
- {
- elementwise_arithm_op<op, neon_vector_float>(a.val[0], b.val[0]),
- elementwise_arithm_op<op, neon_vector_float>(a.val[1], b.val[1]),
- elementwise_arithm_op<op, neon_vector_float>(a.val[2], b.val[2]),
- elementwise_arithm_op<op, neon_vector_float>(a.val[3], b.val[3]),
- }
- };
- return out;
-}
-
-template <ComparisonOperation op>
-inline uint8_t elementwise_comp_op_quantized_scalar(const float &a, const float &b, UniformQuantizationInfo qinfo)
-{
- ARM_COMPUTE_UNUSED(qinfo);
- return elementwise_comp_op_scalar<op>(a, b);
-}
-
-template <ComparisonOperation op>
-inline uint32x4x4_t elementwise_comp_op(const float32x4x4_t &a, const float32x4x4_t &b)
-{
- uint32x4x4_t out =
- {
- {
- elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[0], b.val[0]),
- elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[1], b.val[1]),
- elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[2], b.val[2]),
- elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[3], b.val[3])
- }
- };
- return out;
-}
-
-template <ArithmeticOperation op>
-inline int elementwise_arithm_op_quantized_loop(int window_start_x, int window_end_x, int window_step_x,
- const uint8_t *input1_ptr, const uint8_t *input2_ptr, uint8_t *output_ptr,
- int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2,
- float32x4_t voffseto, float32x4_t invvscaleo)
-{
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- // Get inputs and compute output
- const float32x4x4_t af = load_quantized(input1_ptr + x, voffset1, vscale1);
- const float32x4x4_t bf = load_quantized(input2_ptr + x, voffset2, vscale2);
- const float32x4x4_t rf = elementwise_arithm_op<op>(af, bf);
- store_quantized(output_ptr + x, rf, voffseto, invvscaleo);
- }
- return x;
-}
-
-template <ArithmeticOperation op>
-inline int elementwise_arithm_op_quantized_signed_loop(int window_start_x, int window_end_x, int window_step_x,
- const int8_t *input1_ptr, const int8_t *input2_ptr, int8_t *output_ptr,
- int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2,
- float32x4_t voffseto, float32x4_t invvscaleo)
-{
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- // Get inputs and compute output
- const float32x4x4_t af = load_quantized_signed(input1_ptr + x, voffset1, vscale1);
- const float32x4x4_t bf = load_quantized_signed(input2_ptr + x, voffset2, vscale2);
- const float32x4x4_t rf = elementwise_arithm_op<op>(af, bf);
- store_quantized_signed(output_ptr + x, rf, voffseto, invvscaleo);
- }
- return x;
-}
-
-template <ArithmeticOperation op>
-inline int elementwise_arithm_op_quantized_broadcast_loop(int window_start_x, int window_end_x, int window_step_x,
- const uint8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, uint8_t *output_ptr,
- int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast,
- float32x4_t voffseto, float32x4_t invvscaleo, bool reorder)
-{
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const float32x4x4_t af = load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast);
- const float32x4x4_t rf = elementwise_arithm_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector);
- store_quantized(output_ptr + x, rf, voffseto, invvscaleo);
- }
- return x;
-}
-template <ArithmeticOperation op>
-inline int elementwise_arithm_op_quantized_signed_broadcast_loop(int window_start_x, int window_end_x, int window_step_x,
- const int8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, int8_t *output_ptr,
- int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast,
- float32x4_t voffseto, float32x4_t invvscaleo, bool reorder)
-{
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const float32x4x4_t af = load_quantized_signed(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast);
- const float32x4x4_t rf = elementwise_arithm_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector);
- store_quantized_signed(output_ptr + x, rf, voffseto, invvscaleo);
- }
- return x;
-}
-
-template <ComparisonOperation op>
-inline int elementwise_comp_op_quantized_loop(int window_start_x, int window_end_x, int window_step_x,
- const uint8_t *input1_ptr, const uint8_t *input2_ptr, uint8_t *output_ptr,
- int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2,
- float32x4_t voffseto, float32x4_t invvscaleo)
-{
- ARM_COMPUTE_UNUSED(voffseto, invvscaleo);
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const float32x4x4_t af = load_quantized(input1_ptr + x, voffset1, vscale1);
- const float32x4x4_t bf = load_quantized(input2_ptr + x, voffset2, vscale2);
- const uint32x4x4_t rf = elementwise_comp_op<op>(af, bf);
- store_quantized(output_ptr + x, rf);
- }
- return x;
-}
-
-template <ComparisonOperation op>
-inline int elementwise_comp_op_quantized_signed_loop(int window_start_x, int window_end_x, int window_step_x,
- const int8_t *input1_ptr, const int8_t *input2_ptr, uint8_t *output_ptr,
- int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2,
- float32x4_t voffseto, float32x4_t invvscaleo)
-{
- ARM_COMPUTE_UNUSED(voffseto, invvscaleo);
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const float32x4x4_t af = load_quantized_signed(input1_ptr + x, voffset1, vscale1);
- const float32x4x4_t bf = load_quantized_signed(input2_ptr + x, voffset2, vscale2);
- const uint32x4x4_t rf = elementwise_comp_op<op>(af, bf);
- store_quantized(output_ptr + x, rf);
- }
- return x;
-}
-
-template <ComparisonOperation op>
-inline int elementwise_comp_op_quantized_broadcast_loop(int window_start_x, int window_end_x, int window_step_x,
- const uint8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, uint8_t *output_ptr,
- int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast,
- float32x4_t voffseto, float32x4_t invvscaleo, bool reorder)
-{
- ARM_COMPUTE_UNUSED(voffseto, invvscaleo);
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const float32x4x4_t af = load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast);
- const uint32x4x4_t rf = elementwise_comp_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector);
- store_quantized(output_ptr + x, rf);
- }
- return x;
-}
-
-template <ComparisonOperation op>
-inline int elementwise_comp_op_quantized_signed_broadcast_loop(int window_start_x, int window_end_x, int window_step_x,
- const int8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, uint8_t *output_ptr,
- int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast,
- float32x4_t voffseto, float32x4_t invvscaleo, bool reorder)
-{
- ARM_COMPUTE_UNUSED(voffseto, invvscaleo);
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const float32x4x4_t af = load_quantized_signed(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast);
- const uint32x4x4_t rf = elementwise_comp_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector);
- store_quantized(output_ptr + x, rf);
- }
- return x;
-}
-
-void elementwise_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
- uint8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo),
- int (*broadcast_func)(int, int, int, const uint8_t *, float32x4x4_t, uint8_t *, int32x4_t, float32x4_t,
- float32x4_t, float32x4_t, const bool),
- int (*neon_func)(int, int, int, const uint8_t *, const uint8_t *, uint8_t *,
- int32x4_t, int32x4_t, float32x4_t, float32x4_t,
- float32x4_t, float32x4_t))
-{
- // Create input windows
- Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle manually
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- const int window_step_x = 16;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();
-
- const UniformQuantizationInfo output_qinfo = out->info()->quantization_info().uniform();
-
- // Output quantization info (0.5 is added so that the truncating float-to-int conversion rounds to the nearest integer, with halves rounding away from zero)
- const float32x4_t voffseto = vdupq_n_f32(output_qinfo.offset + 0.5f);
- const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_qinfo.scale);
-
- if(is_broadcast_across_x)
- {
- // Select the broadcast input on the X axis
- const bool is_broadcast_input_2 = input2_win.x().step() == 0;
- Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
- Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
- const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
- const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
-
- const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();
- const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
-
- const int32x4_t voffset_non_broadcast = vdupq_n_s32(non_broadcast_qinfo.offset);
- const float32x4_t vscale_non_broadcast = vdupq_n_f32(non_broadcast_qinfo.scale);
-
- // Clear X Dimension on execution window as we handle manually
- non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator broadcast_input(broadcast_tensor, broadcast_win);
- Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
- Iterator output(out, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
-
- const uint8_t broadcast_value = *reinterpret_cast<const uint8_t *>(broadcast_input.ptr());
- const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_u8(broadcast_value), broadcast_qinfo);
-
- int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_vector, output_ptr,
- voffset_non_broadcast, vscale_non_broadcast, voffseto, invvscaleo, !is_broadcast_input_2);
- for(; x < window_end_x; ++x)
- {
- const float afs = dequantize_qasymm8(*(non_broadcast_input_ptr + x), non_broadcast_qinfo);
- const float bfs = dequantize_qasymm8(broadcast_value, broadcast_qinfo);
- *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, !is_broadcast_input_2 ? afs : bfs, output_qinfo);
- }
- },
- broadcast_input, non_broadcast_input, output);
- }
- else
- {
- const UniformQuantizationInfo input1_qinfo = in1->info()->quantization_info().uniform();
- const UniformQuantizationInfo input2_qinfo = in2->info()->quantization_info().uniform();
-
- // Input1 quantization info
- const int32x4_t voffset1 = vdupq_n_s32(input1_qinfo.offset);
- const float32x4_t vscale1 = vdupq_n_f32(input1_qinfo.scale);
-
- // Input2 quantization info
- const int32x4_t voffset2 = vdupq_n_s32(input2_qinfo.offset);
- const float32x4_t vscale2 = vdupq_n_f32(input2_qinfo.scale);
-
- // Clear X Dimension on execution window as we handle manually
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(in1, input1_win);
- Iterator input2(in2, input2_win);
- Iterator output(out, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
-
- int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr, voffset1, voffset2,
- vscale1, vscale2, voffseto, invvscaleo);
- for(; x < window_end_x; ++x)
- {
- const float afs = dequantize_qasymm8(*(input1_ptr + x), input1_qinfo);
- const float bfs = dequantize_qasymm8(*(input2_ptr + x), input2_qinfo);
- *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo);
- }
- },
- input1, input2, output);
- }
-}
-
-void elementwise_comp_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
- uint8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo),
- int (*broadcast_func)(int, int, int, const int8_t *, float32x4x4_t, uint8_t *, int32x4_t, float32x4_t,
- float32x4_t, float32x4_t, const bool),
- int (*neon_func)(int, int, int, const int8_t *, const int8_t *, uint8_t *,
- int32x4_t, int32x4_t, float32x4_t, float32x4_t,
- float32x4_t, float32x4_t))
-{
- // Create input windows
- Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle manually
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- const int window_step_x = 16;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();
-
- const UniformQuantizationInfo output_qinfo = out->info()->quantization_info().uniform();
-
- const float32x4_t voffseto = vdupq_n_f32(output_qinfo.offset);
- const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_qinfo.scale);
-
- if(is_broadcast_across_x)
- {
- // Select the broadcast input on the X axis
- const bool is_broadcast_input_2 = input2_win.x().step() == 0;
- Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
- Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
- const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
- const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
-
- const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();
- const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
-
- const int32x4_t voffset_non_broadcast = vdupq_n_s32(non_broadcast_qinfo.offset);
- const float32x4_t vscale_non_broadcast = vdupq_n_f32(non_broadcast_qinfo.scale);
-
- // Clear X Dimension on execution window as we handle manually
- non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator broadcast_input(broadcast_tensor, broadcast_win);
- Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
- Iterator output(out, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
-
- const int8_t broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr());
- const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_s8(broadcast_value), broadcast_qinfo);
-
- int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_vector, output_ptr,
- voffset_non_broadcast, vscale_non_broadcast, voffseto, invvscaleo, !is_broadcast_input_2);
- for(; x < window_end_x; ++x)
- {
- const float afs = dequantize_qasymm8_signed(*(non_broadcast_input_ptr + x), non_broadcast_qinfo);
- const float bfs = dequantize_qasymm8_signed(broadcast_value, broadcast_qinfo);
- *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, !is_broadcast_input_2 ? afs : bfs, output_qinfo);
- }
- },
- broadcast_input, non_broadcast_input, output);
- }
- else
- {
- const UniformQuantizationInfo input1_qinfo = in1->info()->quantization_info().uniform();
- const UniformQuantizationInfo input2_qinfo = in2->info()->quantization_info().uniform();
-
- // Input1 quantization info
- const int32x4_t voffset1 = vdupq_n_s32(input1_qinfo.offset);
- const float32x4_t vscale1 = vdupq_n_f32(input1_qinfo.scale);
-
- // Input2 quantization info
- const int32x4_t voffset2 = vdupq_n_s32(input2_qinfo.offset);
- const float32x4_t vscale2 = vdupq_n_f32(input2_qinfo.scale);
-
- // Clear X Dimension on execution window as we handle it manually
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(in1, input1_win);
- Iterator input2(in2, input2_win);
- Iterator output(out, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
-
- int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr, voffset1, voffset2,
- vscale1, vscale2, voffseto, invvscaleo);
- for(; x < window_end_x; ++x)
- {
- const float afs = dequantize_qasymm8_signed(*(input1_ptr + x), input1_qinfo);
- const float bfs = dequantize_qasymm8_signed(*(input2_ptr + x), input2_qinfo);
- *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo);
- }
- },
- input1, input2, output);
- }
-}
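For reference, the operand swap in the broadcast branches above is what keeps non-commutative operations correct: when input1 is the broadcast tensor (`!is_broadcast_input_2`), the scalar value must reach the op as the first argument. A minimal standalone sketch of that rule (the name `broadcast_apply` is hypothetical, not part of the library):

template <typename F>
void broadcast_apply(const float *non_broadcast, float scalar, float *out, int n,
                     bool scalar_is_first_operand, F op)
{
    for(int i = 0; i < n; ++i)
    {
        // Reorder so e.g. SUB/DIV/Greater see (scalar, element) when input1 broadcasts
        out[i] = scalar_is_first_operand ? op(scalar, non_broadcast[i])
                                         : op(non_broadcast[i], scalar);
    }
}

For example, broadcast_apply(src, 2.f, dst, n, true, [](float a, float b) { return a - b; }); computes 2 - src[i] for every element, matching the afs/bfs swap in the scalar tails.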
-
-void elementwise_op_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
- int8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo),
- int (*broadcast_func)(int, int, int, const int8_t *, float32x4x4_t, int8_t *, int32x4_t, float32x4_t,
- float32x4_t, float32x4_t, const bool),
- int (*neon_func)(int, int, int, const int8_t *, const int8_t *, int8_t *,
- int32x4_t, int32x4_t, float32x4_t, float32x4_t,
- float32x4_t, float32x4_t))
-{
- // Create input windows
- Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle it manually
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- const int window_step_x = 16;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();
-
- const UniformQuantizationInfo output_qinfo = out->info()->quantization_info().uniform();
-
- const float32x4_t voffseto = vdupq_n_f32(output_qinfo.offset);
- const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_qinfo.scale);
-
- if(is_broadcast_across_x)
- {
- // Select the broadcast input on the X axis
- const bool is_broadcast_input_2 = input2_win.x().step() == 0;
- Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
- Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
- const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
- const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
-
- const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();
- const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
-
- const int32x4_t voffset_non_broadcast = vdupq_n_s32(non_broadcast_qinfo.offset);
- const float32x4_t vscale_non_broadcast = vdupq_n_f32(non_broadcast_qinfo.scale);
-
- // Clear X Dimension on execution window as we handle it manually
- non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator broadcast_input(broadcast_tensor, broadcast_win);
- Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
- Iterator output(out, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
-
- const int8_t broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr());
- const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_s8(broadcast_value), broadcast_qinfo);
-
- int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_vector, output_ptr,
- voffset_non_broadcast, vscale_non_broadcast, voffseto, invvscaleo, !is_broadcast_input_2);
- for(; x < window_end_x; ++x)
- {
- const float afs = dequantize_qasymm8_signed(*(non_broadcast_input_ptr + x), non_broadcast_qinfo);
- const float bfs = dequantize_qasymm8_signed(broadcast_value, broadcast_qinfo);
- *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, !is_broadcast_input_2 ? afs : bfs, output_qinfo);
- }
- },
- broadcast_input, non_broadcast_input, output);
- }
- else
- {
- const UniformQuantizationInfo input1_qinfo = in1->info()->quantization_info().uniform();
- const UniformQuantizationInfo input2_qinfo = in2->info()->quantization_info().uniform();
-
- // Input1 quantization info
- const int32x4_t voffset1 = vdupq_n_s32(input1_qinfo.offset);
- const float32x4_t vscale1 = vdupq_n_f32(input1_qinfo.scale);
-
- // Input2 quantization info
- const int32x4_t voffset2 = vdupq_n_s32(input2_qinfo.offset);
- const float32x4_t vscale2 = vdupq_n_f32(input2_qinfo.scale);
-
- // Clear X Dimension on execution window as we handle it manually
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(in1, input1_win);
- Iterator input2(in2, input2_win);
- Iterator output(out, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
-
- int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr, voffset1, voffset2,
- vscale1, vscale2, voffseto, invvscaleo);
- for(; x < window_end_x; ++x)
- {
- const float afs = dequantize_qasymm8_signed(*(input1_ptr + x), input1_qinfo);
- const float bfs = dequantize_qasymm8_signed(*(input2_ptr + x), input2_qinfo);
- *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo);
- }
- },
- input1, input2, output);
- }
-}
-
-template <ArithmeticOperation op>
-void elementwise_arithm_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
- elementwise_op_quantized(in1, in2, out, window, &elementwise_arithm_op_quantized_scalar<op>,
- &elementwise_arithm_op_quantized_broadcast_loop<op>,
- &elementwise_arithm_op_quantized_loop<op>);
-}
-template <ArithmeticOperation op>
-void elementwise_arithm_op_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
- elementwise_op_quantized_signed(in1, in2, out, window, &elementwise_arithm_op_quantized_signed_scalar<op>,
- &elementwise_arithm_op_quantized_signed_broadcast_loop<op>,
- &elementwise_arithm_op_quantized_signed_loop<op>);
-}
-
-template <ComparisonOperation op>
-void elementwise_comp_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
- elementwise_op_quantized(in1, in2, out, window, &elementwise_comp_op_quantized_scalar<op>,
- &elementwise_comp_op_quantized_broadcast_loop<op>,
- &elementwise_comp_op_quantized_loop<op>);
-}
-
-template <ComparisonOperation op>
-void elementwise_comp_op_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
- elementwise_comp_quantized_signed(in1, in2, out, window, &elementwise_comp_op_quantized_scalar<op>,
- &elementwise_comp_op_quantized_signed_broadcast_loop<op>,
- &elementwise_comp_op_quantized_signed_loop<op>);
-}
-} // namespace cpu
-} // namespace arm_compute
-
-#endif /* SRC_CORE_NEON_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H */
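For reference, the scalar tail loops in the file above all follow the same dequantize-operate-requantize round trip. A minimal standalone sketch under assumed semantics (the struct `QInfo` and function names are hypothetical; the library expresses the same arithmetic through UniformQuantizationInfo and its dequantize/quantize helpers):

#include <algorithm>
#include <cmath>
#include <cstdint>

struct QInfo { float scale; int32_t offset; };

// real = scale * (q - offset)
inline float dequant_u8(uint8_t q, const QInfo &qi)
{
    return qi.scale * (static_cast<int32_t>(q) - qi.offset);
}

// q = round(real / scale) + offset, saturated to [0, 255]
inline uint8_t quant_u8(float v, const QInfo &qi)
{
    const int32_t q = static_cast<int32_t>(std::lround(v / qi.scale)) + qi.offset;
    return static_cast<uint8_t>(std::min(255, std::max(0, q)));
}

// Elementwise ADD on QASYMM8 buffers, mirroring the scalar tails above:
// dequantize both inputs, operate in float, requantize with the output qinfo
void q_add_u8(const uint8_t *a, const QInfo &qa, const uint8_t *b, const QInfo &qb,
              uint8_t *out, const QInfo &qo, int n)
{
    for(int i = 0; i < n; ++i)
    {
        out[i] = quant_u8(dequant_u8(a[i], qa) + dequant_u8(b[i], qb), qo);
    }
}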
diff --git a/src/core/cpu/kernels/elementwise/neon/elementwise_unary_list.h b/src/core/cpu/kernels/elementwise/neon/elementwise_unary_list.h
deleted file mode 100644
index 307e95fae9..0000000000
--- a/src/core/cpu/kernels/elementwise/neon/elementwise_unary_list.h
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_CORE_NEON_KERNELS_ELEMENTWISE_UNARY_LIST_H
-#define SRC_CORE_NEON_KERNELS_ELEMENTWISE_UNARY_LIST_H
-
-#include "arm_compute/core/Types.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-template <typename ScalarType>
-inline ScalarType elementwise_op_scalar_imp(ElementWiseUnary op, const ScalarType &a)
-{
- switch(op)
- {
- case ElementWiseUnary::RSQRT:
- return 1 / std::sqrt(a);
- case ElementWiseUnary::EXP:
- return std::exp(a);
- case ElementWiseUnary::NEG:
- return -a;
- case ElementWiseUnary::LOG:
- return std::log(a);
- case ElementWiseUnary::ABS:
- return std::abs(a);
- case ElementWiseUnary::ROUND:
- return support::cpp11::nearbyint(a);
- case ElementWiseUnary::SIN:
- return std::sin(a);
- default:
- ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
- }
-}
-
-template <typename ScalarType, typename VectorType>
-inline VectorType elementwise_op_imp(ElementWiseUnary op, const VectorType &a)
-{
- switch(op)
- {
- case ElementWiseUnary::RSQRT:
- return wrapper::vinvsqrt(a);
- case ElementWiseUnary::EXP:
- return wrapper::vexpq(a);
- case ElementWiseUnary::NEG:
- return wrapper::vneg(a);
- case ElementWiseUnary::LOG:
- return wrapper::vlog(a);
- case ElementWiseUnary::ABS:
- return wrapper::vabs(a);
- case ElementWiseUnary::ROUND:
- return wrapper::vround(a);
- case ElementWiseUnary::SIN:
- return wrapper::vsin(a);
- default:
- ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
- }
-}
-
-template <typename ScalarType>
-void elementwise_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op)
-{
- const int window_step_x = 16 / sizeof(ScalarType);
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input(in, win);
- Iterator output(out, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
- const auto input_ptr = reinterpret_cast<const ScalarType *>(input.ptr());
-
- int x = window_start_x;
- for(; x <= window_end_x - window_step_x; x += window_step_x)
- {
- wrapper::vstore(output_ptr + x, elementwise_op_imp<ScalarType>(op, wrapper::vloadq(input_ptr + x)));
- }
- for(; x < window_end_x; ++x)
- {
- *(output_ptr + x) = elementwise_op_scalar_imp(op, *(input_ptr + x));
- }
- },
- input, output);
-}
-
-} // namespace cpu
-} // namespace arm_compute
-
-#endif // SRC_CORE_NEON_KERNELS_ELEMENTWISE_UNARY_LIST_H \ No newline at end of file
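For reference, elementwise_op in the deleted header above is an instance of the standard vector-main-loop-plus-scalar-tail shape. A minimal sketch with raw NEON intrinsics instead of the library's wrapper:: layer (AArch64 assumed; `neg_f32` is a hypothetical name):

#include <arm_neon.h>

void neg_f32(const float *in, float *out, int n)
{
    const int step = 16 / static_cast<int>(sizeof(float)); // 4 lanes per 128-bit register
    int x = 0;
    // Main loop: whole vectors at a time
    for(; x <= n - step; x += step)
    {
        vst1q_f32(out + x, vnegq_f32(vld1q_f32(in + x)));
    }
    // Scalar tail: leftover elements one at a time
    for(; x < n; ++x)
    {
        out[x] = -in[x];
    }
}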
diff --git a/src/core/cpu/kernels/elementwise/sve/elementwise.cpp b/src/core/cpu/kernels/elementwise/sve/elementwise.cpp
deleted file mode 100644
index 58ebb28fe5..0000000000
--- a/src/core/cpu/kernels/elementwise/sve/elementwise.cpp
+++ /dev/null
@@ -1,311 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#if defined(__ARM_FEATURE_SVE)
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Types.h"
-#include "src/core/cpu/kernels/elementwise/sve/elementwise_list.h"
-#include <arm_sve.h>
-
-namespace arm_compute
-{
-namespace cpu
-{
-using namespace arm_compute::wrapper;
-
-template <typename InputScalarType, typename OutputScalarType, typename OperatorType>
-struct LoopArguments
-{
- OperatorType op;
- const InputScalarType *input1_ptr;
- const InputScalarType *input2_ptr;
- OutputScalarType *output_ptr;
-};
-
-template <typename InputScalarType, typename OutputScalarType, typename OperatorType>
-struct BroadcastLoopArguments
-{
- OperatorType op;
- const InputScalarType *input1_ptr;
- InputScalarType broadcast_value;
- OutputScalarType *output_ptr;
- bool reorder;
-};
-
-template <typename InputScalarType, typename OutputScalarType>
-void arithmetic_op_loop(svbool_t pg, const LoopArguments<InputScalarType, OutputScalarType, ArithmeticOperation> &args)
-{
- const auto in1 = svld1(pg, args.input1_ptr);
- const auto in2 = svld1(pg, args.input2_ptr);
- const auto res = elementwise_arithmetic_op<typename sve_vector<InputScalarType>::type>(pg, in1, in2, args.op);
- svst1(pg, args.output_ptr, res);
-}
-
-template <typename InputScalarType, typename OutputScalarType>
-void arithmetic_op_broadcast_loop(svbool_t pg, const BroadcastLoopArguments<InputScalarType, OutputScalarType, ArithmeticOperation> &args)
-{
- const auto non_broadcast_vector = svld1(pg, args.input1_ptr);
- const auto broadcast_vector = svdup_n(args.broadcast_value);
- const auto in1 = args.reorder ? broadcast_vector : non_broadcast_vector;
- const auto in2 = args.reorder ? non_broadcast_vector : broadcast_vector;
- const auto res = elementwise_arithmetic_op<typename sve_vector<InputScalarType>::type>(pg, in1, in2, args.op);
- svst1(pg, args.output_ptr, res);
-}
-
-template <typename InputScalarType, typename OutputScalarType>
-void comparison_op_loop(svbool_t pg, const LoopArguments<InputScalarType, OutputScalarType, ComparisonOperation> &args)
-{
- const auto in1 = svld1(pg, args.input1_ptr);
- const auto in2 = svld1(pg, args.input2_ptr);
- const auto res = elementwise_comparison_op<typename sve_vector<InputScalarType>::type, typename sve_vector<OutputScalarType>::type>(pg, in1, in2, args.op);
- const svbool_t output_pg = narrow_to_byte_predicate<sizeof(InputScalarType)>(pg);
- svst1(output_pg, args.output_ptr, res);
-}
-
-template <typename InputScalarType, typename OutputScalarType>
-void comparison_op_broadcast_loop(svbool_t pg, const BroadcastLoopArguments<InputScalarType, OutputScalarType, ComparisonOperation> &args)
-{
- const auto non_broadcast_vector = svld1(pg, args.input1_ptr);
- const auto broadcast_vector = svdup_n(args.broadcast_value);
- const auto in1 = args.reorder ? broadcast_vector : non_broadcast_vector;
- const auto in2 = args.reorder ? non_broadcast_vector : broadcast_vector;
- const auto res = elementwise_comparison_op<typename sve_vector<InputScalarType>::type, typename sve_vector<OutputScalarType>::type>(pg, in1, in2, args.op);
- const svbool_t output_pg = narrow_to_byte_predicate<sizeof(InputScalarType)>(pg);
- svst1(output_pg, args.output_ptr, res);
-}
-
-template <typename InputScalarType, typename OutputScalarType, typename OperatorType>
-using LoopFuncType = void (*)(svbool_t, const LoopArguments<InputScalarType, OutputScalarType, OperatorType> &);
-
-template <typename InputScalarType, typename OutputScalarType, typename OperatorType>
-using BroadcastLoopFuncType = void (*)(svbool_t, const BroadcastLoopArguments<InputScalarType, OutputScalarType, OperatorType> &);
-
-template <typename InputVectorType, typename OutputVectorType, typename OperatorType,
- typename InputScalarType = typename sve_scalar<InputVectorType>::type,
- typename OutputScalarType = typename sve_scalar<OutputVectorType>::type>
-void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
- OperatorType op,
- LoopFuncType<InputScalarType, OutputScalarType, OperatorType> func,
- BroadcastLoopFuncType<InputScalarType, OutputScalarType, OperatorType> broadcast_func)
-{
- const auto all_true_pg = svptrue<InputScalarType>();
-
- // Create input windows
- Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle it manually
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();
-
- if(is_broadcast_across_x)
- {
- const bool is_broadcast_input_2 = input2_win.x().step() == 0;
- Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
- Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
- const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
- const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
-
- // Clear X Dimension on execution window as we handle it manually
- non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator broadcast_input(broadcast_tensor, broadcast_win);
- Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
- Iterator output(out, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
- const auto non_broadcast_input_ptr = reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr());
- const InputScalarType broadcast_value = *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr());
-
- int x = window_start_x;
-
- svbool_t pg = svwhilelt<InputScalarType>(x, window_end_x);
- do
- {
- broadcast_func(pg,
- {
- op,
- non_broadcast_input_ptr + x,
- broadcast_value,
- output_ptr + x,
- !is_broadcast_input_2
- });
- x += svcnt<InputScalarType>();
- pg = svwhilelt<InputScalarType>(x, window_end_x);
- }
- while(svptest_any(all_true_pg, pg));
- },
- broadcast_input, non_broadcast_input, output);
- }
- else
- {
- // Clear X Dimension on execution window as we handle it manually
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(in1, input1_win);
- Iterator input2(in2, input2_win);
- Iterator output(out, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
- const auto input1_ptr = reinterpret_cast<const InputScalarType *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const InputScalarType *>(input2.ptr());
-
- int x = window_start_x;
-
- svbool_t pg = svwhilelt<InputScalarType>(x, window_end_x);
- do
- {
- func(pg,
- {
- op,
- input1_ptr + x,
- input2_ptr + x,
- output_ptr + x
- });
- x += svcnt<InputScalarType>();
- pg = svwhilelt<InputScalarType>(x, window_end_x);
- }
- while(svptest_any(all_true_pg, pg));
- },
- input1, input2, output);
- }
-}
-
-template <ArithmeticOperation op, typename ScalarType>
-void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
- using VectorType = typename sve_vector<ScalarType>::type;
-
- elementwise_op<VectorType, VectorType, ArithmeticOperation>(in1, in2, out, window, op,
- &arithmetic_op_loop<ScalarType, ScalarType>,
- &arithmetic_op_broadcast_loop<ScalarType, ScalarType>);
-}
-
-template <ComparisonOperation op, typename InputScalarType, typename OutputScalarType = uint8_t>
-void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
- static_assert(sizeof(InputScalarType) >= sizeof(OutputScalarType), "input data type's width should be equal to or greater than output data type's width");
- using InputVectorType = typename sve_vector<InputScalarType>::type;
- using OutputVectorType = typename sve_vector<OutputScalarType>::type;
-
- elementwise_op<InputVectorType, OutputVectorType, ComparisonOperation>(in1, in2, out, window, op,
- &comparison_op_loop<InputScalarType, OutputScalarType>,
- &comparison_op_broadcast_loop<InputScalarType, OutputScalarType>);
-}
-
-template <>
-svint32_t elementwise_pow<svint32_t>(svbool_t &pg, const svint32_t &a, const svint32_t &b)
-{
- return svcvt_s32_z(pg, svpow_z(pg, svcvt_f32_z(pg, a), svcvt_f32_z(pg, b)));
-}
-
-template <>
-svint32_t elementwise_div<svint32_t>(svbool_t &pg, const svint32_t &a, const svint32_t &b)
-{
- return svcvt_s32_z(pg, svdiv_z(pg, svcvt_f32_z(pg, a), svcvt_f32_z(pg, b)));
-}
-
-template <>
-svint16_t elementwise_div<svint16_t>(svbool_t &pg, const svint16_t &a, const svint16_t &b)
-{
- ARM_COMPUTE_UNUSED(pg, a, b);
- ARM_COMPUTE_ERROR("Not supported");
-}
-
-template void elementwise_arithmetic_op<ArithmeticOperation::MAX, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::MAX, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::MAX, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::MAX, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-
-template void elementwise_arithmetic_op<ArithmeticOperation::MIN, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::MIN, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::MIN, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::MIN, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-
-template void elementwise_arithmetic_op<ArithmeticOperation::SQUARED_DIFF, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::SQUARED_DIFF, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::SQUARED_DIFF, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::SQUARED_DIFF, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-
-template void elementwise_arithmetic_op<ArithmeticOperation::PRELU, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::PRELU, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::PRELU, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::PRELU, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-
-template void elementwise_arithmetic_op<ArithmeticOperation::DIV, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::DIV, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::DIV, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::DIV, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-
-template void elementwise_arithmetic_op<ArithmeticOperation::POWER, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::POWER, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::POWER, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_arithmetic_op<ArithmeticOperation::POWER, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-
-template void elementwise_comparison_op<ComparisonOperation::Equal, float>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::Equal, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::Equal, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::Equal, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::Equal, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-
-template void elementwise_comparison_op<ComparisonOperation::NotEqual, float>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::NotEqual, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::NotEqual, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::NotEqual, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::NotEqual, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-
-template void elementwise_comparison_op<ComparisonOperation::Greater, float>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::Greater, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::Greater, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::Greater, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::Greater, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-
-template void elementwise_comparison_op<ComparisonOperation::GreaterEqual, float>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::GreaterEqual, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::GreaterEqual, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::GreaterEqual, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::GreaterEqual, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-
-template void elementwise_comparison_op<ComparisonOperation::Less, float>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::Less, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::Less, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::Less, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::Less, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-
-template void elementwise_comparison_op<ComparisonOperation::LessEqual, float>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::LessEqual, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::LessEqual, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::LessEqual, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void elementwise_comparison_op<ComparisonOperation::LessEqual, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-} // namespace cpu
-} // namespace arm_compute
-#endif /* defined(__ARM_FEATURE_SVE) */ \ No newline at end of file
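For reference, every loop in the deleted file above uses the same predicated SVE shape: svwhilelt builds a predicate covering only the remaining elements, so no scalar tail is needed. A minimal sketch with raw ACLE intrinsics (`neg_f32_sve` is a hypothetical name; assumes an SVE-enabled target):

#include <arm_sve.h>

void neg_f32_sve(const float *in, float *out, int n)
{
    int x = 0;
    svbool_t pg = svwhilelt_b32_s32(x, n);
    do
    {
        const svfloat32_t v = svld1_f32(pg, in + x); // inactive lanes are not read
        svst1_f32(pg, out + x, svneg_f32_z(pg, v));  // _z: inactive lanes zeroed
        x += static_cast<int>(svcntw());             // lanes per vector (VL / 32)
        pg = svwhilelt_b32_s32(x, n);
    }
    while(svptest_any(svptrue_b32(), pg));           // run while any lane is active
}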
diff --git a/src/core/cpu/kernels/elementwise/sve/elementwise_list.h b/src/core/cpu/kernels/elementwise/sve/elementwise_list.h
deleted file mode 100644
index a92a8648a8..0000000000
--- a/src/core/cpu/kernels/elementwise/sve/elementwise_list.h
+++ /dev/null
@@ -1,171 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_CORE_SVE_KERNELS_ELEMENTWISE_LIST_H
-#define SRC_CORE_SVE_KERNELS_ELEMENTWISE_LIST_H
-#if defined(ENABLE_SVE)
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/SVEMath.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
-#include "src/core/NEON/wrapper/svtraits.h"
-#include "src/core/cpu/kernels/elementwise/sve/elementwise_list.h"
-#include <arm_sve.h>
-
-namespace arm_compute
-{
-namespace cpu
-{
-using namespace arm_compute::wrapper;
-
-template <typename VectorType>
-VectorType elementwise_pow(svbool_t &pg, const VectorType &a, const VectorType &b)
-{
- return svpow_z(pg, a, b);
-}
-
-template <typename VectorType>
-VectorType elementwise_div(svbool_t &pg, const VectorType &a, const VectorType &b)
-{
- return svdiv_z(pg, a, b);
-}
-
-template <uint32_t bytewidth>
-svbool_t narrow_to_byte_predicate(svbool_t pg)
-{
- const auto all_false = svpfalse();
-
- switch(bytewidth)
- {
- case 8:
- pg = svuzp1_b32(pg, all_false);
- /* fall through */
- case 4:
- pg = svuzp1_b16(pg, all_false);
- /* fall through */
- case 2:
- pg = svuzp1_b8(pg, all_false);
- /* fall through */
- default:
- break;
- }
- return pg;
-}
-
-template <typename VectorType>
-VectorType elementwise_arithmetic_op(svbool_t &pg, const VectorType &a, const VectorType &b, ArithmeticOperation op)
-{
- using ScalarType = typename wrapper::sve_scalar<VectorType>::type;
- VectorType res{};
-
- switch(op)
- {
- case ArithmeticOperation::MAX:
- res = svmax_z(pg, a, b);
- break;
- case ArithmeticOperation::MIN:
- res = svmin_z(pg, a, b);
- break;
- case ArithmeticOperation::SQUARED_DIFF:
- {
- const auto tmp = svsub_z(pg, a, b);
- res = svmul_z(pg, tmp, tmp);
- break;
- }
- case ArithmeticOperation::PRELU:
- {
- const auto zero = svdup_n(ScalarType(0));
- const auto tmp = svmul_z(pg, a, b);
- const auto gt = svcmpgt(pg, a, zero);
- res = svsel(gt, a, tmp);
- break;
- }
- case ArithmeticOperation::DIV:
- {
- res = elementwise_div(pg, a, b);
- break;
- }
- case ArithmeticOperation::POWER:
- {
- res = elementwise_pow(pg, a, b);
- break;
- }
- default:
- ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
- }
-
- return res;
-}
-
-template <typename InputVectorType, typename OutputVectorType>
-OutputVectorType elementwise_comparison_op(svbool_t &pg, const InputVectorType &a, const InputVectorType &b, ComparisonOperation op)
-{
- svbool_t selection_vector{};
-
- switch(op)
- {
- case ComparisonOperation::Equal:
- selection_vector = svcmpeq(pg, a, b);
- break;
- case ComparisonOperation::NotEqual:
- selection_vector = svcmpne(pg, a, b);
- break;
- case ComparisonOperation::Greater:
- selection_vector = svcmpgt(pg, a, b);
- break;
- case ComparisonOperation::GreaterEqual:
- selection_vector = svcmpge(pg, a, b);
- break;
- case ComparisonOperation::Less:
- selection_vector = svcmplt(pg, a, b);
- break;
- case ComparisonOperation::LessEqual:
- selection_vector = svcmple(pg, a, b);
- break;
- default:
- ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
- }
-
- using InputScalarType = typename wrapper::sve_scalar<InputVectorType>::type;
- selection_vector = narrow_to_byte_predicate<sizeof(InputScalarType)>(selection_vector);
-
- using OutputScalarType = typename wrapper::sve_scalar<OutputVectorType>::type;
- const auto false_vector = svdup_n(static_cast<OutputScalarType>((uint32_t)0));
- const auto true_vector = svdup_n(static_cast<OutputScalarType>(~(uint32_t)0));
- auto ret = svsel(selection_vector, true_vector, false_vector);
-
- return ret;
-}
-
-template <ArithmeticOperation op, typename ScalarType>
-void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-
-template <ComparisonOperation op, typename ScalarType, typename OutputScalarType = uint8_t>
-void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-} // namespace cpu
-} // namespace arm_compute
-#endif // defined(ENABLE_SVE)
-#endif /* SRC_CORE_SVE_KERNELS_ELEMENTWISE_LIST_H */
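For reference, elementwise_comparison_op above encodes its boolean results as full bytes: svsel picks between an all-ones and an all-zeros vector, so a true lane becomes 0xFF in the uint8_t output. A scalar model of that convention (`greater_u8` is a hypothetical standalone name):

#include <cstdint>

void greater_u8(const float *a, const float *b, uint8_t *out, int n)
{
    for(int i = 0; i < n; ++i)
    {
        out[i] = (a[i] > b[i]) ? 0xFF : 0x00; // ~0 / 0, as the true/false vectors above
    }
}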
diff --git a/src/core/cpu/kernels/elementwise/sve/elementwise_quantized_list.h b/src/core/cpu/kernels/elementwise/sve/elementwise_quantized_list.h
deleted file mode 100644
index 6c5524e284..0000000000
--- a/src/core/cpu/kernels/elementwise/sve/elementwise_quantized_list.h
+++ /dev/null
@@ -1,366 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_CORE_SVE_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H
-#define SRC_CORE_SVE_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H
-
-#if defined(__ARM_FEATURE_SVE2)
-
-#include "src/core/NEON/wrapper/svtraits.h"
-#include "src/core/cpu/kernels/elementwise/sve/elementwise_list.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-using namespace arm_compute::wrapper;
-
-template <typename InputScalarType, typename OutputScalarType, typename OperatorType>
-struct QuantizedLoopArguments
-{
- OperatorType op;
- const InputScalarType *input1_ptr;
- const InputScalarType *input2_ptr;
- OutputScalarType *output_ptr;
-
- const svint32_t &in1_offset;
- const svint32_t &in2_offset;
- const svint32_t &out_offset;
- const svfloat32_t &in1_scale;
- const svfloat32_t &in2_scale;
- const svfloat32_t &out_scale;
-};
-
-template <typename InputScalarType, typename OutputScalarType, typename OperatorType>
-struct BroadcastQuantizedLoopArguments
-{
- OperatorType op;
- const InputScalarType *input1_ptr;
- float broadcast_value;
- OutputScalarType *output_ptr;
- bool reorder;
-
- const svint32_t &in1_offset;
- const svint32_t &out_offset;
- const svfloat32_t &in1_scale;
- const svfloat32_t &out_scale;
-};
-
-svfloat32x4_t load_quantized(const int8_t *ptr, svbool_t pg, const svint32_t &offset, const svfloat32_t &scale)
-{
- auto x = svld1(pg, ptr);
-
- const auto widened = svcreate4(
- svmovlb(svmovlb(x)),
- svmovlt(svmovlb(x)),
- svmovlb(svmovlt(x)),
- svmovlt(svmovlt(x)));
-
- pg = svptrue_b8();
-
- return svcreate4(
- svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 0), offset)), scale),
- svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 1), offset)), scale),
- svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 2), offset)), scale),
- svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 3), offset)), scale));
-}
-
-svfloat32x4_t load_quantized(const uint8_t *ptr, svbool_t pg, const svint32_t &offset, const svfloat32_t &scale)
-{
- auto x = svld1(pg, ptr);
-
- const auto widened = svcreate4(
- svmovlb(svmovlb(x)),
- svmovlt(svmovlb(x)),
- svmovlb(svmovlt(x)),
- svmovlt(svmovlt(x)));
-
- pg = svptrue_b8();
-
- return svcreate4(
- svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 0)), offset)), scale),
- svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 1)), offset)), scale),
- svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 2)), offset)), scale),
- svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 3)), offset)), scale));
-}
-
-void store_quantized(uint8_t *ptr, svbool_t pg, svfloat32x4_t data, const svint32_t &offset, const svfloat32_t &inv_scale)
-{
- const auto quantized = svcreate4(
- svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 0), inv_scale))), offset),
- svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 1), inv_scale))), offset),
- svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 2), inv_scale))), offset),
- svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 3), inv_scale))), offset));
-
- const auto narrowed_bottom = svqxtunt(svqxtunb(svget4(quantized, 0)), svget4(quantized, 1));
- const auto narrowed_top = svqxtunt(svqxtunb(svget4(quantized, 2)), svget4(quantized, 3));
- const auto narrowed = svqxtnt(svqxtnb(narrowed_bottom), narrowed_top);
- svst1(pg, ptr, narrowed);
-}
-
-void store_quantized(int8_t *ptr, svbool_t pg, svfloat32x4_t data, const svint32_t &offset, const svfloat32_t &inv_scale)
-{
- const auto quantized = svcreate4(
- svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 0), inv_scale))), offset),
- svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 1), inv_scale))), offset),
- svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 2), inv_scale))), offset),
- svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 3), inv_scale))), offset));
-
- const auto narrowed_bottom = svqxtnt(svqxtnb(svget4(quantized, 0)), svget4(quantized, 1));
- const auto narrowed_top = svqxtnt(svqxtnb(svget4(quantized, 2)), svget4(quantized, 3));
- const auto narrowed = svqxtnt(svqxtnb(narrowed_bottom), narrowed_top);
-
- svst1(pg, ptr, narrowed);
-}
-
-template <typename InputScalarType, typename OutputScalarType>
-inline void arithmetic_op_quantized_loop(svbool_t pg, const QuantizedLoopArguments<InputScalarType, OutputScalarType, ArithmeticOperation> &args)
-{
- const auto in1 = load_quantized(args.input1_ptr, pg, args.in1_offset, args.in1_scale);
- const auto in2 = load_quantized(args.input2_ptr, pg, args.in2_offset, args.in2_scale);
-
- const auto result = svcreate4(
- elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 0), svget4(in2, 0), args.op),
- elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 1), svget4(in2, 1), args.op),
- elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 2), svget4(in2, 2), args.op),
- elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 3), svget4(in2, 3), args.op));
-
- store_quantized(args.output_ptr, pg, result, args.out_offset, args.out_scale);
-}
-
-template <typename InputScalarType, typename OutputScalarType>
-inline void arithmetic_op_broadcast_quantized_loop(svbool_t pg, const BroadcastQuantizedLoopArguments<InputScalarType, OutputScalarType, ArithmeticOperation> &args)
-{
- const auto in1 = load_quantized(args.input1_ptr, pg, args.in1_offset, args.in1_scale);
- const auto in2 = svcreate4(
- svdup_n(args.broadcast_value), svdup_n(args.broadcast_value), svdup_n(args.broadcast_value), svdup_n(args.broadcast_value));
-
- const auto &af = args.reorder ? in2 : in1;
- const auto &bf = args.reorder ? in1 : in2;
-
- const auto result = svcreate4(
- elementwise_arithmetic_op<svfloat32_t>(pg, svget4(af, 0), svget4(bf, 0), args.op),
- elementwise_arithmetic_op<svfloat32_t>(pg, svget4(af, 1), svget4(bf, 1), args.op),
- elementwise_arithmetic_op<svfloat32_t>(pg, svget4(af, 2), svget4(bf, 2), args.op),
- elementwise_arithmetic_op<svfloat32_t>(pg, svget4(af, 3), svget4(bf, 3), args.op));
-
- store_quantized(args.output_ptr, pg, result, args.out_offset, args.out_scale);
-}
-
-template <typename InputScalarType, typename OutputScalarType>
-inline void comparison_op_quantized_loop(svbool_t pg, const QuantizedLoopArguments<InputScalarType, OutputScalarType, ComparisonOperation> &args)
-{
- const auto in1 = load_quantized(args.input1_ptr, pg, args.in1_offset, args.in1_scale);
- const auto in2 = load_quantized(args.input2_ptr, pg, args.in2_offset, args.in2_scale);
-
- using OutputVectorType = typename wrapper::traits::sve_vector<OutputScalarType>::type;
-
- const auto result = svcreate4(
- elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 0), svget4(in2, 0), args.op),
- elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 1), svget4(in2, 1), args.op),
- elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 2), svget4(in2, 2), args.op),
- elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 3), svget4(in2, 3), args.op));
-
- const auto zipped_bottom = svzip1(svget4(result, 0), svget4(result, 1));
- const auto zipped_top = svzip1(svget4(result, 2), svget4(result, 3));
- const auto zipped = svzip1(zipped_bottom, zipped_top);
- svst1(pg, args.output_ptr, zipped);
-}
-
-template <typename InputScalarType, typename OutputScalarType>
-inline void comparison_op_broadcast_quantized_loop(svbool_t pg, const BroadcastQuantizedLoopArguments<InputScalarType, OutputScalarType, ComparisonOperation> &args)
-{
- const auto in1 = load_quantized(args.input1_ptr, pg, args.in1_offset, args.in1_scale);
- const auto in2 = svcreate4(
- svdup_n(args.broadcast_value), svdup_n(args.broadcast_value), svdup_n(args.broadcast_value), svdup_n(args.broadcast_value));
-
- const auto &af = args.reorder ? in2 : in1;
- const auto &bf = args.reorder ? in1 : in2;
-
- using OutputVectorType = typename wrapper::traits::sve_vector<OutputScalarType>::type;
-
- const auto result = svcreate4(
- elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(af, 0), svget4(bf, 0), args.op),
- elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(af, 1), svget4(bf, 1), args.op),
- elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(af, 2), svget4(bf, 2), args.op),
- elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(af, 3), svget4(bf, 3), args.op));
-
- const auto zipped_bottom = svzip1(svget4(result, 0), svget4(result, 1));
- const auto zipped_top = svzip1(svget4(result, 2), svget4(result, 3));
- const auto zipped = svzip1(zipped_bottom, zipped_top);
- svst1(pg, args.output_ptr, zipped);
-}
-
-template <typename InputScalarType, typename OutputScalarType, typename OperatorType>
-using LoopQuantizedFuncType = void (*)(svbool_t, const QuantizedLoopArguments<InputScalarType, OutputScalarType, OperatorType> &);
-
-template <typename InputScalarType, typename OutputScalarType, typename OperatorType>
-using BroadcastQuantizedLoopFuncType = void (*)(svbool_t, const BroadcastQuantizedLoopArguments<InputScalarType, OutputScalarType, OperatorType> &);
-
-template <typename InputVectorType, typename OutputVectorType, typename OperatorType,
- typename InputScalarType = typename wrapper::sve_scalar<InputVectorType>::type,
- typename OutputScalarType = typename wrapper::sve_scalar<OutputVectorType>::type>
-void elementwise_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
- OperatorType op,
- LoopQuantizedFuncType<InputScalarType, OutputScalarType, OperatorType> func,
- BroadcastQuantizedLoopFuncType<InputScalarType, OutputScalarType, OperatorType> broadcast_func)
-{
- const auto all_true_pg = wrapper::svptrue<InputScalarType>();
-
- // Create input windows
- Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle it manually
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();
-
- const auto output_voffset = svdup_n(out->info()->quantization_info().uniform().offset);
- const auto output_vscale = svdup_n(1.f / out->info()->quantization_info().uniform().scale);
-
- if(is_broadcast_across_x)
- {
- const bool is_broadcast_input_2 = input2_win.x().step() == 0;
- Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
- Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
- const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
- const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
-
- const auto non_broadcast_qinfo = is_broadcast_input_2 ? in1->info()->quantization_info() : in2->info()->quantization_info();
- const auto broadcast_qinfo = is_broadcast_input_2 ? in2->info()->quantization_info() : in1->info()->quantization_info();
-
- const auto non_broadcast_voffset = svdup_n(non_broadcast_qinfo.uniform().offset);
- const auto non_broadcast_vscale = svdup_n(non_broadcast_qinfo.uniform().scale);
-
- // Clear X Dimension on execution window as we handle it manually
- non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator broadcast_input(broadcast_tensor, broadcast_win);
- Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
- Iterator output(out, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
- const auto non_broadcast_input_ptr = reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr());
- const InputScalarType broadcast_value = *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr());
-
- int x = window_start_x;
-
- svbool_t pg = wrapper::svwhilelt<InputScalarType>(x, window_end_x);
- do
- {
- const auto args = BroadcastQuantizedLoopArguments<InputScalarType, OutputScalarType, OperatorType>
- {
- op,
- non_broadcast_input_ptr + x,
- Qasymm8QuantizationHelper<InputScalarType>::dequantize(broadcast_value, broadcast_qinfo),
- output_ptr + x,
- !is_broadcast_input_2,
- non_broadcast_voffset, output_voffset,
- non_broadcast_vscale, output_vscale
- };
- broadcast_func(pg, args);
- x += wrapper::svcnt<InputScalarType>();
- pg = wrapper::svwhilelt<InputScalarType>(x, window_end_x);
- }
- while(svptest_any(all_true_pg, pg));
- },
- broadcast_input, non_broadcast_input, output);
- }
- else
- {
- // Clear X Dimension on execution window as we handle it manually
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(in1, input1_win);
- Iterator input2(in2, input2_win);
- Iterator output(out, win);
-
- const auto in1_voffset = svdup_n(in1->info()->quantization_info().uniform().offset);
- const auto in1_vscale = svdup_n(in1->info()->quantization_info().uniform().scale);
-
- const auto in2_voffset = svdup_n(in2->info()->quantization_info().uniform().offset);
- const auto in2_vscale = svdup_n(in2->info()->quantization_info().uniform().scale);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
- const auto input1_ptr = reinterpret_cast<const InputScalarType *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const InputScalarType *>(input2.ptr());
-
- int x = window_start_x;
-
- svbool_t pg = wrapper::svwhilelt<InputScalarType>(x, window_end_x);
- do
- {
- const auto args = QuantizedLoopArguments<InputScalarType, OutputScalarType, OperatorType>
- {
- op,
- input1_ptr + x,
- input2_ptr + x,
- output_ptr + x,
- in1_voffset, in2_voffset, output_voffset,
- in1_vscale, in2_vscale, output_vscale
- };
- func(pg, args);
- x += wrapper::svcnt<InputScalarType>();
- pg = wrapper::svwhilelt<InputScalarType>(x, window_end_x);
- }
- while(svptest_any(all_true_pg, pg));
- },
- input1, input2, output);
- }
-}
-
-template <ArithmeticOperation op, typename ScalarType>
-void elementwise_arithmetic_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
- using VectorType = typename wrapper::traits::sve_vector<ScalarType>::type;
- elementwise_quantized_op<VectorType, VectorType, ArithmeticOperation>(in1, in2, out, window, op,
- &arithmetic_op_quantized_loop<ScalarType, ScalarType>,
- &arithmetic_op_broadcast_quantized_loop<ScalarType, ScalarType>);
-}
-
-template <ComparisonOperation op, typename InputScalarType, typename OutputScalarType = uint8_t>
-void elementwise_comparison_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
- static_assert(sizeof(InputScalarType) >= sizeof(OutputScalarType), "input data type's width should be equal to or greater than output data type's width");
- using InputVectorType = typename wrapper::traits::sve_vector<InputScalarType>::type;
- using OutputVectorType = typename wrapper::traits::sve_vector<OutputScalarType>::type;
- elementwise_quantized_op<InputVectorType, OutputVectorType, ComparisonOperation>(in1, in2, out, window, op,
- &comparison_op_quantized_loop<InputScalarType, OutputScalarType>,
- &comparison_op_broadcast_quantized_loop<InputScalarType, OutputScalarType>);
-}
-} // namespace cpu
-} // namespace arm_compute
-
-#endif /* defined(__ARM_FEATURE_SVE2) */
-#endif /* SRC_CORE_SVE_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H */ \ No newline at end of file
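For reference, the store_quantized helpers above requantize in three steps: scale by the inverse output scale, round to nearest with svrinta (ties away from zero) and add the offset, then saturate while narrowing to 8 bits via the svqxtn*/svqxtun* chain. A scalar model of one lane (`requant_u8` is a hypothetical name):

#include <algorithm>
#include <cmath>
#include <cstdint>

inline uint8_t requant_u8(float v, float inv_scale, int32_t offset)
{
    // svrinta-style rounding: to nearest, ties away from zero
    const int32_t q = static_cast<int32_t>(std::round(v * inv_scale)) + offset;
    // Saturating narrow to unsigned 8 bits, as the svqxtun*/svqxtn* steps do
    return static_cast<uint8_t>(std::min(255, std::max(0, q)));
}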
diff --git a/src/core/cpu/kernels/elementwise/sve/elementwise_unary.cpp b/src/core/cpu/kernels/elementwise/sve/elementwise_unary.cpp
deleted file mode 100644
index ddf1febd66..0000000000
--- a/src/core/cpu/kernels/elementwise/sve/elementwise_unary.cpp
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#if defined(__ARM_FEATURE_SVE)
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/SVEMath.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
-#include <arm_sve.h>
-
-namespace arm_compute
-{
-namespace cpu
-{
-template <typename ScalarType, typename VectorType>
-inline typename std::enable_if<utils::traits::is_floating_point<ScalarType>::value, VectorType>::type elementwise_op_sve_imp(svbool_t pg, ElementWiseUnary op, const VectorType &a)
-{
- switch(op)
- {
- case ElementWiseUnary::RSQRT:
- return svinvsqrt(pg, a);
- case ElementWiseUnary::EXP:
- return wrapper::svexp_z(pg, a);
- case ElementWiseUnary::NEG:
- return svneg_z(pg, a);
- case ElementWiseUnary::LOG:
- return wrapper::svlog_z(pg, a);
- case ElementWiseUnary::ABS:
- return svabs_z(pg, a);
- case ElementWiseUnary::ROUND:
- return svrintn_z(pg, a);
- case ElementWiseUnary::SIN:
- return wrapper::svsin_z(pg, a);
- default:
- ARM_COMPUTE_ERROR("NOT_SUPPORTED");
- }
-}
-
-template <typename ScalarType, typename VectorType>
-inline typename std::enable_if<std::is_integral<ScalarType>::value, VectorType>::type elementwise_op_sve_imp(svbool_t pg, ElementWiseUnary op, const VectorType &a)
-{
- switch(op)
- {
- case ElementWiseUnary::NEG:
- return svneg_z(pg, a);
- case ElementWiseUnary::ABS:
- return svabs_z(pg, a);
- default:
- ARM_COMPUTE_ERROR("NOT_SUPPORTED");
- }
-}
-
-template <typename ScalarType>
-void elementwise_sve_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op)
-{
- const auto all_true_pg = wrapper::svptrue<ScalarType>();
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input(in, win);
- Iterator output(out, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
- const auto input_ptr = reinterpret_cast<const ScalarType *>(input.ptr());
- int x = window_start_x;
-
- svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
- do
- {
- const auto vin = svld1(pg, input_ptr + x);
- svst1(pg, output_ptr + x, elementwise_op_sve_imp<ScalarType, decltype(vin)>(pg, op, vin));
- x += wrapper::svcnt<ScalarType>();
- pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
- }
- while(svptest_any(all_true_pg, pg));
- },
- input, output);
-}
-
-template void elementwise_sve_op<float16_t>(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op);
-template void elementwise_sve_op<float32_t>(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op);
-template void elementwise_sve_op<int32_t>(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op);
-} // namespace cpu
-} // namespace arm_compute
-#endif /* defined(__ARM_FEATURE_SVE) */
\ No newline at end of file
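The deleted elementwise_unary.cpp is built around the canonical SVE predicated loop: construct a while-lt predicate, process one hardware vector of lanes under it, advance by the vector length, and stop once the predicate has no active lanes, so no scalar tail loop is needed. A self-contained sketch of the same pattern for float32 absolute value, written with plain ACLE intrinsics rather than the library's wrapper layer:

    #if defined(__ARM_FEATURE_SVE)
    #include <arm_sve.h>
    #include <cstdint>

    // Predicated SVE loop: the partial last iteration is handled by pg, not by a tail.
    void abs_f32_sve(const float *src, float *dst, int64_t len)
    {
        int64_t  x  = 0;
        svbool_t pg = svwhilelt_b32(x, len);
        do
        {
            const svfloat32_t v = svld1(pg, src + x); // inactive lanes load as zero
            svst1(pg, dst + x, svabs_z(pg, v));       // inactive lanes are not stored
            x += svcntw();                            // number of 32-bit lanes per vector
            pg = svwhilelt_b32(x, len);
        } while(svptest_any(svptrue_b32(), pg));
    }
    #endif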
diff --git a/src/core/cpu/kernels/elementwise/sve/elementwise_unary_list.h b/src/core/cpu/kernels/elementwise/sve/elementwise_unary_list.h
deleted file mode 100644
index 63490421e9..0000000000
--- a/src/core/cpu/kernels/elementwise/sve/elementwise_unary_list.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_CORE_SVE_KERNELS_ELEMENTWISE_UNARY_LIST_H
-#define SRC_CORE_SVE_KERNELS_ELEMENTWISE_UNARY_LIST_H
-
-#include "arm_compute/core/Types.h"
-#if defined(ENABLE_SVE)
-
-namespace arm_compute
-{
-namespace cpu
-{
-template <typename ScalarType>
-void elementwise_sve_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op);
-} // namespace cpu
-} // namespace arm_compute
-#endif // defined(ENABLE_SVE)
-#endif // SRC_CORE_SVE_KERNELS_ELEMENTWISE_UNARY_LIST_H
\ No newline at end of file
diff --git a/src/core/cpu/kernels/floor/list.h b/src/core/cpu/kernels/floor/list.h
deleted file mode 100644
index 4367e0ffc9..0000000000
--- a/src/core/cpu/kernels/floor/list.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_CORE_NEON_KERNELS_FLOOR_LIST_H
-#define SRC_CORE_NEON_KERNELS_FLOOR_LIST_H
-
-namespace arm_compute
-{
-namespace cpu
-{
-#define DECLARE_FLOOR_KERNEL(func_name) \
- void func_name(const void *src, void *dst, int len)
-
-DECLARE_FLOOR_KERNEL(fp16_neon_floor);
-DECLARE_FLOOR_KERNEL(fp32_neon_floor);
-
-#undef DECLARE_FLOOR_KERNEL
-} // namespace cpu
-} // namespace arm_compute
-
-#endif /* SRC_CORE_NEON_KERNELS_FLOOR_LIST_H */
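Headers like the one above only declare per-type entry points behind a declaration macro; the kernel actually used is picked at configure time. A hypothetical dispatch table over these two declarations (the selector below is illustrative; the library selects on DataType rather than element size):

    #include <cstddef>

    // Declarations matching the list header above.
    namespace arm_compute { namespace cpu {
    void fp16_neon_floor(const void *src, void *dst, int len);
    void fp32_neon_floor(const void *src, void *dst, int len);
    }} // namespace arm_compute::cpu

    using FloorFn = void (*)(const void *, void *, int);

    // Hypothetical selector keyed on element size for brevity.
    inline FloorFn select_floor_kernel(std::size_t element_size)
    {
        switch(element_size)
        {
            case 2:  return arm_compute::cpu::fp16_neon_floor;
            case 4:  return arm_compute::cpu::fp32_neon_floor;
            default: return nullptr;
        }
    }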
diff --git a/src/core/cpu/kernels/floor/neon/fp16.cpp b/src/core/cpu/kernels/floor/neon/fp16.cpp
deleted file mode 100644
index f362676a36..0000000000
--- a/src/core/cpu/kernels/floor/neon/fp16.cpp
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
-
-#include "src/common/utils/Validate.h"
-#include "src/core/NEON/NEMath.h"
-
-#include <arm_neon.h>
-#include <cmath>
-#include <cstddef>
-
-namespace arm_compute
-{
-namespace cpu
-{
-constexpr int step = 8;
-
-void fp16_neon_floor(const void *src, void *dst, int len)
-{
- ARM_COMPUTE_ASSERT_NOT_NULLPTR(src);
- ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst);
- ARM_COMPUTE_ASSERT(len >= 0);
-
- auto psrc = static_cast<const __fp16 *>(src);
- auto pdst = static_cast<__fp16 *>(dst);
-
- for(; len >= step; len -= step)
- {
- vst1q_f16(pdst, vfloorq_f16(vld1q_f16(psrc)));
- psrc += step;
- pdst += step;
- }
-
- for(; len > 0; --len)
- {
- *pdst = std::floor(*psrc);
- ++psrc;
- ++pdst;
- }
-}
-} // namespace cpu
-} // namespace arm_compute
-#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/core/cpu/kernels/floor/neon/fp32.cpp b/src/core/cpu/kernels/floor/neon/fp32.cpp
deleted file mode 100644
index f5efb2e849..0000000000
--- a/src/core/cpu/kernels/floor/neon/fp32.cpp
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/common/utils/Validate.h"
-#include "src/core/NEON/NEMath.h"
-
-#include <arm_neon.h>
-#include <cmath>
-#include <cstddef>
-
-namespace arm_compute
-{
-namespace cpu
-{
-constexpr int step = 4;
-
-void fp32_neon_floor(const void *src, void *dst, int len)
-{
- ARM_COMPUTE_ASSERT_NOT_NULLPTR(src);
- ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst);
- ARM_COMPUTE_ASSERT(len >= 0);
-
- auto psrc = static_cast<const float *>(src);
- auto pdst = static_cast<float *>(dst);
-
- for(; len >= step; len -= step)
- {
- vst1q_f32(pdst, vfloorq_f32(vld1q_f32(psrc)));
- psrc += step;
- pdst += step;
- }
-
- for(; len > 0; --len)
- {
- *pdst = std::floor(*psrc);
- ++pdst;
- ++psrc;
- }
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp b/src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp
deleted file mode 100644
index c78ffb9848..0000000000
--- a/src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp
+++ /dev/null
@@ -1,276 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/INEKernel.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-using namespace arm_compute::misc::shape_calculator;
-
-void CpuPool2dAssemblyWrapperKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
-
-    // Initialize dst if it has not been initialized yet
- auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_pool_shape(*src, info)));
-
- const bool requantize = src->quantization_info() != dst->quantization_info();
-
- switch(src->data_type())
- {
- case DataType::QASYMM8:
- if(requantize)
- {
- create_arm_pooling_requant<uint8_t, uint8_t>(src, dst, info, cpu_info);
- }
- else
- {
- create_arm_pooling<uint8_t, uint8_t>(src, dst, info, cpu_info);
- }
- break;
- case DataType::QASYMM8_SIGNED:
- if(requantize)
- {
- create_arm_pooling_requant<int8_t, int8_t>(src, dst, info, cpu_info);
- }
- else
- {
- create_arm_pooling<int8_t, int8_t>(src, dst, info, cpu_info);
- }
- break;
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- case DataType::F16:
- create_arm_pooling<float16_t, float16_t>(src, dst, info, cpu_info);
- break;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- case DataType::F32:
- create_arm_pooling<float, float>(src, dst, info, cpu_info);
- break;
- default:
- break;
- }
-
- Window win = calculate_max_window(*dst, Steps());
- INEKernel::configure(win);
-}
-
-Status CpuPool2dAssemblyWrapperKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
-
-#ifndef __aarch64__
- ARM_COMPUTE_RETURN_ERROR_MSG("32-bit is not supported by assembly kernels");
-#endif /* __aarch64__ */
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((src->data_layout() != DataLayout::NHWC) || (info.data_layout != DataLayout::NHWC), "Only NHWC is supported by assembly kernels");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((info.pool_type != PoolingType::AVG) && (info.pool_type != PoolingType::MAX),
- "Only AVG and MAX pooling are supported by assembly kernels");
-
- if(dst->total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
-
- const auto src_qinfo = src->quantization_info().uniform();
- const auto dst_qinfo = dst->quantization_info().uniform();
-
- if(src_qinfo != dst_qinfo)
- {
- const float multiplier = src_qinfo.scale / dst_qinfo.scale;
- int32_t dst_multiplier{};
- int32_t dst_shift{};
- ARM_COMPUTE_RETURN_ERROR_ON(quantization::calculate_quantized_multiplier(multiplier, &dst_multiplier, &dst_shift));
- }
- else
- {
- if(src->data_type() == DataType::QASYMM8)
- {
- const bool has_padding = info.pad_stride_info.has_padding();
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!info.exclude_padding && has_padding, "Assembly kernels do not support padding for QASYMM8 with same src/dst quantization info");
- }
- }
- }
- else
- {
- if(src->data_type() == DataType::QASYMM8)
- {
-            // If dst is not configured, the quantization info is the same
- const bool has_padding = info.pad_stride_info.has_padding();
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!info.exclude_padding && has_padding, "Assembly kernels do not support padding for QASYMM8 with same src/dst quantization info");
- }
- }
- return Status{};
-}
-
-void CpuPool2dAssemblyWrapperKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(_kernel_asm.get());
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_UNUSED(window);
- ARM_COMPUTE_UNUSED(info);
-
- ARM_COMPUTE_ERROR_ON(tensors.empty());
-
- const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC);
- ITensor *dst = tensors.get_tensor(TensorType::ACL_DST);
- ITensor *workspace = tensors.get_tensor(TensorType::ACL_INT_0);
-
- const auto in_ptr = src->buffer() + src->info()->offset_first_element_in_bytes();
- auto out_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes();
- auto working_space = workspace->buffer() + workspace->info()->offset_first_element_in_bytes();
-
- const auto src_shape = src->info()->tensor_shape();
- const auto dst_shape = dst->info()->tensor_shape();
- const auto src_padding = src->info()->padding();
- const auto dst_padding = dst->info()->padding();
-
- const size_t ld_src_col = src_shape[0] + src_padding.left + src_padding.right;
- const size_t ld_src_row = ld_src_col * (src_shape[1] + src_padding.top + src_padding.bottom);
- const size_t ld_src_batch = ld_src_row * src_shape[2];
- const size_t ld_dst_col = dst_shape[0] + dst_padding.left + dst_padding.right;
- const size_t ld_dst_row = ld_dst_col * (dst_shape[1] + dst_padding.top + dst_padding.bottom);
- const size_t ld_dst_batch = ld_dst_row * dst_shape[2];
-
- _kernel_asm->execute(in_ptr, ld_src_col, ld_src_row, ld_src_batch,
- out_ptr, ld_dst_col, ld_dst_row, ld_dst_batch,
- working_space, info.thread_id, info.num_threads);
-}
-
-size_t CpuPool2dAssemblyWrapperKernel::get_working_size(unsigned int num_threads) const
-{
- return _kernel_asm->get_working_size(num_threads);
-}
-
-bool CpuPool2dAssemblyWrapperKernel::is_configured() const
-{
- return _kernel_asm != nullptr;
-}
-
-template <typename Typesrc, typename Typedst>
-void CpuPool2dAssemblyWrapperKernel::create_arm_pooling(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info)
-{
- const arm_conv::pooling::PoolingType pool_type = (info.pool_type == PoolingType::AVG) ? arm_conv::pooling::PoolingType::AVERAGE : arm_conv::pooling::PoolingType::MAX;
-
- arm_conv::pooling::PoolingWindow window{};
- window.cols = static_cast<unsigned int>(info.pool_size.x());
- window.rows = static_cast<unsigned int>(info.pool_size.y());
-
- arm_conv::pooling::PoolingStride stride{};
- std::tie(stride.cols, stride.rows) = info.pad_stride_info.stride();
-
- const arm_conv::pooling::PaddingValues padding{ info.pad_stride_info.pad_left(), info.pad_stride_info.pad_top(), info.pad_stride_info.pad_right(), info.pad_stride_info.pad_bottom() };
-
- constexpr unsigned int idx_width = 1;
- constexpr unsigned int idx_height = 2;
- constexpr unsigned int idx_channels = 0;
- constexpr unsigned int idx_batches = 3;
-
- const unsigned int n_batches = src->dimension(idx_batches);
- const unsigned int src_rows = src->dimension(idx_height);
- const unsigned int src_cols = src->dimension(idx_width);
- const unsigned int n_channels = src->dimension(idx_channels);
- const unsigned int dst_rows = dst->dimension(idx_height);
- const unsigned int dst_cols = dst->dimension(idx_width);
-
- arm_conv::pooling::PoolingArgs args(&cpu_info, pool_type, window, stride, info.exclude_padding, n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, padding, nullptr);
-
- // Configure assembly pooling kernel
- auto pooling_kernel_asm = arm_conv::pooling::pooling<Typesrc, Typedst>(args);
- if(pooling_kernel_asm == nullptr)
- {
-        // Configuration not supported: leave the kernel unconfigured
- return;
- }
-
- _kernel_asm = std::move(pooling_kernel_asm);
-}
-
-template <typename Typesrc, typename Typedst>
-void CpuPool2dAssemblyWrapperKernel::create_arm_pooling_requant(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info)
-{
- const arm_conv::pooling::PoolingType pool_type = (info.pool_type == PoolingType::AVG) ? arm_conv::pooling::PoolingType::AVERAGE : arm_conv::pooling::PoolingType::MAX;
-
- arm_conv::pooling::PoolingWindow window{};
- window.cols = static_cast<unsigned int>(info.pool_size.x());
- window.rows = static_cast<unsigned int>(info.pool_size.y());
-
- arm_conv::pooling::PoolingStride stride{};
- std::tie(stride.cols, stride.rows) = info.pad_stride_info.stride();
-
- const arm_conv::pooling::PaddingValues padding{ info.pad_stride_info.pad_left(), info.pad_stride_info.pad_top(), info.pad_stride_info.pad_right(), info.pad_stride_info.pad_bottom() };
-
- constexpr unsigned int idx_width = 1;
- constexpr unsigned int idx_height = 2;
- constexpr unsigned int idx_channels = 0;
- constexpr unsigned int idx_batches = 3;
-
- const unsigned int n_batches = src->dimension(idx_batches);
- const unsigned int src_rows = src->dimension(idx_height);
- const unsigned int src_cols = src->dimension(idx_width);
- const unsigned int n_channels = src->dimension(idx_channels);
- const unsigned int dst_rows = dst->dimension(idx_height);
- const unsigned int dst_cols = dst->dimension(idx_width);
-
- arm_conv::pooling::PoolingArgs args(&cpu_info, pool_type, window, stride, info.exclude_padding, n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, padding, nullptr);
-
- const auto src_qinfo = src->quantization_info().uniform();
- const auto dst_qinfo = dst->quantization_info().uniform();
-
- const float multiplier = src_qinfo.scale / dst_qinfo.scale;
- int32_t dst_multiplier{};
- int32_t dst_shift{};
- quantization::calculate_quantized_multiplier(multiplier, &dst_multiplier, &dst_shift);
-
- const arm_conv::pooling::Requantize32 requant_args(src_qinfo.offset,
- dst_qinfo.offset,
- dst_shift, // left shift
- 0, // right shift
- dst_multiplier);
-
- // Configure assembly pooling kernel with requantization
- auto pooling_kernel_asm = arm_conv::pooling::pooling<Typesrc, Typedst, arm_conv::pooling::Requantize32>(args, requant_args);
- if(pooling_kernel_asm == nullptr)
- {
-        // Configuration not supported: leave the kernel unconfigured
- return;
- }
-
- _kernel_asm = std::move(pooling_kernel_asm);
-}
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h b/src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h
deleted file mode 100644
index 3afa4c16a4..0000000000
--- a/src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_POOL2D_ASSEMBLY_WRAPPER_KERNEL_H
-#define ARM_COMPUTE_CPU_POOL2D_ASSEMBLY_WRAPPER_KERNEL_H
-
-#include "arm_compute/core/Types.h"
-#include "src/core/NEON/kernels/assembly/pooling.hpp"
-#include "src/core/common/Macros.h"
-#include "src/core/cpu/ICpuKernel.h"
-
-#include "pool_common.hpp"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-/** This class is a wrapper for the assembly kernels.
- *
- * Some kernels were written in assembly and highly optimised for specific
- * CPUs like A53 or A55. The Arm Compute Library creates an instance of
- * CpuPool2dAssemblyWrapperKernel and other auxiliary data structures to
- * execute a single assembly kernel in the context of an NEFunction.
- *
- */
-class CpuPool2dAssemblyWrapperKernel final : public ICpuKernel
-{
-public:
- /** Constructor
- */
- CpuPool2dAssemblyWrapperKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuPool2dAssemblyWrapperKernel);
-
- const char *name() const override
- {
- return "CpuPool2dAssemblyWrapperKernel";
- }
-
- /** Initialise the kernel's src and dst.
- *
- * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[out] dst Destination tensor info to store the result of pooling. Data types supported: same as @p src.
- * @param[in] info Pooling meta-data.
- * @param[in] cpu_info CPU information needed to select the most appropriate kernel.
- */
- void configure(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info);
-
- /** Static function to check if given info will lead to a valid configuration
- *
- * Similar to CpuPool2dAssemblyWrapperKernel::configure()
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info);
-
- // Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
-
- /** Get size of the workspace needed by the assembly kernel.
- *
- * @param[in] num_threads Maximum number of threads that are going to be spawned.
- *
- * @return size of workspace
- */
- size_t get_working_size(unsigned int num_threads) const;
-
- /** Was the asm kernel successfully configured?
- *
- * @return True if the asm kernel is configured and ready to run
- */
- bool is_configured() const;
-
-private:
- /** Helper function to create the assembly kernel.
- *
-     * @param[in] src      Source tensor info.
-     * @param[in] dst      Destination tensor info.
-     * @param[in] info     Pooling layer meta-data.
-     * @param[in] cpu_info CPU information needed to select the most appropriate kernel.
-     */
- template <typename Typesrc, typename Typedst>
- void create_arm_pooling(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info);
-
- /** Helper function to create the assembly kernel with requantization support
- *
-     * @param[in] src      Source tensor info.
-     * @param[in] dst      Destination tensor info.
-     * @param[in] info     Pooling layer meta-data.
-     * @param[in] cpu_info CPU information needed to select the most appropriate kernel.
-     */
- template <typename Typesrc, typename Typedst>
- void create_arm_pooling_requant(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info);
-
- std::unique_ptr<arm_conv::pooling::IPoolingCommon> _kernel_asm{ nullptr };
-};
-} // namespace kernels
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_POOL2D_ASSEMBLY_WRAPPER_KERNEL_H */
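The doc comments above imply a fixed call sequence: validate, configure, check is_configured (configure() returns silently when the assembly backend rejects the configuration), size the workspace, then run with a tensor pack. A hedged sketch of that sequence; all arguments are assumed to be set up elsewhere, while ACL_SRC/ACL_DST/ACL_INT_0 are the slots run_op() actually reads from the pack:

    #include "arm_compute/core/ITensorPack.h"
    #include "src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h"

    using namespace arm_compute;
    using namespace arm_compute::cpu::kernels;

    // Sketch only: tensor allocation, pooling meta-data and the scheduler-provided
    // ThreadInfo are assumed to exist.
    void run_asm_pooling(const ITensorInfo &src_info, ITensorInfo &dst_info,
                         const PoolingLayerInfo &pool_info, const CPUInfo &cpu_info,
                         ITensor &src, ITensor &dst, ITensor &workspace,
                         unsigned int num_threads, const ThreadInfo &thread_info)
    {
        CpuPool2dAssemblyWrapperKernel kernel;
        if(!bool(CpuPool2dAssemblyWrapperKernel::validate(&src_info, &dst_info, pool_info)))
        {
            return; // configuration rejected up front
        }
        kernel.configure(&src_info, &dst_info, pool_info, cpu_info);
        if(!kernel.is_configured())
        {
            return; // configure() bails silently when the asm backend has no kernel
        }
        const size_t ws_bytes = kernel.get_working_size(num_threads); // size 'workspace' accordingly
        (void)ws_bytes;

        ITensorPack pack;
        pack.add_const_tensor(TensorType::ACL_SRC, &src);
        pack.add_tensor(TensorType::ACL_DST, &dst);
        pack.add_tensor(TensorType::ACL_INT_0, &workspace);
        kernel.run_op(pack, kernel.window(), thread_info); // normally driven by the scheduler
    }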
diff --git a/src/core/cpu/kernels/pooling/neon/fp16.cpp b/src/core/cpu/kernels/pooling/neon/fp16.cpp
deleted file mode 100644
index 0aae7b8a57..0000000000
--- a/src/core/cpu/kernels/pooling/neon/fp16.cpp
+++ /dev/null
@@ -1,317 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
-#include "src/core/cpu/kernels/pooling/neon/list.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace
-{
-void pooling2_f16_maxpool_indices(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
-{
- const int window_start_x = window.x().start();
- const int window_end_x = window.x().end();
- const int window_step_x = 8;
-
- Window window_out = window;
- window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator in(src, window_src);
- Iterator out(dst0, window_out);
- Iterator indices(dst1, window_out);
-
- const int pool_pad_top = pool_info.pad_stride_info.pad_top();
- const int pool_pad_left = pool_info.pad_stride_info.pad_left();
-
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
-
- const int pad_right = src->info()->padding().right;
- const int pad_left = src->info()->padding().left;
- const int pad_horizontal = pad_right + pad_left;
- const int in_stride_y = static_cast<int>(src->info()->strides_in_bytes().y());
- const int in_stride_z = static_cast<int>(src->info()->strides_in_bytes().z());
-
- execute_window_loop(window_out, [&](const Coordinates & id)
- {
- const int idx_width = id.y() * pool_stride_x;
- const int idx_height = id.z() * pool_stride_y;
- const int pool_limit_y = pool_pad_top - idx_height;
- const int pool_limit_x = pool_pad_left - idx_width;
-
- const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y);
- const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x);
- const int in_x0_offset = (pool_start_x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z());
- const int in_x1_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z());
- const int in_x2_offset = (pool_start_x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z());
- const int in_x3_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z());
-
- int x_off = window_start_x;
- for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
- {
- const auto in_x0_ptr = reinterpret_cast<const float16_t *>(in.ptr() + in_x0_offset) + x_off;
- const auto in_x1_ptr = reinterpret_cast<const float16_t *>(in.ptr() + in_x1_offset) + x_off;
- const auto in_x2_ptr = reinterpret_cast<const float16_t *>(in.ptr() + in_x2_offset) + x_off;
- const auto in_x3_ptr = reinterpret_cast<const float16_t *>(in.ptr() + in_x3_offset) + x_off;
- const auto v_x0 = vld1q_f16(in_x0_ptr);
- const auto v_x1 = vld1q_f16(in_x1_ptr);
- const auto v_x2 = vld1q_f16(in_x2_ptr);
- const auto v_x3 = vld1q_f16(in_x3_ptr);
- float16x8_t vres = vmaxq_f16(vmaxq_f16(v_x2, v_x3), vmaxq_f16(v_x0, v_x1));
- // Store result
- vst1q_f16(reinterpret_cast<float16_t *>(out.ptr()) + x_off, vres);
-
- const uint32_t offset_base = offset_no_padding<float16_t>(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NHWC);
- const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float16_t) + x_off;
- const uint32_t offset_x1 = (uint32_t)offset_x0 + in_stride_y / sizeof(float16_t) - pad_horizontal;
- const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float16_t) - pad_horizontal * src->info()->tensor_shape()[1];
- const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float16_t) - pad_horizontal;
- const uint32x4_t voffset_x0_0 = { offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3 };
- const uint32x4_t voffset_x0_1 = { offset_x0 + 4, offset_x0 + 5, offset_x0 + 6, offset_x0 + 7 };
- const uint16x8_t voffset_x0 = vcombine_u16(vmovn_u32(voffset_x0_0), vmovn_u32(voffset_x0_1));
- const uint32x4_t voffset_x1_0 = { offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3 };
- const uint32x4_t voffset_x1_1 = { offset_x1 + 4, offset_x1 + 5, offset_x1 + 6, offset_x1 + 7 };
- const uint16x8_t voffset_x1 = vcombine_u16(vmovn_u32(voffset_x1_0), vmovn_u32(voffset_x1_1));
- const uint32x4_t voffset_x2_0 = { offset_x2, offset_x2 + 1, offset_x2 + 2, offset_x2 + 3 };
- const uint32x4_t voffset_x2_1 = { offset_x2 + 4, offset_x2 + 5, offset_x2 + 6, offset_x2 + 7 };
- const uint16x8_t voffset_x2 = vcombine_u16(vmovn_u32(voffset_x2_0), vmovn_u32(voffset_x2_1));
- const uint32x4_t voffset_x3_0 = { offset_x3, offset_x3 + 1, offset_x3 + 2, offset_x3 + 3 };
- const uint32x4_t voffset_x3_1 = { offset_x3 + 4, offset_x3 + 5, offset_x3 + 6, offset_x3 + 7 };
- const uint16x8_t voffset_x3 = vcombine_u16(vmovn_u32(voffset_x3_0), vmovn_u32(voffset_x3_1));
- const uint16x8_t tmp_indices0 = vbslq_u16(vcgeq_f16(v_x0, v_x1), voffset_x0, voffset_x1);
- const uint16x8_t tmp_indices1 = vbslq_u16(vcgeq_f16(v_x2, v_x3), voffset_x2, voffset_x3);
- const uint16x8_t tmp_indices2 = vbslq_u16(vcgeq_f16(vmaxq_f16(v_x0, v_x1), vmaxq_f16(v_x2, v_x3)), tmp_indices0, tmp_indices1);
-            const uint32x4_t   tmp_indices3_0     = vmovl_u16(vget_low_u16(tmp_indices2));
-            const uint32x4_t   tmp_indices3_1     = vmovl_u16(vget_high_u16(tmp_indices2));
-            // Store indices
-            vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off, tmp_indices3_0);
-            vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr() + 16) + x_off, tmp_indices3_1);
- }
-
- // Left-overs loop
- for(; x_off < window_end_x; ++x_off)
- {
- const auto x0 = *(reinterpret_cast<const float16_t *>(in.ptr() + in_x0_offset) + x_off);
- const auto x1 = *(reinterpret_cast<const float16_t *>(in.ptr() + in_x1_offset) + x_off);
- const auto x2 = *(reinterpret_cast<const float16_t *>(in.ptr() + in_x2_offset) + x_off);
- const auto x3 = *(reinterpret_cast<const float16_t *>(in.ptr() + in_x3_offset) + x_off);
- float16_t res = std::max(std::max(x2, x3), std::max(x0, x1));
-
- // Store result
- *(reinterpret_cast<float16_t *>(out.ptr()) + x_off) = res;
-
- const uint32_t offset_base = offset_no_padding<float16_t>(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NHWC);
- const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float16_t) + x_off;
- const uint32_t offset_x1 = (uint32_t)offset_x0 + in_stride_y / sizeof(float16_t) - pad_horizontal;
- const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float16_t) - pad_horizontal * src->info()->tensor_shape()[1];
- const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float16_t) - pad_horizontal;
- const uint32_t tmp_idx0 = (x0 >= x1) ? offset_x0 : offset_x1;
- const uint32_t tmp_idx1 = (x2 >= x3) ? offset_x2 : offset_x3;
- const uint32_t tmp_idx2 = (std::max(x0, x1) >= std::max(x2, x3)) ? tmp_idx0 : tmp_idx1;
-
- // Store indices
- *(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off) = tmp_idx2;
- }
- },
- in, out, indices);
-}
-} // namespace
-
-void poolingMxN_fp16_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
-{
-    if(pool_info.pool_size == Size2D(2, 2) && pool_info.pool_type == PoolingType::MAX && dst1)
-    {
-        pooling2_f16_maxpool_indices(src, dst0, dst1, pool_info, window_src, window);
-        return; // otherwise the general path below recomputes dst0 (cf. the fp32 variant's if/else)
-    }
- const int window_start_x = window.x().start();
- const int window_end_x = window.x().end();
- const int window_step_x = 8;
-
- Window window_out = window;
- window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator in(src, window_src);
- Iterator out(dst0, window_out);
-
- const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width;
- const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height;
- const int pool_pad_right = pool_info.pad_stride_info.pad_right();
- const int pool_pad_top = pool_info.pad_stride_info.pad_top();
- const int pool_pad_left = pool_info.pad_stride_info.pad_left();
- const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
- const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right);
- const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
-
- float16x8_t vres;
-
- execute_window_loop(window_out, [&](const Coordinates & id)
- {
- const int idx_width = id.y() * pool_stride_x;
- const int idx_height = id.z() * pool_stride_y;
- const int pool_limit_y = pool_pad_top - idx_height;
- const int pool_limit_x = pool_pad_left - idx_width;
-
- const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y);
- const int pool_end_y = std::min(pool_size_y, window_src.z().end() + pool_limit_y);
- const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x);
- const int pool_end_x = std::min(pool_size_x, window_src.y().end() + pool_limit_x);
-
- int x_off = window_start_x;
- for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
- {
- if(pool_info.pool_type != PoolingType::MAX)
- {
- // Calculate scale
- const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
- pool_stride_y);
- const float16x8_t scale_v = vdupq_n_f16(scale);
-
- // Perform pooling
- vres = vdupq_n_f16(0.0f);
- for(int y = pool_start_y; y < pool_end_y; ++y)
- {
- for(int x = pool_start_x; x < pool_end_x; ++x)
- {
- const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z())) + x_off);
-
- // Get power of 2 in case of l2 pooling and accumulate
- if(pool_info.pool_type == PoolingType::L2)
- {
- vres = vaddq_f16(vres, vmulq_f16(data, data));
- }
- else
- {
- vres = vaddq_f16(vres, data);
- }
- }
- }
- // Divide by scale
- vres = vmulq_f16(vres, scale_v);
- }
- else
- {
- vres = vdupq_n_f16(std::numeric_limits<float>::lowest());
-
- for(int y = pool_start_y; y < pool_end_y; ++y)
- {
- for(int x = pool_start_x; x < pool_end_x; ++x)
- {
- const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z())) + x_off);
- vres = vmaxq_f16(vres, data);
- }
- }
- }
-
- // Calculate square-root in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
- {
- float16x8_t sqrt_reciprocal = vrsqrteq_f16(vres);
- vres = vmulq_f16(vres, vmulq_f16(vrsqrtsq_f16(vmulq_f16(vres, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal));
- }
-
- // Store result
- vst1q_f16(reinterpret_cast<float16_t *>(out.ptr()) + x_off, vres);
- }
-
- // Left-overs loop
- for(; x_off < window_end_x; ++x_off)
- {
- float16_t res = 0.0f;
-
- if(pool_info.pool_type != PoolingType::MAX)
- {
- // Calculate scale
- const float16_t scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
- pool_stride_y);
-
- for(int y = pool_start_y; y < pool_end_y; ++y)
- {
- for(int x = pool_start_x; x < pool_end_x; ++x)
- {
- const float data = *(reinterpret_cast<const float16_t *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z())) + x_off);
-
- // Get power of 2 in case of l2 pooling and accumulate
- if(pool_info.pool_type == PoolingType::L2)
- {
- res += data * data;
- }
- else
- {
- res += data;
- }
- }
- }
-
- // Divide by scale
- res *= scale;
- }
- else
- {
- res = std::numeric_limits<float>::lowest();
- for(int y = pool_start_y; y < pool_end_y; ++y)
- {
- for(int x = pool_start_x; x < pool_end_x; ++x)
- {
- const float16_t data = *(reinterpret_cast<const float16_t *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z())) + x_off);
- res = std::max(res, data);
- }
- }
- }
-
- // Calculate square-root in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
- {
- res = std::sqrt(res);
- }
-
- // Store result
- *(reinterpret_cast<float16_t *>(out.ptr()) + x_off) = res;
- }
- },
- in, out);
-}
-} // namespace cpu
-} // namespace arm_compute
-
-#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
\ No newline at end of file
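The fp16 index path above has to bridge lane widths: byte offsets are computed as 32-bit values, narrowed to uint16 so the select can run at float16 lane width (eight lanes), then widened back to uint32 for storage. A minimal restatement of that narrow/select/widen round trip (assumes the offsets fit in 16 bits, as the kernel does):

    #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
    #include <arm_neon.h>

    // Select per-lane between two sets of 32-bit offsets using a float16 comparison,
    // by narrowing offsets to 16 bits, selecting, then widening the winners back.
    inline void select_offsets_f16(float16x8_t a, float16x8_t b,
                                   uint32x4_t off_a_lo, uint32x4_t off_a_hi,
                                   uint32x4_t off_b_lo, uint32x4_t off_b_hi,
                                   uint32x4_t *out_lo, uint32x4_t *out_hi)
    {
        const uint16x8_t off_a = vcombine_u16(vmovn_u32(off_a_lo), vmovn_u32(off_a_hi));
        const uint16x8_t off_b = vcombine_u16(vmovn_u32(off_b_lo), vmovn_u32(off_b_hi));
        const uint16x8_t sel   = vbslq_u16(vcgeq_f16(a, b), off_a, off_b); // a >= b ? off_a : off_b
        *out_lo = vmovl_u16(vget_low_u16(sel));
        *out_hi = vmovl_u16(vget_high_u16(sel));
    }
    #endif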
diff --git a/src/core/cpu/kernels/pooling/neon/fp32.cpp b/src/core/cpu/kernels/pooling/neon/fp32.cpp
deleted file mode 100644
index 4e41fdec7f..0000000000
--- a/src/core/cpu/kernels/pooling/neon/fp32.cpp
+++ /dev/null
@@ -1,314 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
-#include "src/core/cpu/kernels/pooling/neon/list.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace
-{
-void pooling2_f32_maxpool_indices(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
-{
- const int window_start_x = window.x().start();
- const int window_end_x = window.x().end();
- const int window_step_x = 4;
-
- Window window_out = window;
- window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator in(src, window_src);
- Iterator out(dst0, window_out);
- Iterator indices(dst1, window_out);
-
- const int pool_pad_top = pool_info.pad_stride_info.pad_top();
- const int pool_pad_left = pool_info.pad_stride_info.pad_left();
-
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
-
- float32x4_t vres;
- float res;
-
- const int pad_right = src->info()->padding().right;
- const int pad_left = src->info()->padding().left;
- const int pad_horizontal = pad_right + pad_left;
- const int in_stride_y = static_cast<int>(src->info()->strides_in_bytes().y());
- const int in_stride_z = static_cast<int>(src->info()->strides_in_bytes().z());
-
- execute_window_loop(window_out, [&](const Coordinates & id)
- {
- const int idx_width = id.y() * pool_stride_x;
- const int idx_height = id.z() * pool_stride_y;
- const int pool_limit_y = pool_pad_top - idx_height;
- const int pool_limit_x = pool_pad_left - idx_width;
-
- const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y);
- const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x);
-
- const int in_x0_offset = (pool_start_x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z());
- const int in_x1_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z());
- const int in_x2_offset = (pool_start_x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z());
- const int in_x3_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z());
-
- int x_off = window_start_x;
- for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
- {
- const auto in_x0_ptr = reinterpret_cast<const float *>(in.ptr() + in_x0_offset);
- const auto in_x1_ptr = reinterpret_cast<const float *>(in.ptr() + in_x1_offset);
- const auto in_x2_ptr = reinterpret_cast<const float *>(in.ptr() + in_x2_offset);
- const auto in_x3_ptr = reinterpret_cast<const float *>(in.ptr() + in_x3_offset);
- const auto v_x0 = vld1q_f32(in_x0_ptr + x_off);
- const auto v_x1 = vld1q_f32(in_x1_ptr + x_off);
- const auto v_x2 = vld1q_f32(in_x2_ptr + x_off);
- const auto v_x3 = vld1q_f32(in_x3_ptr + x_off);
- vres = vmaxq_f32(vmaxq_f32(v_x2, v_x3), vmaxq_f32(v_x0, v_x1));
- // Store result
- vst1q_f32(reinterpret_cast<float *>(out.ptr()) + x_off, vres);
-
- const uint32_t offset_base = offset_no_padding<float>(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NHWC);
- const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float) + x_off;
- const uint32_t offset_x1 = (uint32_t)offset_x0 + in_stride_y / sizeof(float) - pad_horizontal;
- const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float) - pad_horizontal * src->info()->tensor_shape()[1];
- const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float) - pad_horizontal;
- const uint32x4_t voffset_x0 = { offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3 };
- const uint32x4_t voffset_x1 = { offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3 };
- const uint32x4_t voffset_x2 = { offset_x2, offset_x2 + 1, offset_x2 + 2, offset_x2 + 3 };
- const uint32x4_t voffset_x3 = { offset_x3, offset_x3 + 1, offset_x3 + 2, offset_x3 + 3 };
- const uint32x4_t tmp_indices0 = vbslq_u32(vcgeq_f32(v_x0, v_x1), voffset_x0, voffset_x1);
- const uint32x4_t tmp_indices1 = vbslq_u32(vcgeq_f32(v_x2, v_x3), voffset_x2, voffset_x3);
- const uint32x4_t tmp_indices2 = vbslq_u32(vcgeq_f32(vmaxq_f32(v_x0, v_x1), vmaxq_f32(v_x2, v_x3)), tmp_indices0, tmp_indices1);
-
- // Store indices
- vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off, tmp_indices2);
- }
-
- // Left-overs loop
- for(; x_off < window_end_x; ++x_off)
- {
- const auto x0 = *(reinterpret_cast<const float *>(in.ptr() + in_x0_offset) + x_off);
- const auto x1 = *(reinterpret_cast<const float *>(in.ptr() + in_x1_offset) + x_off);
- const auto x2 = *(reinterpret_cast<const float *>(in.ptr() + in_x2_offset) + x_off);
- const auto x3 = *(reinterpret_cast<const float *>(in.ptr() + in_x3_offset) + x_off);
- res = std::max(std::max(x2, x3), std::max(x0, x1));
-
- // Store result
- *(reinterpret_cast<float *>(out.ptr()) + x_off) = res;
-
- const uint32_t offset_base = offset_no_padding<float>(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NHWC);
- const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float) + x_off;
- const uint32_t offset_x1 = (uint32_t)offset_x0 + in_stride_y / sizeof(float) - pad_horizontal;
- const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float) - pad_horizontal * src->info()->tensor_shape()[1];
- const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float) - pad_horizontal;
- const uint32_t tmp_idx0 = (x0 >= x1) ? offset_x0 : offset_x1;
- const uint32_t tmp_idx1 = (x2 >= x3) ? offset_x2 : offset_x3;
- const uint32_t tmp_idx2 = (std::max(x0, x1) >= std::max(x2, x3)) ? tmp_idx0 : tmp_idx1;
-
- // Store indices
- *(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off) = tmp_idx2;
- }
- },
- in, out, indices);
-}
-} // namespace
-
-void poolingMxN_fp32_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
-{
- if(pool_info.pool_size == Size2D(2, 2) && pool_info.pool_type == PoolingType::MAX && dst1)
- {
- pooling2_f32_maxpool_indices(src, dst0, dst1, pool_info, window_src, window);
- }
- else
- {
- const int window_start_x = window.x().start();
- const int window_end_x = window.x().end();
- const int window_step_x = 4;
-
- Window window_out = window;
- window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator in(src, window_src);
- Iterator out(dst0, window_out);
-
- const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width;
- const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height;
- const int pool_pad_right = pool_info.pad_stride_info.pad_right();
- const int pool_pad_top = pool_info.pad_stride_info.pad_top();
- const int pool_pad_left = pool_info.pad_stride_info.pad_left();
- const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
- const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right);
- const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
-
- float32x4_t vres;
-
- execute_window_loop(window_out, [&](const Coordinates & id)
- {
- const int idx_width = id.y() * pool_stride_x;
- const int idx_height = id.z() * pool_stride_y;
- const int pool_limit_y = pool_pad_top - idx_height;
- const int pool_limit_x = pool_pad_left - idx_width;
-
- const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y);
- const int pool_end_y = std::min(pool_size_y, window_src.z().end() + pool_limit_y);
- const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x);
- const int pool_end_x = std::min(pool_size_x, window_src.y().end() + pool_limit_x);
-
- int x_off = window_start_x;
- for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
- {
- if(pool_info.pool_type != PoolingType::MAX)
- {
- // Calculate scale
- const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
- pool_stride_y);
- const float32x4_t scale_v = vdupq_n_f32(scale);
-
- // Perform pooling
- vres = vdupq_n_f32(0.0f);
-
- for(int y = pool_start_y; y < pool_end_y; ++y)
- {
- for(int x = pool_start_x; x < pool_end_x; ++x)
- {
- const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z())) + x_off);
-
- // Get power of 2 in case of l2 pooling and accumulate
- if(pool_info.pool_type == PoolingType::L2)
- {
- vres = vmlaq_f32(vres, data, data);
- }
- else
- {
- vres = vaddq_f32(vres, data);
- }
- }
- }
- // Divide by scale
- vres = vmulq_f32(vres, scale_v);
- }
- else
- {
- vres = vdupq_n_f32(std::numeric_limits<float>::lowest());
- for(int y = pool_start_y; y < pool_end_y; ++y)
- {
- for(int x = pool_start_x; x < pool_end_x; ++x)
- {
- const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z())) + x_off);
- vres = vmaxq_f32(vres, data);
- }
- }
- }
-
- // Calculate square-root in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
- {
- float32x4_t l2_res = { static_cast<float>(sqrt(vgetq_lane_f32(vres, 0))),
- static_cast<float>(sqrt(vgetq_lane_f32(vres, 1))),
- static_cast<float>(sqrt(vgetq_lane_f32(vres, 2))),
- static_cast<float>(sqrt(vgetq_lane_f32(vres, 3)))
- };
- vres = l2_res;
- }
-
- // Store result
- vst1q_f32(reinterpret_cast<float *>(out.ptr()) + x_off, vres);
- }
-
- // Left-overs loop
- for(; x_off < window_end_x; ++x_off)
- {
- float res = 0.0f;
-
- if(pool_info.pool_type != PoolingType::MAX)
- {
- // Calculate scale
- const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
- pool_stride_y);
-
- for(int y = pool_start_y; y < pool_end_y; ++y)
- {
- for(int x = pool_start_x; x < pool_end_x; ++x)
- {
- const float data = *(reinterpret_cast<const float *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z())) + x_off);
-
- // Get power of 2 in case of l2 pooling and accumulate
- if(pool_info.pool_type == PoolingType::L2)
- {
- res += data * data;
- }
- else
- {
- res += data;
- }
- }
- }
-
- // Divide by scale
- res *= scale;
- }
- else
- {
- res = std::numeric_limits<float>::lowest();
- for(int y = pool_start_y; y < pool_end_y; ++y)
- {
- for(int x = pool_start_x; x < pool_end_x; ++x)
- {
- const float data = *(reinterpret_cast<const float *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z())) + x_off);
- res = std::max(res, data);
- }
- }
- }
-
- // Calculate square-root in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
- {
- res = std::sqrt(res);
- }
-
- // Store result
- *(reinterpret_cast<float *>(out.ptr()) + x_off) = res;
- }
- },
- in, out);
- }
-}
-} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
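The index bookkeeping in pooling2_f32_maxpool_indices reduces, per lane, to a three-way select that the scalar leftover loop spells out directly. The same selection logic as a standalone scalar helper, matching the kernel's tie-breaking (>= favours x0 over x1 and the top pair over the bottom pair):

    #include <algorithm>
    #include <cstdint>

    // Given the four 2x2 window values and their flattened source offsets, return
    // the offset of the maximum element.
    inline uint32_t argmax_2x2(float x0, float x1, float x2, float x3,
                               uint32_t off0, uint32_t off1, uint32_t off2, uint32_t off3)
    {
        const uint32_t top    = (x0 >= x1) ? off0 : off1;             // row 0 winner
        const uint32_t bottom = (x2 >= x3) ? off2 : off3;             // row 1 winner
        return (std::max(x0, x1) >= std::max(x2, x3)) ? top : bottom; // overall winner
    }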
diff --git a/src/core/cpu/kernels/pooling/neon/list.h b/src/core/cpu/kernels/pooling/neon/list.h
deleted file mode 100644
index bec1536f61..0000000000
--- a/src/core/cpu/kernels/pooling/neon/list.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_CORE_NEON_KERNELS_POOLING_LIST_H
-#define SRC_CORE_NEON_KERNELS_POOLING_LIST_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/cpu/kernels/pooling/neon/quantized.h"
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace cpu
-{
-#define DECLARE_POOLING_KERNEL(func_name) \
- void func_name(const ITensor *src0, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &, const Window &window_src, const Window &window)
-
-DECLARE_POOLING_KERNEL(poolingMxN_qasymm8_neon_nhwc);
-DECLARE_POOLING_KERNEL(poolingMxN_qasymm8_signed_neon_nhwc);
-DECLARE_POOLING_KERNEL(poolingMxN_fp16_neon_nhwc);
-DECLARE_POOLING_KERNEL(poolingMxN_fp32_neon_nhwc);
-
-#if defined(ENABLE_NCHW_KERNELS)
-
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
-DECLARE_POOLING_KERNEL(pooling2_fp16_neon_nchw);
-DECLARE_POOLING_KERNEL(pooling3_fp16_neon_nchw);
-DECLARE_POOLING_KERNEL(poolingMxN_fp16_neon_nchw);
-#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
-
-DECLARE_POOLING_KERNEL(pooling2_fp32_neon_nchw);
-DECLARE_POOLING_KERNEL(pooling3_fp32_neon_nchw);
-DECLARE_POOLING_KERNEL(pooling7_fp32_neon_nchw);
-DECLARE_POOLING_KERNEL(poolingMxN_fp32_neon_nchw);
-#endif /* defined(ENABLE_NCHW_KERNELS) */
-
-#undef DECLARE_POOLING_KERNEL
-
-template <typename T>
-inline uint32_t offset_no_padding(uint32_t padded_offset, const Coordinates &id, const ITensorInfo &info, int pool_stride_x, int pool_stride_y, DataLayout data_layout)
-{
- const int pad_left = info.padding().left;
- const int pad_right = info.padding().right;
- const int pad_top = info.padding().top;
- const int pad_bottom = info.padding().bottom;
- const int in_stride_y = static_cast<int>(info.strides_in_bytes().y());
- const int in_stride_w = static_cast<int>(info.strides_in_bytes()[3]);
- const int pad_horiz = pad_left + pad_right;
- const int pad_vert = pad_top + pad_bottom;
-
- if(data_layout == DataLayout::NCHW)
- {
- const uint32_t offset_base = padded_offset
- - sizeof(T) * pad_horiz * id.y() * pool_stride_y /* subtract padding elems per row */
- - pad_top * sizeof(T) /* top padding */
-                              - sizeof(T) * pad_horiz * info.tensor_shape()[1] * id.z() - pad_vert * in_stride_y * id.z() /* for each Z plane there are height * pad_horiz padding elems */
- - in_stride_w * id[3];
-
- return offset_base;
- }
- else
- {
- const uint32_t offset_base = padded_offset
- - sizeof(T) * pad_horiz * id.y() * pool_stride_x // subtract padding elems per row
- - pad_top * sizeof(T) // top padding
-                              - sizeof(T) * pad_horiz * info.tensor_shape()[1] * id.z() * pool_stride_y // for each Z plane there are width * pad_horiz padding elems
- - in_stride_w * id[3];
-
- return offset_base;
- }
-}
-} // namespace cpu
-} // namespace arm_compute
-
-#endif // SRC_CORE_NEON_KERNELS_POOLING_LIST_H
\ No newline at end of file
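The offset_no_padding() helper above is what lets the max-unpool path record indices into the logical (unpadded) tensor even though the kernel walks a padded buffer. A toy check of the NCHW bookkeeping, assuming a single plane, float data and purely horizontal padding (hypothetical sizes, not the real helper):

#include <cassert>
#include <cstdint>

int main()
{
    // Logical plane: width=4; padded rows carry 1 element of padding each side.
    const int pad_left = 1, pad_horiz = 2, width = 4, elem = sizeof(float);

    // Padded byte offset of logical element (x=2, y=1).
    const uint32_t padded = (1 * (width + pad_horiz) + pad_left + 2) * elem;

    // Subtract the horizontal padding of earlier rows plus the left padding,
    // the same per-row correction offset_no_padding() applies.
    const uint32_t unpadded = padded - elem * pad_horiz * 1 - pad_left * elem;

    assert(unpadded / elem == 1 * width + 2); // element index 6, as expected
    return 0;
}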
diff --git a/src/core/cpu/kernels/pooling/neon/nchw/all.cpp b/src/core/cpu/kernels/pooling/neon/nchw/all.cpp
deleted file mode 100644
index 80eac684aa..0000000000
--- a/src/core/cpu/kernels/pooling/neon/nchw/all.cpp
+++ /dev/null
@@ -1,700 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
-#include "src/core/cpu/kernels/pooling/neon/list.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#ifdef ENABLE_NCHW_KERNELS
-namespace arm_compute
-{
-namespace cpu
-{
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-void pooling3_fp16_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
-{
- ARM_COMPUTE_UNUSED(dst1);
- ARM_COMPUTE_UNUSED(pool_info.pool_type);
- ARM_COMPUTE_UNUSED(pool_info.exclude_padding);
-
- Iterator in(src, window_src);
- Iterator out(dst0, window);
-
- constexpr const int pool_size = 3;
- const int pool_pad_right = pool_info.pad_stride_info.pad_right();
- const int pool_pad_top = pool_info.pad_stride_info.pad_top();
- const int pool_pad_left = pool_info.pad_stride_info.pad_left();
- const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
- const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right);
- const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
-
- const unsigned char *const src_top_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
- const unsigned char *const src_middle_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
- const unsigned char *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- float16x4_t top_data = vld1_f16(reinterpret_cast<const float16_t *>(src_top_ptr + in.offset()));
- float16x4_t middle_data = vld1_f16(reinterpret_cast<const float16_t *>(src_middle_ptr + in.offset()));
- float16x4_t bottom_data = vld1_f16(reinterpret_cast<const float16_t *>(src_bottom_ptr + in.offset()));
- float16x4_t res = {};
-
- // Get power of 2 in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
- {
- top_data = vmul_f16(top_data, top_data);
- middle_data = vmul_f16(middle_data, middle_data);
- bottom_data = vmul_f16(bottom_data, bottom_data);
- }
-
- if(pool_info.pool_type != PoolingType::MAX)
- {
- // Calculate scale
- const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
- pool_stride_y);
- const float16x4_t scale_v = vdup_n_f16(scale);
- // Perform pooling
- const float16x4_t sum_data = vadd_f16(vadd_f16(top_data, bottom_data), middle_data);
- res = vpadd_f16(vset_lane_f16(0.f, sum_data, 3), sum_data);
- res = vmul_f16(vpadd_f16(res, res), scale_v);
- }
- else
- {
- const float16x4_t max_data = vmax_f16(vmax_f16(top_data, bottom_data), middle_data);
- res = vpmax_f16(vset_lane_f16(-std::numeric_limits<float>::max(), max_data, 3), max_data);
- res = vpmax_f16(res, res);
- }
-
- // Calculate square-root in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
- {
- res = vinv_f16(vinvsqrt_f16(res));
- }
-
- *(reinterpret_cast<float16_t *>(out.ptr())) = vget_lane_f16(res, 0);
- },
- in, out);
-}
-
-template <typename T>
-inline typename std::enable_if<std::is_same<T, float16_t>::value, float32x2_t>::type
-f16_to_f32(float16x4_t in)
-{
- float32x2_t out = { static_cast<float>(vget_lane_f16(in, 0)), static_cast<float>(vget_lane_f16(in, 1)) };
- return out;
-}
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
-template <typename T>
-inline typename std::enable_if<std::is_same<T, float>::value, float32x2_t>::type
-f16_to_f32(float32x2_t in)
-{
- return in;
-}
-
-template <typename T>
-void pooling2_nchw_maxpool_indices(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
-{
- Iterator in(src, window_src);
- Iterator out(dst0, window);
- Iterator indices(dst1, window);
- const int pool_pad_top = pool_info.pad_stride_info.pad_top();
- const int pool_pad_left = pool_info.pad_stride_info.pad_left();
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
- const uint8_t *const src_top_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
- const uint8_t *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
- const int pad_left = src->info()->padding().left;
- const int pad_right = src->info()->padding().right;
- const int in_stride_y = static_cast<int>(src->info()->strides_in_bytes().y());
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- auto top_data = wrapper::vload(reinterpret_cast<const T *>(src_top_ptr + in.offset()));
- auto bottom_data = wrapper::vload(reinterpret_cast<const T *>(src_bottom_ptr + in.offset()));
- float32x2_t top_data_f32 = f16_to_f32<T>(top_data);
- float32x2_t bottom_data_f32 = f16_to_f32<T>(bottom_data);
-
-        // Calculate max data, compare top first, then bottom, to make sure the first max is recorded.
- const float32x2_t max_data_top = vpmax_f32(top_data_f32, top_data_f32);
- const float32x2_t max_data_bottom = vpmax_f32(bottom_data_f32, bottom_data_f32);
- const float32x2_t max_data = vmax_f32(max_data_top, max_data_bottom);
- *(reinterpret_cast<T *>(out.ptr())) = static_cast<T>(vget_lane_f32(max_data, 0));
-
-        // Calculate the max data index, which will be used in max unpooling.
- const uint32_t offset_base = offset_no_padding<T>(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NCHW);
- const uint32_t offset_top = (uint32_t)(offset_base / sizeof(T));
- const uint32_t offset_bottom = offset_top + in_stride_y / sizeof(T) - pad_right - pad_left;
- const uint32x2_t voffset_top = { offset_top, offset_top + 1u };
- const uint32x2_t voffset_bottom = { offset_bottom, offset_bottom + 1u };
- const uint32x2_t tmp_indices_top = vbsl_u32(vcge_f32(top_data_f32, vrev64_f32(top_data_f32)), voffset_top, vrev64_u32(voffset_top));
- const uint32x2_t tmp_indices_bottom = vbsl_u32(vcge_f32(bottom_data_f32, vrev64_f32(bottom_data_f32)), voffset_bottom, vrev64_u32(voffset_bottom));
- *(reinterpret_cast<int *>(indices.ptr())) = vget_lane_u32(vbsl_u32(vcge_f32(max_data_top, max_data_bottom), tmp_indices_top, tmp_indices_bottom), 0);
- },
- in, out, indices);
-}
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-void pooling2_fp16_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
-{
- if(pool_info.pool_type == PoolingType::MAX && dst1)
- {
- pooling2_nchw_maxpool_indices<float16_t>(src, dst0, dst1, pool_info, window_src, window);
- }
- else
- {
- Iterator in(src, window_src);
- Iterator out(dst0, window);
- constexpr int pool_size = 2;
- const int pool_pad_right = pool_info.pad_stride_info.pad_right();
- const int pool_pad_top = pool_info.pad_stride_info.pad_top();
- const int pool_pad_left = pool_info.pad_stride_info.pad_left();
- const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
-        int              pool_stride_x = 0;
-        int              pool_stride_y = 0;
- std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
- const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right);
- const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
-
- const unsigned char *const src_top_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
- const unsigned char *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- float16x4_t top_data = vld1_f16(reinterpret_cast<const float16_t *>(src_top_ptr + in.offset()));
- float16x4_t bottom_data = vld1_f16(reinterpret_cast<const float16_t *>(src_bottom_ptr + in.offset()));
- float16x4_t res = {};
-
- // Get power of 2 in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
- {
- top_data = vmul_f16(top_data, top_data);
- bottom_data = vmul_f16(bottom_data, bottom_data);
- }
-
- if(pool_info.pool_type != PoolingType::MAX)
- {
- const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
- pool_stride_y);
- const float16x4_t scale_v = vdup_n_f16(scale);
-
- const float16x4_t sum_data = vadd_f16(top_data, bottom_data);
- res = vmul_f16(vpadd_f16(sum_data, sum_data), scale_v);
- }
- else
- {
- const float16x4_t max_data = vmax_f16(top_data, bottom_data);
- res = vpmax_f16(max_data, max_data);
- }
-
- // Calculate square-root in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
- {
- res = vinv_f16(vinvsqrt_f16(res));
- }
-
- // Store result
- *(reinterpret_cast<float16_t *>(out.ptr())) = vget_lane_f16(res, 0);
- },
- in, out);
- }
-}
-
-void poolingMxN_fp16_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
-{
- ARM_COMPUTE_UNUSED(dst1);
- Iterator in(src, window_src);
- Iterator out(dst0, window);
-
- const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width;
- const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.height;
- const int pool_pad_right = pool_info.pad_stride_info.pad_right();
- const int pool_pad_top = pool_info.pad_stride_info.pad_top();
- const int pool_pad_left = pool_info.pad_stride_info.pad_left();
- const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
- const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right);
- const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- float16_t res = 0.0f;
- float16x8_t vres = vdupq_n_f16(0.0f);
-
- if(pool_info.pool_type != PoolingType::MAX)
- {
- // Calculate scale
- const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
- pool_stride_y);
-
- // Perform pooling
-
- for(int y = 0; y < pool_size_y; ++y)
- {
- int x = 0;
- for(; x <= (pool_size_x - 8); x += 8)
- {
- const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().y())));
-
- // Get power of 2 in case of l2 pooling and accumulate
- if(pool_info.pool_type == PoolingType::L2)
- {
- vres = vaddq_f16(vres, vmulq_f16(data, data));
- }
- else
- {
- vres = vaddq_f16(vres, data);
- }
- }
-
- // Leftover for loop
- for(; x < pool_size_x; ++x)
- {
- float16_t data = *(reinterpret_cast<const float16_t *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x())
- + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y())));
-
- // Get power of 2 in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
- {
- data *= data;
- }
-
- res += data;
- }
- }
-
- // Reduction
- float16x4_t tmp = vpadd_f16(vget_high_f16(vres), vget_low_f16(vres));
- res += vget_lane_f16(tmp, 0);
- res += vget_lane_f16(tmp, 1);
- res += vget_lane_f16(tmp, 2);
- res += vget_lane_f16(tmp, 3);
-
- // Divide by scale
- res *= scale;
- }
- else
- {
- float16x8_t vres = vdupq_n_f16(std::numeric_limits<float>::lowest());
- res = std::numeric_limits<float>::lowest();
-
- for(int y = 0; y < pool_size_y; ++y)
- {
- int x = 0;
- for(; x <= (pool_size_x - 8); x += 8)
- {
- const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().y())));
- vres = vmaxq_f16(vres, data);
- }
-
- // Leftover for loop
- for(; x < pool_size_x; ++x)
- {
- const float16_t data = *(reinterpret_cast<const float16_t *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x())
- + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y())));
- res = std::max(res, data);
- }
- }
-
- float16x4_t tmp = vpmax_f16(vget_high_f16(vres), vget_low_f16(vres));
- res = std::max(res, vget_lane_f16(tmp, 0));
- res = std::max(res, vget_lane_f16(tmp, 1));
- res = std::max(res, vget_lane_f16(tmp, 2));
- res = std::max(res, vget_lane_f16(tmp, 3));
- }
-
- // Calculate square-root in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
- {
- res = std::sqrt(res);
- }
-
- // Store result
- *(reinterpret_cast<float16_t *>(out.ptr())) = res;
- },
- in, out);
-}
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
-void poolingMxN_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
-{
- ARM_COMPUTE_UNUSED(dst1);
- Iterator in(src, window_src);
- Iterator out(dst0, window);
-
- const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width;
- const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.height;
- const int pool_pad_right = pool_info.pad_stride_info.pad_right();
- const int pool_pad_top = pool_info.pad_stride_info.pad_top();
- const int pool_pad_left = pool_info.pad_stride_info.pad_left();
- const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
- const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right);
- const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- float res = 0.0f;
-
- if(pool_info.pool_type != PoolingType::MAX)
- {
- // Calculate scale
- const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
- pool_stride_y);
-
- // Perform pooling
- float32x4_t vres = vdupq_n_f32(0.0f);
-
- for(int y = 0; y < pool_size_y; ++y)
- {
- int x = 0;
- for(; x <= (pool_size_x - 4); x += 4)
- {
- const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().y())));
-
- // Get power of 2 in case of l2 pooling and accumulate
- if(pool_info.pool_type == PoolingType::L2)
- {
- vres = vmlaq_f32(vres, data, data);
- }
- else
- {
- vres = vaddq_f32(vres, data);
- }
- }
-
- // Leftover for loop
- for(; x < pool_size_x; ++x)
- {
- float data = *(reinterpret_cast<const float *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().y())));
-
- // Get power of 2 in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
- {
- data *= data;
- }
-
- res += data;
- }
- }
-
-#if defined(__aarch64__)
- // Reduction operation available on 64 bit architectures only
- res += vaddvq_f32(vres);
-#else // __aarch64__
- // Reduction
- float32x2_t tmp = vpadd_f32(vget_high_f32(vres), vget_low_f32(vres));
- tmp = vpadd_f32(tmp, tmp);
-
- res += vget_lane_f32(tmp, 0);
-#endif // __aarch64__
- // Divide by scale
- res *= scale;
- }
- else
- {
- float32x4_t vres = vdupq_n_f32(std::numeric_limits<float>::lowest());
- res = std::numeric_limits<float>::lowest();
-
- for(int y = 0; y < pool_size_y; ++y)
- {
- int x = 0;
- for(; x <= (pool_size_x - 4); x += 4)
- {
- const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().y())));
- vres = vmaxq_f32(vres, data);
- }
-
- // Leftover for loop
- for(; x < pool_size_x; ++x)
- {
- const float data = *(reinterpret_cast<const float *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().y())));
- res = std::max(res, data);
- }
- }
-#if defined(__aarch64__)
- // Reduction operation available on 64 bit architectures only
- res = std::max(vmaxvq_f32(vres), res);
-#else // __aarch64__
- float32x2_t tmp = vpmax_f32(vget_high_f32(vres), vget_low_f32(vres));
- tmp = vpmax_f32(tmp, tmp);
-
- res = std::max(res, vget_lane_f32(tmp, 0));
-#endif // __aarch64__
- }
-
- // Calculate square-root in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
- {
- res = std::sqrt(res);
- }
-
- // Store result
- *(reinterpret_cast<float *>(out.ptr())) = res;
- },
- in, out);
-}
-
-void pooling2_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
-{
- if(pool_info.pool_type == PoolingType::MAX && dst1)
- {
- pooling2_nchw_maxpool_indices<float>(src, dst0, dst1, pool_info, window_src, window);
- }
- else
- {
- Iterator in(src, window_src);
- Iterator out(dst0, window);
- constexpr int pool_size = 2;
- const int pool_pad_right = pool_info.pad_stride_info.pad_right();
- const int pool_pad_top = pool_info.pad_stride_info.pad_top();
- const int pool_pad_left = pool_info.pad_stride_info.pad_left();
- const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
- const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right);
- const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
-
- const uint8_t *const src_top_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
- const uint8_t *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto in_top_ptr = reinterpret_cast<const float *>(src_top_ptr + in.offset());
- const auto in_bottom_ptr = reinterpret_cast<const float *>(src_bottom_ptr + in.offset());
- float32x2_t top_data = vld1_f32(in_top_ptr);
- float32x2_t bottom_data = vld1_f32(in_bottom_ptr);
- float32x2_t res = {};
- float final_res = 0;
- // Get power of 2 in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
- {
- top_data = vmul_f32(top_data, top_data);
- bottom_data = vmul_f32(bottom_data, bottom_data);
- }
-
- if(pool_info.pool_type != PoolingType::MAX)
- {
- // Calculate scale
- float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
- pool_stride_y);
- const float32x2_t scale_v = vdup_n_f32(scale);
-
- // Perform pooling
- const float32x2_t sum_data = vadd_f32(top_data, bottom_data);
- res = vmul_f32(vpadd_f32(sum_data, sum_data), scale_v);
- }
- else
- {
- const float32x2_t max_data = vmax_f32(top_data, bottom_data);
- res = vpmax_f32(max_data, max_data);
- }
- final_res = vget_lane_f32(res, 0);
-
- // Calculate square-root in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
- {
- final_res = sqrt(final_res);
- }
-
- // Store result
- *(reinterpret_cast<float *>(out.ptr())) = final_res;
- },
- in, out);
- }
-}
-
-void pooling3_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
-{
- ARM_COMPUTE_UNUSED(dst1);
- Iterator in(src, window_src);
- Iterator out(dst0, window);
-
- constexpr const int pool_size = 3;
- const int pool_pad_right = pool_info.pad_stride_info.pad_right();
- const int pool_pad_top = pool_info.pad_stride_info.pad_top();
- const int pool_pad_left = pool_info.pad_stride_info.pad_left();
- const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
- const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right);
- const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
-
- const uint8_t *const src_top_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
- const uint8_t *const src_middle_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
- const uint8_t *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- float32x4_t top_data = vld1q_f32(reinterpret_cast<const float *>(src_top_ptr + in.offset()));
- float32x4_t middle_data = vld1q_f32(reinterpret_cast<const float *>(src_middle_ptr + in.offset()));
- float32x4_t bottom_data = vld1q_f32(reinterpret_cast<const float *>(src_bottom_ptr + in.offset()));
- float32x2_t res = {};
- float final_res = 0;
-
- // Get power of 2 in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
- {
- top_data = vmulq_f32(top_data, top_data);
- middle_data = vmulq_f32(middle_data, middle_data);
- bottom_data = vmulq_f32(bottom_data, bottom_data);
- }
-
- if(pool_info.pool_type != PoolingType::MAX)
- {
- // Calculate scale
- float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
- pool_stride_y);
- const float32x2_t scale_v = vdup_n_f32(scale);
-
- // Perform pooling
- const float32x4_t sum_data = vaddq_f32(vaddq_f32(top_data, bottom_data), middle_data);
- res = vpadd_f32(vget_high_f32(vsetq_lane_f32(0.f, sum_data, 3)), vget_low_f32(sum_data));
- res = vmul_f32(vpadd_f32(res, res), scale_v);
- }
- else
- {
- const float32x4_t max_data = vmaxq_f32(vmaxq_f32(top_data, bottom_data), middle_data);
- res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data, 3)), vget_low_f32(max_data));
- res = vpmax_f32(res, res);
- }
- final_res = vget_lane_f32(res, 0);
-
- // Calculate square-root in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
- {
- final_res = sqrt(final_res);
- }
-
- // Store result
- *(reinterpret_cast<float *>(out.ptr())) = final_res;
- },
- in, out);
-}
-
-void pooling7_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
-{
- ARM_COMPUTE_UNUSED(dst1);
- Iterator in(src, window_src);
- Iterator out(dst0, window);
-
- constexpr const int pool_size = 7;
- const int pool_pad_right = pool_info.pad_stride_info.pad_right();
- const int pool_pad_top = pool_info.pad_stride_info.pad_top();
- const int pool_pad_left = pool_info.pad_stride_info.pad_left();
- const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
- const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right);
- const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
-
- std::array<const uint8_t *, pool_size> src_ptrs{ {} };
- for(int i = 0; i < pool_size; ++i)
- {
- src_ptrs[i] = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + i));
- }
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- float32x2_t res = {};
- float final_res = 0.f;
- if(pool_info.pool_type != PoolingType::MAX)
- {
- // Calculate scale
- float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
- pool_stride_y);
- const float32x2_t scale_v = vdup_n_f32(scale);
-
- // Perform pooling
- float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(src_ptrs[0] + in.offset()));
- // Get power of 2 in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
- {
- data.val[0] = vmulq_f32(data.val[0], data.val[0]);
- data.val[1] = vmulq_f32(data.val[1], data.val[1]);
- }
- float32x4_t sum_data = vaddq_f32(data.val[0], vsetq_lane_f32(0.f, data.val[1], 3));
- for(int i = 1; i < pool_size; ++i)
- {
- data = vld2q_f32(reinterpret_cast<const float *>(src_ptrs[i] + in.offset()));
- // Get power of 2 in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
- {
- data.val[0] = vmulq_f32(data.val[0], data.val[0]);
- data.val[1] = vmulq_f32(data.val[1], data.val[1]);
- }
- sum_data = vaddq_f32(sum_data, data.val[0]);
- sum_data = vaddq_f32(sum_data, vsetq_lane_f32(0.f, data.val[1], 3));
- }
- res = vpadd_f32(vget_high_f32(sum_data), vget_low_f32(sum_data));
- res = vmul_f32(vpadd_f32(res, res), scale_v);
- }
- else
- {
- float32x4x2_t max_data = vld2q_f32(reinterpret_cast<const float *>(src_ptrs[0] + in.offset()));
- for(int i = 1; i < pool_size; ++i)
- {
- const float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(src_ptrs[i] + in.offset()));
- max_data = vmax2q_f32(max_data, data);
- }
- res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data.val[1], 3)), vget_low_f32(max_data.val[1]));
- res = vpmax_f32(res, vpmax_f32(vget_high_f32(max_data.val[0]), vget_low_f32(max_data.val[0])));
- res = vpmax_f32(res, res);
- }
- final_res = vget_lane_f32(res, 0);
-
- // Calculate square-root in case of l2 pooling
- if(pool_info.pool_type == PoolingType::L2)
- {
- final_res = sqrt(final_res);
- }
-
- // Store result
- *(reinterpret_cast<float *>(out.ptr())) = final_res;
- },
- in, out);
-}
-} // namespace cpu
-} // namespace arm_compute
-
-#endif // ENABLE_NCHW_KERNELS
\ No newline at end of file
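The index-tracking trick in pooling2_nchw_maxpool_indices above (compare each pair against its reverse with vcge, select offsets with vbsl, then prefer the top row) unrolls to the scalar equivalent below, a sketch kept deliberately branch-based for readability:

#include <cstdint>
#include <utility>

// Scalar model of the 2x2 max-pool-with-index selection: pick the max of four
// values, preferring the top row and then the leftmost element on ties, so the
// *first* maximum's offset is the one recorded.
std::pair<float, uint32_t> max2x2_with_index(float t0, float t1, float b0, float b1,
                                             uint32_t off_top, uint32_t off_bottom)
{
    // vcge + vbsl on (row, reversed row): the earlier lane wins on ties.
    const float    top_max = (t0 >= t1) ? t0 : t1;
    const uint32_t top_idx = (t0 >= t1) ? off_top : off_top + 1u;
    const float    bot_max = (b0 >= b1) ? b0 : b1;
    const uint32_t bot_idx = (b0 >= b1) ? off_bottom : off_bottom + 1u;

    // Final vbsl on (max_data_top >= max_data_bottom): the top row wins ties.
    return (top_max >= bot_max) ? std::make_pair(top_max, top_idx)
                                : std::make_pair(bot_max, bot_idx);
}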
diff --git a/src/core/cpu/kernels/pooling/neon/qasymm8.cpp b/src/core/cpu/kernels/pooling/neon/qasymm8.cpp
deleted file mode 100644
index af62ede13f..0000000000
--- a/src/core/cpu/kernels/pooling/neon/qasymm8.cpp
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
-#include "src/core/cpu/kernels/pooling/neon/list.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void poolingMxN_qasymm8_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
-{
- poolingMxN_q8_neon_nhwc<uint8_t>(src, dst0, dst1, pool_info, window_src, window);
-}
-} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/cpu/kernels/pooling/neon/qasymm8_signed.cpp b/src/core/cpu/kernels/pooling/neon/qasymm8_signed.cpp
deleted file mode 100644
index 2c4b095225..0000000000
--- a/src/core/cpu/kernels/pooling/neon/qasymm8_signed.cpp
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
-#include "src/core/cpu/kernels/pooling/neon/list.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void poolingMxN_qasymm8_signed_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
-{
- poolingMxN_q8_neon_nhwc<int8_t>(src, dst0, dst1, pool_info, window_src, window);
-}
-} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
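qasymm8.cpp and qasymm8_signed.cpp above are deliberately thin: each instantiates the shared poolingMxN_q8_neon_nhwc<T> template from quantized.h for one element type, keeping one specialisation per translation unit. The pattern in miniature (names hypothetical, body a placeholder):

#include <cstdint>

// Shared implementation, parameterised on the quantized element type.
template <typename T>
void pool_impl(const T *src, T *dst, int n)
{
    for(int i = 0; i < n; ++i)
    {
        dst[i] = src[i]; // placeholder standing in for the real kernel body
    }
}

// One thin entry point per type; each .cpp instantiates exactly one of these.
void pool_u8(const uint8_t *src, uint8_t *dst, int n) { pool_impl<uint8_t>(src, dst, n); }
void pool_s8(const int8_t *src, int8_t *dst, int n) { pool_impl<int8_t>(src, dst, n); }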
diff --git a/src/core/cpu/kernels/pooling/neon/quantized.h b/src/core/cpu/kernels/pooling/neon/quantized.h
deleted file mode 100644
index a16960a205..0000000000
--- a/src/core/cpu/kernels/pooling/neon/quantized.h
+++ /dev/null
@@ -1,863 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_CORE_NEON_KERNELS_QUANTIZED_H
-#define SRC_CORE_NEON_KERNELS_QUANTIZED_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/NEAsymm.h"
-#include "src/core/NEON/NEFixedPoint.h"
-#include "src/core/NEON/NEMath.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace cpu
-{
-template <typename T>
-inline typename std::enable_if<std::is_same<T, int8_t>::value, int8_t>::type
-quantize(float val, const UniformQuantizationInfo &info)
-{
- return quantize_qasymm8_signed(val, info);
-}
-
-template <typename T>
-inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint8_t>::type
-quantize(float val, const UniformQuantizationInfo &info)
-{
- return quantize_qasymm8(val, info);
-}
-
-template <typename T>
-inline T vcvtq_q32_f32(float32x4_t values);
-
-template <>
-inline uint32x4_t vcvtq_q32_f32(float32x4_t values)
-{
- return vcvtq_u32_f32(values);
-}
-
-template <>
-inline int32x4_t vcvtq_q32_f32(float32x4_t values)
-{
- return vcvtq_s32_f32(values);
-}
-
-template <typename T>
-inline float32x4_t vcvtq_f32_q32(T values);
-
-template <>
-inline float32x4_t vcvtq_f32_q32(uint32x4_t values)
-{
- return vcvtq_f32_u32(values);
-}
-
-template <>
-inline float32x4_t vcvtq_f32_q32(int32x4_t values)
-{
- return vcvtq_f32_s32(values);
-}
-
-template <typename Tout>
-inline Tout vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset);
-
-template <>
-inline uint8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset)
-{
- const float new_scale = quant_rescale / scale_pooling;
- return vquantize(acc, UniformQuantizationInfo(new_scale, new_offset));
-}
-
-template <>
-inline int8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset)
-{
- const float new_scale = quant_rescale / scale_pooling;
- return vquantize_signed(acc, UniformQuantizationInfo(new_scale, new_offset));
-}
-
-template <typename Tin, typename Tout>
-inline Tout vrequantize_pooling(Tin vec1, Tin vec2, const UniformQuantizationInfo &requant_qinfo);
-
-template <>
-inline uint8x16_t vrequantize_pooling(uint8x8_t vec1, uint8x8_t vec2, const UniformQuantizationInfo &requant_qinfo)
-{
- const float32x4x4_t acc =
- {
- {
- vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec1))))),
- vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec1))))),
- vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec2))))),
- vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec2))))),
- }
- };
- return vquantize(acc, requant_qinfo);
-}
-
-template <>
-inline int8x16_t vrequantize_pooling(int8x8_t vec1, int8x8_t vec2, const UniformQuantizationInfo &requant_qinfo)
-{
- const float32x4x4_t acc =
- {
- {
- vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec1))))),
- vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec1))))),
- vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec2))))),
- vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec2))))),
- }
- };
- return vquantize_signed(acc, requant_qinfo);
-}
-
-template <typename T>
-inline T vrequantize_pooling(T &vec, const UniformQuantizationInfo &requant_qinfo);
-
-template <>
-inline uint8x8_t vrequantize_pooling(uint8x8_t &vec, const UniformQuantizationInfo &requant_qinfo)
-{
- const float32x4x2_t acc =
- {
- {
- vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec))))),
- vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec))))),
- }
- };
- return vquantize(acc, requant_qinfo);
-}
-
-template <>
-inline int8x8_t vrequantize_pooling(int8x8_t &vec, const UniformQuantizationInfo &requant_qinfo)
-{
- const float32x4x2_t acc =
- {
- {
- vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec))))),
- vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec))))),
- }
- };
- return vquantize_signed(acc, requant_qinfo);
-}
-
-inline float calculate_avg_scale(bool exclude_padding, DataLayout data_layout, const Coordinates &id, const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h,
- const int pad_x, const int pad_y, const int stride_x, const int stride_y)
-{
- const unsigned int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const unsigned int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-
- int start_x = id[idx_width] * stride_x - pad_x;
- int start_y = id[idx_height] * stride_y - pad_y;
-
- const int end_x = std::min(start_x + pool_size_x, upper_bound_w);
- const int end_y = std::min(start_y + pool_size_y, upper_bound_h);
- if(exclude_padding)
- {
- start_x = std::max(0, start_x);
- start_y = std::max(0, start_y);
- }
- return 1.f / ((end_y - start_y) * (end_x - start_x));
-}
-
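Worked numbers for calculate_avg_scale() above: for a 3x3 pool with stride 1 and pad 1 on an 8x8 input with exclude_padding enabled (so the upper bounds stay at 8), corner windows divide by 4, edge windows by 6 and interior windows by 9. A stand-alone check of that arithmetic:

#include <algorithm>
#include <cassert>

// Stand-alone copy of the scale computation for a 3x3/stride-1/pad-1 pool
// on an 8x8 input with exclude_padding=true.
float avg_scale(int out_x, int out_y)
{
    int start_x = out_x * 1 - 1, start_y = out_y * 1 - 1;
    const int end_x = std::min(start_x + 3, 8), end_y = std::min(start_y + 3, 8);
    start_x = std::max(0, start_x);
    start_y = std::max(0, start_y);
    return 1.f / ((end_y - start_y) * (end_x - start_x));
}

int main()
{
    assert(avg_scale(0, 0) == 1.f / 4); // corner: 2x2 valid elements
    assert(avg_scale(1, 0) == 1.f / 6); // edge: 3x2 valid elements
    assert(avg_scale(3, 3) == 1.f / 9); // interior: full 3x3 window
    return 0;
}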
-template <typename T>
-void poolingMxN_q8_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
-{
- ARM_COMPUTE_UNUSED(dst1);
-
- const int window_start_x = window.x().start();
- const int window_end_x = window.x().end();
- const int window_step_x = 16;
- const int window_half_step_x = window_step_x / 2;
-
- Window window_out = window;
- window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator in(src, window_src);
- Iterator out(dst0, window_out);
-
- using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
- using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type;
- using q16_t = typename wrapper::traits::promote_t<T>;
- using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
- using q32_t = typename wrapper::traits::promote_t<q16_t>;
- using q32x4_t = typename wrapper::traits::neon_vector<q32_t, 4>::type;
-
- const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width;
- const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height;
- const int pool_pad_right = pool_info.pad_stride_info.pad_right();
- const int pool_pad_top = pool_info.pad_stride_info.pad_top();
- const int pool_pad_left = pool_info.pad_stride_info.pad_left();
- const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
-
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
- const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right);
- const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
-
- const float32x4_t half_scale_v = vdupq_n_f32(0.5f);
- const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform();
- const UniformQuantizationInfo dst_qinfo = dst0->info()->quantization_info().uniform();
-
- const float quant_rescale = dst_qinfo.scale / src_qinfo.scale;
- // "new_offset" doesn't have to consider the "half_scale_v" in its computation
- // With a requantization performed in a single step there won't be uncertainties introduced
- const int32_t new_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / quant_rescale);
-
- const float requant_scale = dst_qinfo.scale / src_qinfo.scale;
- const int32_t requant_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale);
- const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset);
-
- execute_window_loop(window_out, [&](const Coordinates & id)
- {
- const int idx_width = id.y() * pool_stride_x;
- const int idx_height = id.z() * pool_stride_y;
- const int pool_limit_y = pool_pad_top - idx_height;
- const int pool_limit_x = pool_pad_left - idx_width;
-
- const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y);
- const int pool_end_y = std::min(pool_size_y, window_src.z().end() + pool_limit_y);
- const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x);
- const int pool_end_x = std::min(pool_size_x, window_src.y().end() + pool_limit_x);
-
- int x_off = window_start_x;
- for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
- {
- if(pool_info.pool_type != PoolingType::MAX)
- {
- q32x4_t vres1 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
- q32x4_t vres2 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
- q32x4_t vres3 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
- q32x4_t vres4 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
-
- // Calculate scale
- const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
- pool_stride_y);
-
- // Perform pooling
- for(int y = pool_start_y; y < pool_end_y; ++y)
- {
- for(int x = pool_start_x; x < pool_end_x; ++x)
- {
- const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z())) + x_off);
-
- const q16x8_t data_q16 = wrapper::vmovl(wrapper::vgetlow(data));
- const q16x8_t data2_q16 = wrapper::vmovl(wrapper::vgethigh(data));
- vres1 = wrapper::vadd(vres1, wrapper::vmovl(wrapper::vgetlow(data_q16)));
- vres2 = wrapper::vadd(vres2, wrapper::vmovl(wrapper::vgethigh(data_q16)));
- vres3 = wrapper::vadd(vres3, wrapper::vmovl(wrapper::vgetlow(data2_q16)));
- vres4 = wrapper::vadd(vres4, wrapper::vmovl(wrapper::vgethigh(data2_q16)));
- }
- }
-
- if(src_qinfo != dst_qinfo)
- {
- const float32x4x4_t vres =
- {
- {
- vcvtq_f32_q32(vres1),
- vcvtq_f32_q32(vres2),
- vcvtq_f32_q32(vres3),
- vcvtq_f32_q32(vres4),
- }
- };
- const auto requantized_dst = vrequantize_pooling_with_scale<q8x16_t>(vres, quant_rescale, scale, new_offset);
- // Store result
- wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, wrapper::vgetlow(requantized_dst));
- wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, wrapper::vgethigh(requantized_dst));
- }
- else
- {
- const float32x4_t scale_v = vdupq_n_f32(scale);
- // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero
- vres1 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v));
- vres2 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v));
- vres3 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v));
- vres4 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v));
-
- const q8x8_t res1 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2)));
- const q8x8_t res2 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4)));
- // Store result
- wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, res1);
- wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, res2);
- }
- }
- else
- {
- q8x16_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_128_tag{});
-
- for(int y = pool_start_y; y < pool_end_y; ++y)
- {
- for(int x = pool_start_x; x < pool_end_x; ++x)
- {
- const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z())) + x_off);
- vres = wrapper::vmax(vres, data);
- }
- }
-
- // Store result
- wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, (src_qinfo != dst_qinfo) ? vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(vres), wrapper::vgethigh(vres),
- requant_qinfo) :
- vres);
- }
- }
-
- if(pool_info.pool_type == PoolingType::MAX)
- {
- for(; x_off <= (window_end_x - window_half_step_x); x_off += window_half_step_x)
- {
- q8x8_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_64_tag{});
- for(int y = pool_start_y; y < pool_end_y; ++y)
- {
- for(int x = pool_start_x; x < pool_end_x; ++x)
- {
- const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z())) + x_off);
- vres = wrapper::vmax(vres, data);
- }
- }
-
- // Store result
- wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off,
- (src_qinfo != dst_qinfo) ? vrequantize_pooling<q8x8_t>(vres, requant_qinfo) : vres);
- }
- }
-
- // Left-overs loop
- for(; x_off < window_end_x; ++x_off)
- {
- if(pool_info.pool_type != PoolingType::MAX)
- {
- q32_t res = static_cast<q32_t>(0.f);
-
- // Calculate scale
- const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
- pool_stride_y);
-
- // Perform pooling
- for(int y = pool_start_y; y < pool_end_y; ++y)
- {
- for(int x = pool_start_x; x < pool_end_x; ++x)
- {
- const T data = *(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z())) + x_off);
- res += data;
- }
- }
-
- if(src_qinfo != dst_qinfo)
- {
- const float res_f = static_cast<float>(res);
- const float new_scale = quant_rescale / scale;
- const auto requantized_dst = quantize<T>(res_f, UniformQuantizationInfo(new_scale, new_offset));
-
- // Store result
- *(reinterpret_cast<T *>(out.ptr()) + x_off) = requantized_dst;
- }
- else
- {
- // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero
- res = static_cast<T>(0.5f + static_cast<float>(res) * scale);
-
- // Store result
- *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
- }
- }
- else
- {
- T res = std::numeric_limits<T>::min();
-
- for(int y = pool_start_y; y < pool_end_y; ++y)
- {
- for(int x = pool_start_x; x < pool_end_x; ++x)
- {
- const T data = *(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
- (src->info()->strides_in_bytes().z())) + x_off);
- res = std::max(res, data);
- }
- }
-
- // Store result
- if(src_qinfo != dst_qinfo)
- {
- const float res_f = static_cast<float>(res);
- *(reinterpret_cast<T *>(out.ptr()) + x_off) = quantize<T>(res_f, requant_qinfo);
- }
- else
- {
- *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
- }
- }
- }
-
- },
- in, out);
-}
-
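The single-step requantization above folds the dst/src rescale and the 1/area divisor into one scale, with new_offset chosen so the raw sum of quantized inputs maps straight to the output quantization. A scalar consistency check (std::lround stands in for vquantize's add-0.5-and-truncate rounding):

#include <cassert>
#include <cmath>

// One output of a 2x2 average pool over qasymm8 data: the fused path must
// match dequantize -> average -> requantize.
int main()
{
    const float src_scale = 0.5f;  const int src_offset = 10;
    const float dst_scale = 0.25f; const int dst_offset = 20;
    const int   q[4]      = { 14, 18, 22, 26 }; // quantized inputs
    const int   area      = 4;

    // Reference path: dequantize, average, requantize.
    float avg = 0.f;
    for(int v : q) avg += (v - src_offset) * src_scale;
    avg /= area;
    const long ref = std::lround(avg / dst_scale) + dst_offset;

    // Fused path, as in vrequantize_pooling_with_scale():
    // new_scale folds both the quantization rescale and the 1/area divisor.
    const float quant_rescale = dst_scale / src_scale;
    const float new_scale     = quant_rescale / (1.f / area);
    const int   new_offset    = dst_offset - static_cast<int>(src_offset / quant_rescale);
    long sum = 0;
    for(int v : q) sum += v;
    const long fused = std::lround(sum / new_scale) + new_offset;

    assert(fused == ref && fused == 40);
    return 0;
}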
-#if defined(ENABLE_NCHW_KERNELS)
-template <typename T, typename TVec>
-inline void scale_vector_q16x8(bool exclude_padding, TVec &v, const Coordinates &id, int id_offset, int step,
- const int pool_size, const int upper_bound_w, const int upper_bound_h,
- const int pad_x, const int pad_y, const int stride_x, const int stride_y)
-{
- int start_x = (id.x() + id_offset) * stride_x - pad_x;
- int start_y = id.y() * stride_y - pad_y;
- const int end_y = std::min(start_y + pool_size, upper_bound_h);
- if(exclude_padding)
- {
- start_y = std::max(0, start_y);
- }
-
- std::array<T, 8> elems =
- {
- {
- wrapper::vgetlane(v, 0),
- wrapper::vgetlane(v, 1),
- wrapper::vgetlane(v, 2),
- wrapper::vgetlane(v, 3),
- wrapper::vgetlane(v, 4),
- wrapper::vgetlane(v, 5),
- wrapper::vgetlane(v, 6),
- wrapper::vgetlane(v, 7),
- }
- };
-
- for(auto &el : elems)
- {
- int c_start_x = start_x;
- const int end_x = std::min(c_start_x + pool_size, upper_bound_w);
- if(exclude_padding)
- {
- c_start_x = std::max(0, c_start_x);
- }
- float scale = 1.f / ((end_y - start_y) * (end_x - c_start_x));
- el *= scale;
- start_x += step * stride_x;
- }
-
- v = wrapper::vsetlane(elems[0], v, 0);
- v = wrapper::vsetlane(elems[1], v, 1);
- v = wrapper::vsetlane(elems[2], v, 2);
- v = wrapper::vsetlane(elems[3], v, 3);
- v = wrapper::vsetlane(elems[4], v, 4);
- v = wrapper::vsetlane(elems[5], v, 5);
- v = wrapper::vsetlane(elems[6], v, 6);
- v = wrapper::vsetlane(elems[7], v, 7);
-}
-
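scale_vector_q16x8() above exists because, in the quantized NCHW path, eight neighbouring outputs are scaled at once and outputs near the left/right border cover clipped windows, so every lane needs its own 1/area factor. Scalar model of the per-lane walk, assuming exclude_padding (hypothetical helper, not the real signature):

#include <algorithm>
#include <array>

// Lane i corresponds to output column (x0 + i*step); each lane's window may be
// clipped differently against the input width, so each gets its own scale.
std::array<float, 8> lane_scales(int x0, int step, int stride_x, int pad_x,
                                 int pool_size, int upper_bound_w, int rows)
{
    std::array<float, 8> scales{};
    int start_x = x0 * stride_x - pad_x;
    for(auto &s : scales)
    {
        const int end_x   = std::min(start_x + pool_size, upper_bound_w);
        const int clamped = std::max(0, start_x);       // exclude_padding clamp
        s = 1.f / (rows * (end_x - clamped));           // rows = clipped vertical extent
        start_x += step * stride_x;                     // advance to the next lane's window
    }
    return scales;
}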
-template <typename T>
-void pooling2_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
-{
- ARM_COMPUTE_UNUSED(dst1);
- Iterator in(src, window_src);
- Iterator out(dst0, window);
-
- /** SIMD vector types */
- using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
- using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type;
- using q8x8x2_t = typename std::conditional<std::is_same<T, uint8_t>::value, uint8x8x2_t, int8x8x2_t>::type;
- using q16_t = typename wrapper::traits::promote_t<T>;
- using q16x4_t = typename wrapper::traits::neon_vector<q16_t, 4>::type;
- using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
- using q16x8x2_t = typename wrapper::traits::neon_vector<q16_t, 16>::type;
-
- constexpr int pool_size = 2;
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- const int pool_pad_right = pool_info.pad_stride_info.pad_right();
- const int pool_pad_top = pool_info.pad_stride_info.pad_top();
- const int pool_pad_left = pool_info.pad_stride_info.pad_left();
- const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
- std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
- const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right);
- const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
-
- const T *const src_top_ptr = reinterpret_cast<const T *>(src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))));
- const T *const src_bottom_ptr = reinterpret_cast<const T *>(src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)));
-
- const int scale_step_x = (pool_stride_x == 1) ? 2 : 1;
-
- const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform();
- const UniformQuantizationInfo dst_qinfo = dst0->info()->quantization_info().uniform();
- const bool have_different_qinfo = src_qinfo != dst_qinfo;
-
- const float requant_scale = dst_qinfo.scale / src_qinfo.scale;
- const int32_t requant_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale);
- const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset);
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto top_data = wrapper::vloadq(src_top_ptr + in.offset());
- const auto bottom_data = wrapper::vloadq(src_bottom_ptr + in.offset());
- q8x8_t lower_res = {};
- q8x8_t upper_res = {};
-
- if(pool_info.pool_type != PoolingType::MAX)
- {
- const q16x8x2_t top_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data)) } };
- const q16x8x2_t bottom_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data)) } };
-
- // Add rows
- const q16x8x2_t vrsum =
- {
- {
- wrapper::vadd(top_data_q16.val[0], bottom_data_q16.val[0]),
- wrapper::vadd(top_data_q16.val[1], bottom_data_q16.val[1]),
- }
- };
-
- // Pair-wise add row data
- const q16x4_t vpsum_1 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[0]), wrapper::vgethigh(vrsum.val[0]));
- const q16x4_t vpsum_2 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[1]), wrapper::vgethigh(vrsum.val[1]));
-
- q16x8_t res_lower = wrapper::vcombine(vpsum_1, vpsum_2);
-
- // Scale lower result
- scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, res_lower, id, 0, scale_step_x,
- pool_size, upper_bound_w, upper_bound_h,
- pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
- lower_res = wrapper::vmovn(res_lower);
-
- // Compute upper result for stride_x == 1
- if(pool_stride_x == 1)
- {
- // Shifted row sum
- const q16x8x2_t vrsum_shifted =
- {
- {
- wrapper::vext_1(vrsum.val[0], vrsum.val[1]),
- wrapper::vext_1(vrsum.val[1], vrsum.val[1])
- }
- };
-
- // Pair-wise add shifted row
- q16x8_t res_upper = wrapper::vcombine(
- wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[0]), wrapper::vgethigh(vrsum_shifted.val[0])),
- wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[1]), wrapper::vgethigh(vrsum_shifted.val[1])));
-
- // Scale upper result
- scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, res_upper, id, 1, 2,
- pool_size, upper_bound_w, upper_bound_h,
- pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
- upper_res = wrapper::vmovn(res_upper);
- }
- }
- else
- {
- const q8x16_t max_data = wrapper::vmax(top_data, bottom_data);
- lower_res = wrapper::vpmax(wrapper::vgetlow(max_data), wrapper::vgethigh(max_data));
- if(pool_stride_x == 1)
- {
- const q8x16_t max_data_shifted = wrapper::vext_1(max_data, max_data);
- upper_res = wrapper::vpmax(wrapper::vgetlow(max_data_shifted), wrapper::vgethigh(max_data_shifted));
- }
- }
-
- if(have_different_qinfo)
- {
- const auto requantized_dst = vrequantize_pooling<q8x8_t, q8x16_t>(lower_res, upper_res, requant_qinfo);
- lower_res = wrapper::vgetlow(requantized_dst);
- upper_res = wrapper::vgethigh(requantized_dst);
- }
-
- // Store result
- if(pool_stride_x == 1)
- {
- const q8x8x2_t res = { { lower_res, upper_res } };
- wrapper::vstore(reinterpret_cast<T *>(out.ptr()), res);
- }
- else
- {
- wrapper::vstore(reinterpret_cast<T *>(out.ptr()), lower_res);
- }
- },
- in, out);
-}
-
-template <typename T>
-void pooling3_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
-{
- ARM_COMPUTE_UNUSED(dst1);
- Iterator in(src, window_src);
- Iterator out(dst0, window);
-
- /** SIMD vector types */
- using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
- using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type;
- using q8x8x2_t = typename std::conditional<std::is_same<T, uint8_t>::value, uint8x8x2_t, int8x8x2_t>::type;
- using q16_t = typename wrapper::traits::promote_t<T>;
- using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
- using q16x8x2_t = typename wrapper::traits::neon_vector<q16_t, 16>::type;
-
- constexpr int pool_size = 3;
- const int pool_pad_right = pool_info.pad_stride_info.pad_right();
- const int pool_pad_top = pool_info.pad_stride_info.pad_top();
- const int pool_pad_left = pool_info.pad_stride_info.pad_left();
- const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
- const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right);
- const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
-
- const UniformQuantizationInfo &src_qinfo = src->info()->quantization_info().uniform();
- const UniformQuantizationInfo &dst_qinfo = dst0->info()->quantization_info().uniform();
-
- const float requant_scale = dst_qinfo.scale / src_qinfo.scale;
- const int32_t requant_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale);
- const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset);
-
- const T *const src_top_ptr = reinterpret_cast<const T *>(src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))));
- const T *const src_middle_ptr = reinterpret_cast<const T *>(src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)));
- const T *const src_bottom_ptr = reinterpret_cast<const T *>(src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2)));
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto top_data = wrapper::vloadq(src_top_ptr + in.offset());
- const auto middle_data = wrapper::vloadq(src_middle_ptr + in.offset());
- const auto bottom_data = wrapper::vloadq(src_bottom_ptr + in.offset());
- q8x8_t fres = {};
- q8x16_t fqres = {};
-
- if(pool_info.pool_type == PoolingType::AVG)
- {
-            // Widen the 8-bit data to 16 bits (u16 for QASYMM8, s16 for QASYMM8_SIGNED)
- const q16x8x2_t top_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data)) } };
- const q16x8x2_t middle_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(middle_data)), wrapper::vmovl(wrapper::vgethigh(middle_data)) } };
- const q16x8x2_t bottom_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data)) } };
-
- // Calculate row sums
- const q16x8x2_t vrsum =
- {
- {
- wrapper::vadd(wrapper::vadd(top_data_q16.val[0], bottom_data_q16.val[0]), middle_data_q16.val[0]),
- wrapper::vadd(wrapper::vadd(top_data_q16.val[1], bottom_data_q16.val[1]), middle_data_q16.val[1]),
- }
- };
- const q16x8x2_t vrsum_shifted_1 =
- {
- {
- wrapper::vext_1(vrsum.val[0], vrsum.val[1]),
- wrapper::vext_1(vrsum.val[1], vrsum.val[1])
- }
- };
- const q16x8x2_t vrsum_shifted_2 =
- {
- {
- wrapper::vext_2(vrsum.val[0], vrsum.val[1]),
- wrapper::vext_2(vrsum.val[1], vrsum.val[1])
- }
- };
- // Calculate final sum
- q16x8x2_t final_sum =
- {
- {
- wrapper::vadd(wrapper::vadd(vrsum.val[0], vrsum_shifted_1.val[0]), vrsum_shifted_2.val[0]),
- wrapper::vadd(wrapper::vadd(vrsum.val[1], vrsum_shifted_1.val[1]), vrsum_shifted_2.val[1]),
- }
- };
- if(pool_stride_x == 2)
- {
- q16x8_t res =
- {
- wrapper::vgetlane(final_sum.val[0], 0),
- wrapper::vgetlane(final_sum.val[0], 2),
- wrapper::vgetlane(final_sum.val[0], 4),
- wrapper::vgetlane(final_sum.val[0], 6),
- wrapper::vgetlane(final_sum.val[1], 0),
- wrapper::vgetlane(final_sum.val[1], 2),
- wrapper::vgetlane(final_sum.val[1], 4),
- wrapper::vgetlane(final_sum.val[1], 6),
- };
-
- scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, res, id, 0, 1,
- pool_size, upper_bound_w, upper_bound_h,
- pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
- fres = wrapper::vmovn(res);
- }
- else
- {
- // Scale lower result
- scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, final_sum.val[0], id, 0, 1,
- pool_size, upper_bound_w, upper_bound_h,
- pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
-                // Scale upper result
- scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, final_sum.val[1], id, 8, 1,
- pool_size, upper_bound_w, upper_bound_h,
- pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
- fqres = wrapper::vcombine(wrapper::vmovn(final_sum.val[0]), wrapper::vmovn(final_sum.val[1]));
- }
- }
- else
- {
- const q8x16_t max_data = wrapper::vmax(wrapper::vmax(top_data, bottom_data), middle_data);
- const q8x16_t max_data_shift1 = wrapper::vext_1(max_data, max_data);
- const q8x16_t max_data_shift2 = wrapper::vext_2(max_data, max_data);
- const q8x16_t final_max = wrapper::vmax(wrapper::vmax(max_data, max_data_shift1), max_data_shift2);
-
- if(pool_stride_x == 2)
- {
- const q8x8x2_t table = { { wrapper::vgetlow(final_max), wrapper::vgethigh(final_max) } };
- static const q8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
- fres = wrapper::vtbl(table, lookup_val);
- }
- else
- {
- fqres = final_max;
- }
- }
-
- // Store result
- if(pool_stride_x == 1)
- {
- if(src_qinfo != dst_qinfo)
- {
- fqres = vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(fqres), wrapper::vgethigh(fqres), requant_qinfo);
- }
- wrapper::vstore(reinterpret_cast<T *>(out.ptr()), fqres);
- }
- else
- {
- if(src_qinfo != dst_qinfo)
- {
- fres = vrequantize_pooling<q8x8_t>(fres, requant_qinfo);
- }
- wrapper::vstore(reinterpret_cast<T *>(out.ptr()), fres);
- }
- },
- in, out);
-}
-
-template <typename T>
-void poolingMxN_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
-{
- ARM_COMPUTE_UNUSED(dst1);
- Iterator in(src, window_src);
- Iterator out(dst0, window);
-
- /** SIMD vector types */
- using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
- using q16_t = typename wrapper::traits::promote_t<T>;
- using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
- using q32_t = typename wrapper::traits::promote_t<q16_t>;
- using q32x4_t = typename wrapper::traits::neon_vector<q32_t, 4>::type;
-
- const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width;
- const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.height;
- const int pool_pad_right = pool_info.pad_stride_info.pad_right();
- const int pool_pad_top = pool_info.pad_stride_info.pad_top();
- const int pool_pad_left = pool_info.pad_stride_info.pad_left();
- const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
- const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right);
- const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
-
- const UniformQuantizationInfo &src_qinfo = src->info()->quantization_info().uniform();
- const UniformQuantizationInfo &dst_qinfo = dst0->info()->quantization_info().uniform();
-
- execute_window_loop(window, [&](const Coordinates & id)
- {
- T res = std::numeric_limits<T>::min();
-
- if(pool_info.pool_type != PoolingType::MAX)
- {
- q32x4_t vres = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
- q32_t sres = 0;
-
- // Calculate scale
-            const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h,
-                                                    pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
-
- // Perform pooling
- for(int y = 0; y < pool_size_y; ++y)
- {
- int x = 0;
- for(; x <= (pool_size_x - 8); x += 8)
- {
-                    const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x()) +
-                                                                                   (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y())));
-
- const q16x8_t data_q16 = wrapper::vmovl(data);
- vres = wrapper::vadd(vres, wrapper::vaddl(wrapper::vgethigh(data_q16), wrapper::vgetlow(data_q16)));
- }
-
-                // Leftover scalar loop
- for(; x < pool_size_x; ++x)
- {
-                    T data = *(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x()) +
-                                                           (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y())));
- sres += data;
- }
- }
-
- // Reduction
- const auto tmp = wrapper::vpadd(wrapper::vgethigh(vres), wrapper::vgetlow(vres));
- sres += wrapper::vgetlane(tmp, 0) + wrapper::vgetlane(tmp, 1);
-
-            // Apply the averaging scale
- res = static_cast<T>(support::cpp11::round(sres * scale));
- }
- else
- {
- q8x8_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_64_tag{});
-
- for(int y = 0; y < pool_size_y; ++y)
- {
- int x = 0;
- for(; x <= (pool_size_x - 8); x += 8)
- {
-                    const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x()) +
-                                                                                   (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y())));
- vres = wrapper::vmax(vres, data);
- }
-                // Leftover scalar loop
- for(; x < pool_size_x; ++x)
- {
-                    const T data = *(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x()) +
-                                                                 (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y())));
- res = std::max(res, data);
- }
- }
-
- // Reduce max
- vres = wrapper::vpmax(vres, vres);
- vres = wrapper::vpmax(vres, vres);
- vres = wrapper::vpmax(vres, vres);
-
- // Get max value
- res = std::max(res, wrapper::vgetlane(vres, 0));
- }
- // Store result
- res = (src_qinfo != dst_qinfo) ? Qasymm8QuantizationHelper<T>::quantize(Qasymm8QuantizationHelper<T>::dequantize(res, src_qinfo), dst_qinfo) : res;
- *(reinterpret_cast<T *>(out.ptr())) = res;
- },
- in, out);
-}
-#endif /* defined(ENABLE_NCHW_KERNELS) */
-} // namespace cpu
-} // namespace arm_compute
-
-#endif // SRC_CORE_NEON_KERNELS_QUANTIZED_H
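The quantized pooling kernels above fold the source and destination quantization parameters into a single (requant_scale, requant_offset) pair before entering the window loop. A minimal scalar sketch of the per-lane effect of vrequantize_pooling, assuming the usual q = round(x / scale) + offset convention (requantize_one is an illustrative name, not a library function):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Map a value quantized with (src_scale, src_offset) onto the
    // (dst_scale, dst_offset) grid in a single affine step.
    uint8_t requantize_one(uint8_t v, float src_scale, int32_t src_offset,
                           float dst_scale, int32_t dst_offset)
    {
        const float   requant_scale  = dst_scale / src_scale;
        const int32_t requant_offset = dst_offset - static_cast<int32_t>(static_cast<float>(src_offset) / requant_scale);
        // Equivalent to dequantizing with the source parameters and
        // re-quantizing with the destination parameters, folded into one step.
        const int32_t q = static_cast<int32_t>(std::lround(static_cast<float>(v) / requant_scale)) + requant_offset;
        return static_cast<uint8_t>(std::clamp(q, 0, 255));
    }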
diff --git a/src/core/cpu/kernels/scale/neon/fp16.cpp b/src/core/cpu/kernels/scale/neon/fp16.cpp
deleted file mode 100644
index 0ad66cab1c..0000000000
--- a/src/core/cpu/kernels/scale/neon/fp16.cpp
+++ /dev/null
@@ -1,174 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensorPack.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/NEON/NEMath.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/ScaleHelpers.h"
-#include "src/core/utils/ScaleUtils.h"
-#include "support/Rounding.h"
-
-#include <arm_neon.h>
-#include <cmath>
-#include <cstddef>
-
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
-
-namespace arm_compute
-{
-namespace
-{
-void fp16_neon_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets,
- float sampling_offset, bool align_corners, const Window &window)
-{
- const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
- const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom;
- const size_t in_stride_wc = in_stride_w * in_stride_c;
- const size_t in_dim_h = src->info()->dimension(2);
-
- // Compute the ratio between source height and destination height
- const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners);
- const auto window_start_x = static_cast<int32_t>(window.x().start());
- const auto window_end_x = static_cast<int32_t>(window.x().end());
- const int window_step_x = 8;
-
- Window win(window);
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- Iterator out(dst, win);
-
- const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
- const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3];
-
- execute_window_loop(win, [&](const Coordinates & id)
- {
- const int32_t offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
- const auto in_hi = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr));
- const int offset_row = in_hi * in_stride_wc;
- int32_t x = window_start_x;
- const float16_t *in_ptr = reinterpret_cast<const float16_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
-
- for(; x <= window_end_x - window_step_x; x += window_step_x)
- {
- wrapper::vstore(reinterpret_cast<float16_t *>(out.ptr()) + x,
- wrapper::vloadq(in_ptr + offset + offset_row + x));
- }
- for(; x < window_end_x; ++x)
- {
- *(reinterpret_cast<float16_t *>(out.ptr()) + x) = *(in_ptr + offset + offset_row + x);
- }
- },
- out);
-}
-
-void fp16_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
-{
- // Compute the ratio between source height and destination height
- const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
-
- Iterator out(dst, window);
- const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
- const int in_dim_w = src->info()->dimension(1);
- const int in_dim_h = src->info()->dimension(2);
- const int in_stride_wc = in_stride_c * (in_dim_w + src->info()->padding().top + src->info()->padding().bottom);
-
- // Don't increment in Y and Z direction for the input tensor
- // A pointer to the start of this plane is needed as base for the precomputed offsets
- Window win_in(window);
- win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
- win_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
- Iterator in(src, win_in);
-
- if(border_mode == BorderMode::CONSTANT)
- {
-        using ConstType = half; // T is fixed to float16_t here, so the template's std::conditional always selects half
-
- const float16_t const_border_value = static_cast<float16_t>(constant_border_value.get<ConstType>());
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
- const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
- const float16_t *in_ptr = reinterpret_cast<const float16_t *>(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc;
-
- const auto a00 = (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value;
- const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) ? *(in_ptr + in_stride_c) : const_border_value;
- const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_wc) : const_border_value;
- const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_c + in_stride_wc) : const_border_value;
-
- *reinterpret_cast<float16_t *>(out.ptr()) = static_cast<float16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
- },
- in, out);
- }
- else if(border_mode == BorderMode::REPLICATE)
- {
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
- const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
-
- auto clamped_w = utility::clamp<int>(offset, 0, in_dim_w - 1);
- auto clamped_w1 = utility::clamp<int>(offset + 1, 0, in_dim_w - 1);
- auto clamped_h = utility::clamp<int>(in_hi, 0, in_dim_h - 1);
- auto clamped_h1 = utility::clamp<int>(in_hi + 1, 0, in_dim_h - 1);
-
- const auto a00 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc);
- const auto a01 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h * in_stride_wc);
- const auto a10 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h1 * in_stride_wc);
- const auto a11 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h1 * in_stride_wc);
-
- *reinterpret_cast<float16_t *>(out.ptr()) = static_cast<float16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
- },
- in, out);
- }
- else
- {
- ARM_COMPUTE_ERROR("Not implemented");
- }
-}
-} // namespace
-namespace cpu
-{
-void fp16_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
-{
- if(policy == InterpolationPolicy::BILINEAR)
- {
- fp16_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window);
- }
- else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
- {
- fp16_neon_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window);
- }
-}
-} // namespace cpu
-} // namespace arm_compute
-
-#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ \ No newline at end of file
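In the nearest-neighbour kernel above, the horizontal source index comes from the precomputed offsets tensor while the source row is derived inline from the resize ratio hr. A standalone sketch of that row selection, assuming calculate_resize_ratio divides the source extent by the destination extent and switches to (dim - 1) extents when align_corners is set (nearest_src_row is an illustrative name):

    #include <cmath>

    // Pick the source row that output row dst_y samples from.
    int nearest_src_row(int dst_y, int src_h, int dst_h, float sampling_offset, bool align_corners)
    {
        const float hr   = align_corners ? static_cast<float>(src_h - 1) / static_cast<float>(dst_h - 1)
                                         : static_cast<float>(src_h) / static_cast<float>(dst_h);
        const float in_y = (static_cast<float>(dst_y) + sampling_offset) * hr;
        // With align_corners the index is rounded half away from zero; otherwise it is floored.
        return align_corners ? static_cast<int>(std::lround(in_y)) : static_cast<int>(std::floor(in_y));
    }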
diff --git a/src/core/cpu/kernels/scale/neon/integer.cpp b/src/core/cpu/kernels/scale/neon/integer.cpp
deleted file mode 100644
index a2359aac94..0000000000
--- a/src/core/cpu/kernels/scale/neon/integer.cpp
+++ /dev/null
@@ -1,293 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensorPack.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/NEON/NEMath.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/ScaleHelpers.h"
-#include "src/core/utils/ScaleUtils.h"
-#include "support/Rounding.h"
-
-#include <arm_neon.h>
-#include <cmath>
-#include <cstddef>
-
-namespace arm_compute
-{
-namespace
-{
-void u8_neon_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets,
- float sampling_offset, bool align_corners, const Window &window)
-{
- const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
- const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom;
- const size_t in_stride_wc = in_stride_w * in_stride_c;
- const size_t in_dim_h = src->info()->dimension(2);
-
- // Compute the ratio between source height and destination height
- const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners);
- const auto window_start_x = static_cast<int32_t>(window.x().start());
- const auto window_end_x = static_cast<int32_t>(window.x().end());
- const int window_step_x = 16;
-
- Window win(window);
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- Iterator out(dst, win);
-
- const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
- const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3];
-
- execute_window_loop(win, [&](const Coordinates & id)
- {
- const int32_t offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
- const auto in_hi = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr));
- const int offset_row = in_hi * in_stride_wc;
- int32_t x = window_start_x;
- const uint8_t *in_ptr = reinterpret_cast<const uint8_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
-
- for(; x <= window_end_x - window_step_x; x += window_step_x)
- {
- wrapper::vstore(reinterpret_cast<uint8_t *>(out.ptr()) + x,
- wrapper::vloadq(in_ptr + offset + offset_row + x));
- }
- for(; x < window_end_x; ++x)
- {
- *(reinterpret_cast<uint8_t *>(out.ptr()) + x) = *(in_ptr + offset + offset_row + x);
- }
- },
- out);
-}
-
-void u8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
-{
- // Compute the ratio between source height and destination height
- const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
-
- Iterator out(dst, window);
- const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
- const int in_dim_w = src->info()->dimension(1);
- const int in_dim_h = src->info()->dimension(2);
- const int in_stride_wc = in_stride_c * (in_dim_w + src->info()->padding().top + src->info()->padding().bottom);
-
- // Don't increment in Y and Z direction for the input tensor
- // A pointer to the start of this plane is needed as base for the precomputed offsets
- Window win_in(window);
- win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
- win_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
- Iterator in(src, win_in);
-
- if(border_mode == BorderMode::CONSTANT)
- {
- const uint8_t const_border_value = static_cast<uint8_t>(constant_border_value.get<uint8_t>());
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
- const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
- const uint8_t *in_ptr = reinterpret_cast<const uint8_t *>(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc;
-
- const auto a00 = (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value;
- const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) ? *(in_ptr + in_stride_c) : const_border_value;
- const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_wc) : const_border_value;
- const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_c + in_stride_wc) : const_border_value;
-
- *reinterpret_cast<uint8_t *>(out.ptr()) = static_cast<uint8_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
- },
- in, out);
- }
- else if(border_mode == BorderMode::REPLICATE)
- {
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
- const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
-
- auto clamped_w = utility::clamp<int>(offset, 0, in_dim_w - 1);
- auto clamped_w1 = utility::clamp<int>(offset + 1, 0, in_dim_w - 1);
- auto clamped_h = utility::clamp<int>(in_hi, 0, in_dim_h - 1);
- auto clamped_h1 = utility::clamp<int>(in_hi + 1, 0, in_dim_h - 1);
-
- const auto a00 = *(reinterpret_cast<const uint8_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc);
- const auto a01 = *(reinterpret_cast<const uint8_t *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h * in_stride_wc);
- const auto a10 = *(reinterpret_cast<const uint8_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h1 * in_stride_wc);
- const auto a11 = *(reinterpret_cast<const uint8_t *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h1 * in_stride_wc);
-
- *reinterpret_cast<uint8_t *>(out.ptr()) = static_cast<uint8_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
- },
- in, out);
- }
- else
- {
- ARM_COMPUTE_ERROR("Not implemented");
- }
-}
-
-void s16_neon_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets,
- float sampling_offset, bool align_corners, const Window &window)
-{
- const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
- const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom;
- const size_t in_stride_wc = in_stride_w * in_stride_c;
- const size_t in_dim_h = src->info()->dimension(2);
-
- // Compute the ratio between source height and destination height
- const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners);
- const auto window_start_x = static_cast<int32_t>(window.x().start());
- const auto window_end_x = static_cast<int32_t>(window.x().end());
- const int window_step_x = 8;
-
- Window win(window);
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- Iterator out(dst, win);
-
- const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
- const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3];
-
- execute_window_loop(win, [&](const Coordinates & id)
- {
- const int32_t offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
- const auto in_hi = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr));
- const int offset_row = in_hi * in_stride_wc;
- int32_t x = window_start_x;
- const int16_t *in_ptr = reinterpret_cast<const int16_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
-
- for(; x <= window_end_x - window_step_x; x += window_step_x)
- {
- wrapper::vstore(reinterpret_cast<int16_t *>(out.ptr()) + x,
- wrapper::vloadq(in_ptr + offset + offset_row + x));
- }
- for(; x < window_end_x; ++x)
- {
- *(reinterpret_cast<int16_t *>(out.ptr()) + x) = *(in_ptr + offset + offset_row + x);
- }
- },
- out);
-}
-
-void s16_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
-{
- // Compute the ratio between source height and destination height
- const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
-
- Iterator out(dst, window);
- const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
- const int in_dim_w = src->info()->dimension(1);
- const int in_dim_h = src->info()->dimension(2);
- const int in_stride_wc = in_stride_c * (in_dim_w + src->info()->padding().top + src->info()->padding().bottom);
-
- // Don't increment in Y and Z direction for the input tensor
- // A pointer to the start of this plane is needed as base for the precomputed offsets
- Window win_in(window);
- win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
- win_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
- Iterator in(src, win_in);
-
- if(border_mode == BorderMode::CONSTANT)
- {
- const int16_t const_border_value = static_cast<int16_t>(constant_border_value.get<int16_t>());
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
- const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
- const int16_t *in_ptr = reinterpret_cast<const int16_t *>(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc;
-
- const auto a00 = (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value;
- const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) ? *(in_ptr + in_stride_c) : const_border_value;
- const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_wc) : const_border_value;
- const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_c + in_stride_wc) : const_border_value;
-
- *reinterpret_cast<int16_t *>(out.ptr()) = static_cast<int16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
- },
- in, out);
- }
- else if(border_mode == BorderMode::REPLICATE)
- {
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
- const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
-
- auto clamped_w = utility::clamp<int>(offset, 0, in_dim_w - 1);
- auto clamped_w1 = utility::clamp<int>(offset + 1, 0, in_dim_w - 1);
- auto clamped_h = utility::clamp<int>(in_hi, 0, in_dim_h - 1);
- auto clamped_h1 = utility::clamp<int>(in_hi + 1, 0, in_dim_h - 1);
-
- const auto a00 = *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc);
- const auto a01 = *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h * in_stride_wc);
- const auto a10 = *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h1 * in_stride_wc);
- const auto a11 = *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h1 * in_stride_wc);
-
- *reinterpret_cast<int16_t *>(out.ptr()) = static_cast<int16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
- },
- in, out);
- }
- else
- {
- ARM_COMPUTE_ERROR("Not implemented");
- }
-}
-} // namespace
-namespace cpu
-{
-void u8_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
-{
- if(policy == InterpolationPolicy::BILINEAR)
- {
- u8_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window);
- }
- else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
- {
- u8_neon_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window);
- }
-}
-
-void s16_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
-{
- if(policy == InterpolationPolicy::BILINEAR)
- {
- s16_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window);
- }
- else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
- {
- s16_neon_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window);
- }
-}
-} // namespace cpu
-} // namespace arm_compute \ No newline at end of file
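Every bilinear branch in this file funnels its 2x2 neighbourhood and the fractional offsets dx and dy into scale_helpers::delta_bilinear. A minimal model of that blend, shown for reference (the real helper lives in src/core/helpers/ScaleHelpers.h and may differ in detail):

    // Bilinear blend: a00 top-left, a01 top-right, a10 bottom-left, a11 bottom-right;
    // dx and dy are the fractional distances from the top-left sample, in [0, 1].
    inline float delta_bilinear_sketch(float a00, float a01, float a10, float a11, float dx, float dy)
    {
        const float w00 = (1.f - dx) * (1.f - dy);
        const float w01 = dx * (1.f - dy);
        const float w10 = (1.f - dx) * dy;
        const float w11 = dx * dy;
        return a00 * w00 + a01 * w01 + a10 * w10 + a11 * w11;
    }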
diff --git a/src/core/cpu/kernels/scale/neon/list.h b/src/core/cpu/kernels/scale/neon/list.h
deleted file mode 100644
index c91242f5b2..0000000000
--- a/src/core/cpu/kernels/scale/neon/list.h
+++ /dev/null
@@ -1,185 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_CORE_NEON_KERNELS_SCALE_LIST_H
-#define SRC_CORE_NEON_KERNELS_SCALE_LIST_H
-
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensorPack.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/NEON/NEMath.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/ScaleHelpers.h"
-#include "src/core/utils/ScaleUtils.h"
-#include "support/Rounding.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-#define DECLARE_SCALE_KERNEL(func_name) \
- void func_name(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, \
- InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, \
- bool align_corners, const Window &window)
-
-DECLARE_SCALE_KERNEL(qasymm8_neon_scale);
-DECLARE_SCALE_KERNEL(qasymm8_signed_neon_scale);
-
-#undef DECLARE_SCALE_KERNEL
-
-template <typename T>
-void nearest_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, float sampling_offset,
- bool align_corners, const Window &window)
-{
- const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
- const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom;
- const size_t in_stride_wc = in_stride_w * in_stride_c;
- const size_t in_dim_h = src->info()->dimension(2);
-
- // Compute the ratio between source height and destination height
- const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners);
- const auto window_start_x = static_cast<int32_t>(window.x().start());
- const auto window_end_x = static_cast<int32_t>(window.x().end());
- const int window_step_x = 16 / sizeof(T);
-
- Window win(window);
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- Iterator out(dst, win);
-
- const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
- const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3];
-
- execute_window_loop(win, [&](const Coordinates & id)
- {
- const int32_t offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
- const auto in_hi = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr));
- const int offset_row = in_hi * in_stride_wc;
- int32_t x = window_start_x;
- const T *in_ptr = reinterpret_cast<const T *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
-
- for(; x <= window_end_x - window_step_x; x += window_step_x)
- {
- wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x,
- wrapper::vloadq(in_ptr + offset + offset_row + x));
- }
- for(; x < window_end_x; ++x)
- {
- *(reinterpret_cast<T *>(out.ptr()) + x) = *(in_ptr + offset + offset_row + x);
- }
- },
- out);
-}
-
-template <typename T>
-void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
-{
- // Compute the ratio between source height and destination height
- const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
-
- Iterator out(dst, window);
- const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
- const int in_dim_w = src->info()->dimension(1);
- const int in_dim_h = src->info()->dimension(2);
- const int in_stride_wc = in_stride_c * (in_dim_w + src->info()->padding().top + src->info()->padding().bottom);
-
- // Don't increment in Y and Z direction for the input tensor
- // A pointer to the start of this plane is needed as base for the precomputed offsets
- Window win_in(window);
- win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
- win_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
- Iterator in(src, win_in);
-
- if(border_mode == BorderMode::CONSTANT)
- {
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- using ConstType = typename std::conditional<std::is_same<T, float16_t>::value, half, T>::type;
-#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- using ConstType = T;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- const T const_border_value = static_cast<T>(constant_border_value.get<ConstType>());
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
- const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
- const T *in_ptr = reinterpret_cast<const T *>(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc;
-
- const auto a00 = (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value;
- const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) ? *(in_ptr + in_stride_c) : const_border_value;
- const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_wc) : const_border_value;
- const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_c + in_stride_wc) : const_border_value;
-
- *reinterpret_cast<T *>(out.ptr()) = static_cast<T>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
- },
- in, out);
- }
- else if(border_mode == BorderMode::REPLICATE)
- {
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
- const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
-
- auto clamped_w = utility::clamp<int>(offset, 0, in_dim_w - 1);
- auto clamped_w1 = utility::clamp<int>(offset + 1, 0, in_dim_w - 1);
- auto clamped_h = utility::clamp<int>(in_hi, 0, in_dim_h - 1);
- auto clamped_h1 = utility::clamp<int>(in_hi + 1, 0, in_dim_h - 1);
-
- const auto a00 = *(reinterpret_cast<const T *>(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc);
- const auto a01 = *(reinterpret_cast<const T *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h * in_stride_wc);
- const auto a10 = *(reinterpret_cast<const T *>(in.ptr()) + clamped_w * in_stride_c + clamped_h1 * in_stride_wc);
- const auto a11 = *(reinterpret_cast<const T *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h1 * in_stride_wc);
-
- *reinterpret_cast<T *>(out.ptr()) = static_cast<T>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
- },
- in, out);
- }
- else
- {
- ARM_COMPUTE_ERROR("Not implemented");
- }
-}
-
-template <typename T>
-void common_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
-{
- if(policy == InterpolationPolicy::BILINEAR)
- {
- bilinear_neon_scale<T>(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window);
- }
- else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
- {
- nearest_neon_scale<T>(src, dst, offsets, sampling_offset, align_corners, window);
- }
-}
-} // namespace cpu
-} // namespace arm_compute
-
-#endif /* SRC_CORE_NEON_KERNELS_SCALE_LIST_H */
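Data types that need no quantization handling can forward straight to common_neon_scale<T>. As a usage sketch, a hypothetical fp32 entry point matching the DECLARE_SCALE_KERNEL signature would reduce to a single dispatch (the fp32 kernel actually shipped in the tree may be structured differently):

    void fp32_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
                         InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value,
                         float sampling_offset, bool align_corners, const Window &window)
    {
        // common_neon_scale<T> selects the bilinear or nearest template based on policy.
        common_neon_scale<float>(src, dst, offsets, dx, dy, policy, border_mode, constant_border_value,
                                 sampling_offset, align_corners, window);
    }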
diff --git a/src/core/cpu/kernels/scale/neon/qasymm8.cpp b/src/core/cpu/kernels/scale/neon/qasymm8.cpp
deleted file mode 100644
index 90302ce889..0000000000
--- a/src/core/cpu/kernels/scale/neon/qasymm8.cpp
+++ /dev/null
@@ -1,145 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/cpu/kernels/scale/neon/list.h"
-
-namespace arm_compute
-{
-namespace
-{
-void qasymm8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
-{
- // Data layout is NHWC
- // Compute the ratio between source height and destination height
- const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
- Window win_off;
- win_off.set(Window::DimX, Window::Dimension(0, 0, 0));
- win_off.set(Window::DimY, Window::Dimension(0, 0, 0));
-
- // Don't increment in X and Y direction for the input tensor
- // A pointer to the start of this plane is needed as base for the precomputed offsets
- Window win_in(window);
- win_in.set(1, Window::Dimension(0, 0, 0));
- win_in.set(2, Window::Dimension(0, 0, 0));
-
- for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d)
- {
- win_off.set(d, Window::Dimension(0, 0, 0));
- }
-
- Iterator in(src, win_in);
- Iterator out(dst, window);
-
- const int32_t in_dim_w = src->info()->dimension(1);
- const int32_t in_dim_h = src->info()->dimension(2);
- const int32_t stride_w = src->info()->strides_in_bytes()[1];
- const int32_t stride_h = src->info()->strides_in_bytes()[2];
-
- const UniformQuantizationInfo iq_info = src->info()->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform();
-
- if(border_mode == BorderMode::CONSTANT)
- {
- const uint8_t const_border_value = static_cast<uint8_t>(constant_border_value.get<uint8_t>());
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const int32_t index_h = std::floor((id[2] + sampling_offset) * hr - sampling_offset);
- const int32_t index_w = *(reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id[1], id[2]))));
- const auto dx_val = *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[1], id[2]))));
- const auto dy_val = *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[1], id[2]))));
- const auto pixel_row_ptr = reinterpret_cast<const uint8_t *>(in.ptr());
-
- const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h) ?
- (*(pixel_row_ptr + index_w * stride_w + index_h * stride_h)) :
- const_border_value;
- const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h) ?
- (*(pixel_row_ptr + (index_w + 1) * stride_w + index_h * stride_h)) :
- const_border_value;
- const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h && index_h < in_dim_h - 1) ?
- (*(pixel_row_ptr + index_w * stride_w + (index_h + 1) * stride_h)) :
- const_border_value;
- const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h && index_h < in_dim_h - 1) ?
- (*(pixel_row_ptr + (index_w + 1) * stride_w + (index_h + 1) * stride_h)) :
- const_border_value;
-
- const float inp00 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a00, iq_info);
- const float inp01 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a01, iq_info);
- const float inp10 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a10, iq_info);
- const float inp11 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a11, iq_info);
- *reinterpret_cast<uint8_t *>(out.ptr()) = Qasymm8QuantizationHelper<uint8_t>::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info);
- },
- in, out);
- }
- else if(border_mode == BorderMode::REPLICATE)
- {
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const int index_h = std::floor((id[2] + sampling_offset) * hr - sampling_offset);
- const int32_t index_w = *(reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id[1], id[2]))));
- const auto dx_val = *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[1], id[2]))));
- const auto dy_val = *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[1], id[2]))));
- const auto pixel_row_ptr = reinterpret_cast<const uint8_t *>(in.ptr());
-
- auto clamped_w = utility::clamp<int>(index_w, 0, in_dim_w - 1);
- auto clamped_w1 = utility::clamp<int>(index_w + 1, 0, in_dim_w - 1);
- auto clamped_h = utility::clamp<int>(index_h, 0, in_dim_h - 1);
- auto clamped_h1 = utility::clamp<int>(index_h + 1, 0, in_dim_h - 1);
-
- const auto a00 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h * stride_h);
- const auto a01 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h * stride_h);
- const auto a10 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h1 * stride_h);
- const auto a11 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h1 * stride_h);
-
- const float inp00 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a00, iq_info);
- const float inp01 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a01, iq_info);
- const float inp10 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a10, iq_info);
- const float inp11 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a11, iq_info);
- *reinterpret_cast<uint8_t *>(out.ptr()) = Qasymm8QuantizationHelper<uint8_t>::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info);
- },
- in, out);
- }
- else
- {
- ARM_COMPUTE_ERROR("Not implemented");
- }
-}
-} // namespace
-namespace cpu
-{
-void qasymm8_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
-{
- if(policy == InterpolationPolicy::BILINEAR)
- {
- qasymm8_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window);
- }
- else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
- {
- nearest_neon_scale<uint8_t>(src, dst, offsets, sampling_offset, align_corners, window);
- }
-}
-} // namespace cpu
-} // namespace arm_compute \ No newline at end of file
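The QASYMM8 bilinear path differs from the generic template in one step: the four corner samples are dequantized to float before interpolation and the blended value is quantized once with the destination parameters. A scalar sketch of that round trip for a single output pixel (QInfo, dequant and quant are illustrative stand-ins for UniformQuantizationInfo and the Qasymm8QuantizationHelper calls):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    struct QInfo { float scale; int32_t offset; }; // uniform quantization parameters

    static float   dequant(uint8_t v, QInfo q) { return (static_cast<int32_t>(v) - q.offset) * q.scale; }
    static uint8_t quant(float v, QInfo q)
    {
        const int32_t r = static_cast<int32_t>(std::lround(v / q.scale)) + q.offset;
        return static_cast<uint8_t>(std::clamp(r, 0, 255));
    }

    uint8_t qasymm8_bilinear_pixel(uint8_t a00, uint8_t a01, uint8_t a10, uint8_t a11,
                                   float dx, float dy, QInfo iq, QInfo oq)
    {
        // Interpolate in the dequantized (float) domain, then quantize once.
        const float blended = dequant(a00, iq) * (1.f - dx) * (1.f - dy)
                            + dequant(a01, iq) * dx * (1.f - dy)
                            + dequant(a10, iq) * (1.f - dx) * dy
                            + dequant(a11, iq) * dx * dy;
        return quant(blended, oq);
    }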
diff --git a/src/core/cpu/kernels/scale/neon/qasymm8_signed.cpp b/src/core/cpu/kernels/scale/neon/qasymm8_signed.cpp
deleted file mode 100644
index 07d6c6ef03..0000000000
--- a/src/core/cpu/kernels/scale/neon/qasymm8_signed.cpp
+++ /dev/null
@@ -1,145 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/cpu/kernels/scale/neon/list.h"
-
-namespace arm_compute
-{
-namespace
-{
-void qasymm8_signed_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
-{
- // Data layout is NHWC
- // Compute the ratio between source height and destination height
- const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
- Window win_off;
- win_off.set(Window::DimX, Window::Dimension(0, 0, 0));
- win_off.set(Window::DimY, Window::Dimension(0, 0, 0));
-
- // Don't increment in X and Y direction for the input tensor
- // A pointer to the start of this plane is needed as base for the precomputed offsets
- Window win_in(window);
- win_in.set(1, Window::Dimension(0, 0, 0));
- win_in.set(2, Window::Dimension(0, 0, 0));
-
- for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d)
- {
- win_off.set(d, Window::Dimension(0, 0, 0));
- }
-
- Iterator in(src, win_in);
- Iterator out(dst, window);
-
- const int32_t in_dim_w = src->info()->dimension(1);
- const int32_t in_dim_h = src->info()->dimension(2);
- const int32_t stride_w = src->info()->strides_in_bytes()[1];
- const int32_t stride_h = src->info()->strides_in_bytes()[2];
-
- const UniformQuantizationInfo iq_info = src->info()->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform();
-
- if(border_mode == BorderMode::CONSTANT)
- {
- const int8_t const_border_value = static_cast<int8_t>(constant_border_value.get<int8_t>());
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const int32_t index_h = std::floor((id[2] + sampling_offset) * hr - sampling_offset);
- const int32_t index_w = *(reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id[1], id[2]))));
- const auto dx_val = *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[1], id[2]))));
- const auto dy_val = *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[1], id[2]))));
- const auto pixel_row_ptr = reinterpret_cast<const int8_t *>(in.ptr());
-
- const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h) ?
- (*(pixel_row_ptr + index_w * stride_w + index_h * stride_h)) :
- const_border_value;
- const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h) ?
- (*(pixel_row_ptr + (index_w + 1) * stride_w + index_h * stride_h)) :
- const_border_value;
- const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h && index_h < in_dim_h - 1) ?
- (*(pixel_row_ptr + index_w * stride_w + (index_h + 1) * stride_h)) :
- const_border_value;
- const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h && index_h < in_dim_h - 1) ?
- (*(pixel_row_ptr + (index_w + 1) * stride_w + (index_h + 1) * stride_h)) :
- const_border_value;
-
- const float inp00 = Qasymm8QuantizationHelper<int8_t>::dequantize(a00, iq_info);
- const float inp01 = Qasymm8QuantizationHelper<int8_t>::dequantize(a01, iq_info);
- const float inp10 = Qasymm8QuantizationHelper<int8_t>::dequantize(a10, iq_info);
- const float inp11 = Qasymm8QuantizationHelper<int8_t>::dequantize(a11, iq_info);
- *reinterpret_cast<int8_t *>(out.ptr()) = Qasymm8QuantizationHelper<int8_t>::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info);
- },
- in, out);
- }
- else if(border_mode == BorderMode::REPLICATE)
- {
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const int index_h = std::floor((id[2] + sampling_offset) * hr - sampling_offset);
- const int32_t index_w = *(reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id[1], id[2]))));
- const auto dx_val = *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[1], id[2]))));
- const auto dy_val = *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[1], id[2]))));
- const auto pixel_row_ptr = reinterpret_cast<const int8_t *>(in.ptr());
-
- auto clamped_w = utility::clamp<int>(index_w, 0, in_dim_w - 1);
- auto clamped_w1 = utility::clamp<int>(index_w + 1, 0, in_dim_w - 1);
- auto clamped_h = utility::clamp<int>(index_h, 0, in_dim_h - 1);
- auto clamped_h1 = utility::clamp<int>(index_h + 1, 0, in_dim_h - 1);
-
- const auto a00 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h * stride_h);
- const auto a01 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h * stride_h);
- const auto a10 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h1 * stride_h);
- const auto a11 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h1 * stride_h);
-
- const float inp00 = Qasymm8QuantizationHelper<int8_t>::dequantize(a00, iq_info);
- const float inp01 = Qasymm8QuantizationHelper<int8_t>::dequantize(a01, iq_info);
- const float inp10 = Qasymm8QuantizationHelper<int8_t>::dequantize(a10, iq_info);
- const float inp11 = Qasymm8QuantizationHelper<int8_t>::dequantize(a11, iq_info);
- *reinterpret_cast<int8_t *>(out.ptr()) = Qasymm8QuantizationHelper<int8_t>::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info);
- },
- in, out);
- }
- else
- {
- ARM_COMPUTE_ERROR("Not implemented");
- }
-}
-} // namespace
-namespace cpu
-{
-void qasymm8_signed_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
-{
- if(policy == InterpolationPolicy::BILINEAR)
- {
- qasymm8_signed_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window);
- }
- else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
- {
- nearest_neon_scale<int8_t>(src, dst, offsets, sampling_offset, align_corners, window);
- }
-}
-} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
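
Note: every bilinear branch in this patch gathers four neighbouring samples (a00, a01, a10, a11) and blends them through scale_helpers::delta_bilinear. A minimal sketch of that blend, assuming the conventional bilinear weighting (the helper's real definition lives in src/core/helpers/ScaleHelpers.h and is not quoted here):

    // Sketch only: conventional bilinear interpolation of four neighbours.
    // dx, dy in [0, 1] are the fractional distances from the top-left sample.
    float delta_bilinear_sketch(float a00, float a01, float a10, float a11, float dx, float dy)
    {
        const float dx1 = 1.0f - dx;
        const float dy1 = 1.0f - dy;
        return a00 * dx1 * dy1  // top-left
             + a01 * dx * dy1   // top-right
             + a10 * dx1 * dy   // bottom-left
             + a11 * dx * dy;   // bottom-right
    }
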
diff --git a/src/core/cpu/kernels/scale/sve/fp16.cpp b/src/core/cpu/kernels/scale/sve/fp16.cpp
deleted file mode 100644
index 5b9377c6e6..0000000000
--- a/src/core/cpu/kernels/scale/sve/fp16.cpp
+++ /dev/null
@@ -1,176 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#if defined(ENABLE_SVE)
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensorPack.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/NEON/NEMath.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/ScaleHelpers.h"
-#include "src/core/utils/ScaleUtils.h"
-#include "support/Rounding.h"
-
-#include <arm_sve.h>
-#include <cmath>
-#include <cstddef>
-
-namespace arm_compute
-{
-namespace
-{
-void fp16_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets,
- float sampling_offset, bool align_corners, const Window &window)
-{
- const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
- const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom;
- const size_t in_stride_wc = in_stride_w * in_stride_c;
- const size_t in_dim_h = src->info()->dimension(2);
-
- // Compute the ratio between source height and destination height
- const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners);
- const auto window_start_x = static_cast<int32_t>(window.x().start());
- const auto window_end_x = static_cast<int32_t>(window.x().end());
-
- Window win(window);
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- Iterator out(dst, win);
-
- const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
- const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3];
-
- execute_window_loop(win, [&](const Coordinates & id)
- {
- const int32_t offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
- const auto in_hi = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr));
- const int offset_row = in_hi * in_stride_wc;
- const auto in_ptr = reinterpret_cast<const float16_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
- const auto out_ptr = reinterpret_cast<float16_t *>(out.ptr());
-
-        // Process one full vector of elements per iteration; the predicate masks the tail
- int x = window_start_x;
- svbool_t pg = svwhilelt_b16(x, window_end_x);
- do
- {
- // Store results
- svst1_f16(pg, out_ptr + x, svld1_f16(pg, in_ptr + offset + offset_row + x));
-
-            x += svcnth(); // advance by the number of 16-bit lanes in a vector
- pg = svwhilelt_b16(x, window_end_x);
- }
- while(svptest_any(svptrue_b16(), pg));
- },
- out);
-}
-
-void fp16_sve_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
-{
- // Compute the ratio between source height and destination height
- const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
-
- Iterator out(dst, window);
- const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
- const int in_dim_w = src->info()->dimension(1);
- const int in_dim_h = src->info()->dimension(2);
- const int in_stride_wc = in_stride_c * (in_dim_w + src->info()->padding().top + src->info()->padding().bottom);
-
- // Don't increment in Y and Z direction for the input tensor
- // A pointer to the start of this plane is needed as base for the precomputed offsets
- Window win_in(window);
- win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
- win_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
- Iterator in(src, win_in);
-
- if(border_mode == BorderMode::CONSTANT)
- {
-        // PixelValue stores fp16 values through the half wrapper type
-        const float16_t const_border_value = static_cast<float16_t>(constant_border_value.get<half>());
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
- const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
- const float16_t *in_ptr = reinterpret_cast<const float16_t *>(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc;
-
- const auto a00 = (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value;
- const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) ? *(in_ptr + in_stride_c) : const_border_value;
- const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_wc) : const_border_value;
- const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_c + in_stride_wc) : const_border_value;
-
- *reinterpret_cast<float16_t *>(out.ptr()) = static_cast<float16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
- },
- in, out);
- }
- else if(border_mode == BorderMode::REPLICATE)
- {
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
- const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
-
- auto clamped_w = utility::clamp<int>(offset, 0, in_dim_w - 1);
- auto clamped_w1 = utility::clamp<int>(offset + 1, 0, in_dim_w - 1);
- auto clamped_h = utility::clamp<int>(in_hi, 0, in_dim_h - 1);
- auto clamped_h1 = utility::clamp<int>(in_hi + 1, 0, in_dim_h - 1);
-
- const auto a00 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc);
- const auto a01 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h * in_stride_wc);
- const auto a10 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h1 * in_stride_wc);
- const auto a11 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h1 * in_stride_wc);
-
- *reinterpret_cast<float16_t *>(out.ptr()) = static_cast<float16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
- },
- in, out);
- }
- else
- {
- ARM_COMPUTE_ERROR("Not implemented");
- }
-}
-} // namespace
-namespace cpu
-{
-void fp16_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
-{
- if(policy == InterpolationPolicy::BILINEAR)
- {
- fp16_sve_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window);
- }
- else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
- {
- fp16_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window);
- }
-}
-} // namespace cpu
-} // namespace arm_compute
-
-#endif // ENABLE_SVE
\ No newline at end of file
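
Note: all of the *_sve_scale_nearest kernels deleted in this patch share one loop shape: build a governing predicate with svwhilelt, do a predicated load/store, advance by the vector length, and stop once the predicate is all-false. A self-contained sketch of the pattern for 32-bit data (illustrative helper, not an ACL API):

    #include <arm_sve.h>
    #include <cstdint>

    // Copy n floats with SVE tail predication; the final predicate masks
    // out-of-range lanes, so no scalar remainder loop is needed.
    void sve_copy_f32(const float *src, float *dst, int32_t n)
    {
        int32_t  x  = 0;
        svbool_t pg = svwhilelt_b32(x, n);
        do
        {
            svst1_f32(pg, dst + x, svld1_f32(pg, src + x));
            x += svcntw();            // 32-bit lanes per vector
            pg = svwhilelt_b32(x, n); // all-false once x >= n
        }
        while(svptest_any(svptrue_b32(), pg));
    }
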
diff --git a/src/core/cpu/kernels/scale/sve/fp32.cpp b/src/core/cpu/kernels/scale/sve/fp32.cpp
deleted file mode 100644
index 05fbedf20d..0000000000
--- a/src/core/cpu/kernels/scale/sve/fp32.cpp
+++ /dev/null
@@ -1,174 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#if defined(ENABLE_SVE)
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensorPack.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/NEON/NEMath.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/ScaleHelpers.h"
-#include "src/core/utils/ScaleUtils.h"
-#include "support/Rounding.h"
-
-#include <arm_sve.h>
-#include <cmath>
-#include <cstddef>
-
-namespace arm_compute
-{
-namespace
-{
-void fp32_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets,
- float sampling_offset, bool align_corners, const Window &window)
-{
- const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
- const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom;
- const size_t in_stride_wc = in_stride_w * in_stride_c;
- const size_t in_dim_h = src->info()->dimension(2);
-
- // Compute the ratio between source height and destination height
- const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners);
- const auto window_start_x = static_cast<int32_t>(window.x().start());
- const auto window_end_x = static_cast<int32_t>(window.x().end());
-
- Window win(window);
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- Iterator out(dst, win);
-
- const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
- const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3];
-
- execute_window_loop(win, [&](const Coordinates & id)
- {
- const int32_t offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
- const auto in_hi = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr));
- const int offset_row = in_hi * in_stride_wc;
- const auto in_ptr = reinterpret_cast<const float *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
- const auto out_ptr = reinterpret_cast<float *>(out.ptr());
-
-        // Process one full vector of elements per iteration; the predicate masks the tail
- int x = window_start_x;
- svbool_t pg = svwhilelt_b32(x, window_end_x);
- do
- {
- // Store results
- svst1_f32(pg, out_ptr + x, svld1_f32(pg, in_ptr + offset + offset_row + x));
-
- x += svcntw();
- pg = svwhilelt_b32(x, window_end_x);
- }
- while(svptest_any(svptrue_b32(), pg));
- },
- out);
-}
-
-void fp32_sve_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
-{
- // Compute the ratio between source height and destination height
- const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
-
- Iterator out(dst, window);
- const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
- const int in_dim_w = src->info()->dimension(1);
- const int in_dim_h = src->info()->dimension(2);
- const int in_stride_wc = in_stride_c * (in_dim_w + src->info()->padding().top + src->info()->padding().bottom);
-
- // Don't increment in Y and Z direction for the input tensor
- // A pointer to the start of this plane is needed as base for the precomputed offsets
- Window win_in(window);
- win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
- win_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
- Iterator in(src, win_in);
-
- if(border_mode == BorderMode::CONSTANT)
- {
-        const float const_border_value = constant_border_value.get<float>();
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
- const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
- const float *in_ptr = reinterpret_cast<const float *>(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc;
-
- const auto a00 = (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value;
- const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) ? *(in_ptr + in_stride_c) : const_border_value;
- const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_wc) : const_border_value;
- const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_c + in_stride_wc) : const_border_value;
-
- *reinterpret_cast<float *>(out.ptr()) = static_cast<float>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
- },
- in, out);
- }
- else if(border_mode == BorderMode::REPLICATE)
- {
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
- const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
-
- auto clamped_w = utility::clamp<int>(offset, 0, in_dim_w - 1);
- auto clamped_w1 = utility::clamp<int>(offset + 1, 0, in_dim_w - 1);
- auto clamped_h = utility::clamp<int>(in_hi, 0, in_dim_h - 1);
- auto clamped_h1 = utility::clamp<int>(in_hi + 1, 0, in_dim_h - 1);
-
- const auto a00 = *(reinterpret_cast<const float *>(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc);
- const auto a01 = *(reinterpret_cast<const float *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h * in_stride_wc);
- const auto a10 = *(reinterpret_cast<const float *>(in.ptr()) + clamped_w * in_stride_c + clamped_h1 * in_stride_wc);
- const auto a11 = *(reinterpret_cast<const float *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h1 * in_stride_wc);
-
- *reinterpret_cast<float *>(out.ptr()) = static_cast<float>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
- },
- in, out);
- }
- else
- {
- ARM_COMPUTE_ERROR("Not implemented");
- }
-}
-} // namespace
-namespace cpu
-{
-void fp32_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
-{
- if(policy == InterpolationPolicy::BILINEAR)
- {
- fp32_sve_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window);
- }
- else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
- {
- fp32_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window);
- }
-}
-} // namespace cpu
-} // namespace arm_compute
-
-#endif // ENABLE_SVE
\ No newline at end of file
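
Note: each kernel derives its vertical ratio hr from scale_utils::calculate_resize_ratio(in, out, align_corners). Assuming the usual align-corners convention, where the corner samples of input and output are pinned to each other, the rule is roughly the following sketch (the library helper may differ in edge cases):

    #include <cstddef>

    // With align_corners the ratio is taken over the spans between corner
    // samples, i.e. (in - 1) / (out - 1); otherwise it is the plain size ratio.
    float resize_ratio_sketch(std::size_t in_size, std::size_t out_size, bool align_corners)
    {
        if(align_corners && in_size > 1 && out_size > 1)
        {
            return static_cast<float>(in_size - 1) / static_cast<float>(out_size - 1);
        }
        return static_cast<float>(in_size) / static_cast<float>(out_size);
    }
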
diff --git a/src/core/cpu/kernels/scale/sve/integer.cpp b/src/core/cpu/kernels/scale/sve/integer.cpp
deleted file mode 100644
index d7e270c661..0000000000
--- a/src/core/cpu/kernels/scale/sve/integer.cpp
+++ /dev/null
@@ -1,300 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#if defined(ENABLE_SVE)
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensorPack.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/NEON/NEMath.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/ScaleHelpers.h"
-#include "src/core/utils/ScaleUtils.h"
-#include "support/Rounding.h"
-
-#include <arm_sve.h>
-#include <cmath>
-#include <cstddef>
-
-namespace arm_compute
-{
-namespace
-{
-void u8_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets,
- float sampling_offset, bool align_corners, const Window &window)
-{
- const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
- const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom;
- const size_t in_stride_wc = in_stride_w * in_stride_c;
- const size_t in_dim_h = src->info()->dimension(2);
-
- // Compute the ratio between source height and destination height
- const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners);
- const auto window_start_x = static_cast<int32_t>(window.x().start());
- const auto window_end_x = static_cast<int32_t>(window.x().end());
-
- Window win(window);
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- Iterator out(dst, win);
-
- const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
- const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3];
-
- execute_window_loop(win, [&](const Coordinates & id)
- {
- const int32_t offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
- const auto in_hi = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr));
- const int offset_row = in_hi * in_stride_wc;
- const auto in_ptr = reinterpret_cast<const uint8_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
- const auto out_ptr = reinterpret_cast<uint8_t *>(out.ptr());
-
-        // Process one full vector of elements per iteration; the predicate masks the tail
- int x = window_start_x;
- svbool_t pg = svwhilelt_b8(x, window_end_x);
- do
- {
- // Store results
- svst1_u8(pg, out_ptr + x, svld1_u8(pg, in_ptr + offset + offset_row + x));
-
-            x += svcntb(); // advance by the number of 8-bit lanes in a vector
- pg = svwhilelt_b8(x, window_end_x);
- }
- while(svptest_any(svptrue_b8(), pg));
- },
- out);
-}
-
-void u8_sve_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
-{
- // Compute the ratio between source height and destination height
- const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
-
- Iterator out(dst, window);
- const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
- const int in_dim_w = src->info()->dimension(1);
- const int in_dim_h = src->info()->dimension(2);
- const int in_stride_wc = in_stride_c * (in_dim_w + src->info()->padding().top + src->info()->padding().bottom);
-
- // Don't increment in Y and Z direction for the input tensor
- // A pointer to the start of this plane is needed as base for the precomputed offsets
- Window win_in(window);
- win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
- win_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
- Iterator in(src, win_in);
-
- if(border_mode == BorderMode::CONSTANT)
- {
- const uint8_t const_border_value = static_cast<uint8_t>(constant_border_value.get<uint8_t>());
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
- const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
- const uint8_t *in_ptr = reinterpret_cast<const uint8_t *>(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc;
-
- const auto a00 = (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value;
- const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) ? *(in_ptr + in_stride_c) : const_border_value;
- const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_wc) : const_border_value;
- const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_c + in_stride_wc) : const_border_value;
-
- *reinterpret_cast<uint8_t *>(out.ptr()) = static_cast<uint8_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
- },
- in, out);
- }
- else if(border_mode == BorderMode::REPLICATE)
- {
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
- const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
-
- auto clamped_w = utility::clamp<int>(offset, 0, in_dim_w - 1);
- auto clamped_w1 = utility::clamp<int>(offset + 1, 0, in_dim_w - 1);
- auto clamped_h = utility::clamp<int>(in_hi, 0, in_dim_h - 1);
- auto clamped_h1 = utility::clamp<int>(in_hi + 1, 0, in_dim_h - 1);
-
- const auto a00 = *(reinterpret_cast<const uint8_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc);
- const auto a01 = *(reinterpret_cast<const uint8_t *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h * in_stride_wc);
- const auto a10 = *(reinterpret_cast<const uint8_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h1 * in_stride_wc);
- const auto a11 = *(reinterpret_cast<const uint8_t *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h1 * in_stride_wc);
-
- *reinterpret_cast<uint8_t *>(out.ptr()) = static_cast<uint8_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
- },
- in, out);
- }
- else
- {
- ARM_COMPUTE_ERROR("Not implemented");
- }
-}
-
-void s16_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets,
- float sampling_offset, bool align_corners, const Window &window)
-{
- const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
- const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom;
- const size_t in_stride_wc = in_stride_w * in_stride_c;
- const size_t in_dim_h = src->info()->dimension(2);
-
- // Compute the ratio between source height and destination height
- const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners);
- const auto window_start_x = static_cast<int32_t>(window.x().start());
- const auto window_end_x = static_cast<int32_t>(window.x().end());
-
- Window win(window);
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- Iterator out(dst, win);
-
- const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
- const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3];
-
- execute_window_loop(win, [&](const Coordinates & id)
- {
- const int32_t offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
- const auto in_hi = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr));
- const int offset_row = in_hi * in_stride_wc;
- const auto in_ptr = reinterpret_cast<const int16_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
- const auto out_ptr = reinterpret_cast<int16_t *>(out.ptr());
-
-        // Process one full vector of elements per iteration; the predicate masks the tail
- int x = window_start_x;
- svbool_t pg = svwhilelt_b16(x, window_end_x);
- do
- {
- // Store results
- svst1_s16(pg, out_ptr + x, svld1_s16(pg, in_ptr + offset + offset_row + x));
-
-            x += svcnth(); // advance by the number of 16-bit lanes in a vector
- pg = svwhilelt_b16(x, window_end_x);
- }
- while(svptest_any(svptrue_b16(), pg));
- },
- out);
-}
-
-void s16_sve_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
-{
- // Compute the ratio between source height and destination height
- const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
-
- Iterator out(dst, window);
- const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
- const int in_dim_w = src->info()->dimension(1);
- const int in_dim_h = src->info()->dimension(2);
- const int in_stride_wc = in_stride_c * (in_dim_w + src->info()->padding().top + src->info()->padding().bottom);
-
- // Don't increment in Y and Z direction for the input tensor
- // A pointer to the start of this plane is needed as base for the precomputed offsets
- Window win_in(window);
- win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
- win_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
- Iterator in(src, win_in);
-
- if(border_mode == BorderMode::CONSTANT)
- {
- const int16_t const_border_value = static_cast<int16_t>(constant_border_value.get<int16_t>());
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
- const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
- const int16_t *in_ptr = reinterpret_cast<const int16_t *>(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc;
-
- const auto a00 = (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value;
- const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) ? *(in_ptr + in_stride_c) : const_border_value;
- const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_wc) : const_border_value;
- const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_c + in_stride_wc) : const_border_value;
-
- *reinterpret_cast<int16_t *>(out.ptr()) = static_cast<int16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
- },
- in, out);
- }
- else if(border_mode == BorderMode::REPLICATE)
- {
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
- const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
- const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
-
- auto clamped_w = utility::clamp<int>(offset, 0, in_dim_w - 1);
- auto clamped_w1 = utility::clamp<int>(offset + 1, 0, in_dim_w - 1);
- auto clamped_h = utility::clamp<int>(in_hi, 0, in_dim_h - 1);
- auto clamped_h1 = utility::clamp<int>(in_hi + 1, 0, in_dim_h - 1);
-
- const auto a00 = *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc);
- const auto a01 = *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h * in_stride_wc);
- const auto a10 = *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h1 * in_stride_wc);
- const auto a11 = *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h1 * in_stride_wc);
-
- *reinterpret_cast<int16_t *>(out.ptr()) = static_cast<int16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
- },
- in, out);
- }
- else
- {
- ARM_COMPUTE_ERROR("Not implemented");
- }
-}
-} // namespace
-namespace cpu
-{
-void u8_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
-{
- if(policy == InterpolationPolicy::BILINEAR)
- {
- u8_sve_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window);
- }
- else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
- {
- u8_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window);
- }
-}
-
-void s16_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
-{
- if(policy == InterpolationPolicy::BILINEAR)
- {
- s16_sve_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window);
- }
- else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
- {
- s16_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window);
- }
-}
-} // namespace cpu
-} // namespace arm_compute
-
-#endif // ENABLE_SVE
\ No newline at end of file
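
Note: the u8 and s16 bilinear paths above differ only in element type; the border handling is identical. BorderMode::CONSTANT substitutes a caller-provided value for out-of-range neighbours, while BorderMode::REPLICATE clamps the coordinate onto the valid range. A compact single-axis sketch of the two policies (hypothetical helper, not library code):

    #include <algorithm>
    #include <cstdint>

    int16_t sample_with_border(const int16_t *row, int width, int x, bool replicate, int16_t border)
    {
        if(replicate)
        {
            return row[std::clamp(x, 0, width - 1)]; // REPLICATE: clamp the coordinate
        }
        return (x >= 0 && x < width) ? row[x] : border; // CONSTANT: substitute the value
    }
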
diff --git a/src/core/cpu/kernels/scale/sve/list.h b/src/core/cpu/kernels/scale/sve/list.h
deleted file mode 100644
index b9c3a10a78..0000000000
--- a/src/core/cpu/kernels/scale/sve/list.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_CORE_SVE_KERNELS_SCALE_LIST_H
-#define SRC_CORE_SVE_KERNELS_SCALE_LIST_H
-
-namespace arm_compute
-{
-namespace cpu
-{
-#define DECLARE_SCALE_KERNEL(func_name) \
- void func_name(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, \
- InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, \
- bool align_corners, const Window &window)
-
-DECLARE_SCALE_KERNEL(fp16_sve_scale);
-DECLARE_SCALE_KERNEL(fp32_sve_scale);
-DECLARE_SCALE_KERNEL(s16_sve_scale);
-DECLARE_SCALE_KERNEL(u8_sve_scale);
-DECLARE_SCALE_KERNEL(qasymm8_sve_scale);
-DECLARE_SCALE_KERNEL(qasymm8_signed_sve_scale);
-
-#undef DECLARE_SCALE_KERNEL
-} // namespace cpu
-} // namespace arm_compute
-
-#endif /* SRC_CORE_SVE_KERNELS_SCALE_LIST_H */
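
Note: because DECLARE_SCALE_KERNEL stamps out the identical signature for every data type, a caller can hold any of the declared kernels behind one function-pointer type and select an implementation at runtime. A sketch of that idea (the alias name is invented for illustration; this is not the library's actual dispatch code):

    #include "arm_compute/core/ITensor.h"
    #include "arm_compute/core/PixelValue.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/core/Window.h"

    namespace arm_compute
    {
    namespace cpu
    {
    // Hypothetical alias matching the DECLARE_SCALE_KERNEL signature;
    // usage: ScaleKernelFn fn = fp32_sve_scale; then call fn(...).
    using ScaleKernelFn = void (*)(const ITensor *, ITensor *, const ITensor *, const ITensor *, const ITensor *,
                                   InterpolationPolicy, BorderMode, PixelValue, float, bool, const Window &);
    } // namespace cpu
    } // namespace arm_compute
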
diff --git a/src/core/cpu/kernels/scale/sve/qasymm8.cpp b/src/core/cpu/kernels/scale/sve/qasymm8.cpp
deleted file mode 100644
index f747037938..0000000000
--- a/src/core/cpu/kernels/scale/sve/qasymm8.cpp
+++ /dev/null
@@ -1,207 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#if defined(ENABLE_SVE)
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensorPack.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/NEON/NEMath.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/ScaleHelpers.h"
-#include "src/core/utils/ScaleUtils.h"
-#include "support/Rounding.h"
-
-#include <arm_sve.h>
-#include <cmath>
-#include <cstddef>
-
-namespace arm_compute
-{
-namespace
-{
-void qasymm8_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets,
- float sampling_offset, bool align_corners, const Window &window)
-{
- const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
- const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom;
- const size_t in_stride_wc = in_stride_w * in_stride_c;
- const size_t in_dim_h = src->info()->dimension(2);
-
- // Compute the ratio between source height and destination height
- const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners);
- const auto window_start_x = static_cast<int32_t>(window.x().start());
- const auto window_end_x = static_cast<int32_t>(window.x().end());
-
- Window win(window);
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- Iterator out(dst, win);
-
- const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
- const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3];
-
- execute_window_loop(win, [&](const Coordinates & id)
- {
- const int32_t offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
- const auto in_hi = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr));
- const int offset_row = in_hi * in_stride_wc;
- const auto in_ptr = reinterpret_cast<const uint8_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
- const auto out_ptr = reinterpret_cast<uint8_t *>(out.ptr());
-
-        // Process one full vector of elements per iteration; the predicate masks the tail
- int x = window_start_x;
- svbool_t pg = svwhilelt_b8(x, window_end_x);
- do
- {
- // Store results
- svst1_u8(pg, out_ptr + x, svld1_u8(pg, in_ptr + offset + offset_row + x));
-
-            x += svcntb(); // advance by the number of 8-bit lanes in a vector
- pg = svwhilelt_b8(x, window_end_x);
- }
- while(svptest_any(svptrue_b8(), pg));
- },
- out);
-}
-
-void qasymm8_sve_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
-{
- // Data layout is NHWC
- const int idx_width = 1;
- const int idx_height = 2;
-
- // Compute the ratio between source height and destination height
- const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(idx_height), dst->info()->dimension(idx_height), align_corners);
- Window win_off;
- win_off.set(Window::DimX, Window::Dimension(0, 0, 0));
- win_off.set(Window::DimY, Window::Dimension(0, 0, 0));
-
- // Don't increment in X and Y direction for the input tensor
- // A pointer to the start of this plane is needed as base for the precomputed offsets
- Window win_in(window);
- win_in.set(idx_width, Window::Dimension(0, 0, 0));
- win_in.set(idx_height, Window::Dimension(0, 0, 0));
-
- for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d)
- {
- win_off.set(d, Window::Dimension(0, 0, 0));
- }
-
- Iterator in(src, win_in);
- Iterator out(dst, window);
-
- const int32_t in_dim_w = src->info()->dimension(idx_width);
- const int32_t in_dim_h = src->info()->dimension(idx_height);
- const int32_t stride_w = src->info()->strides_in_bytes()[idx_width];
- const int32_t stride_h = src->info()->strides_in_bytes()[idx_height];
-
- const UniformQuantizationInfo iq_info = src->info()->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform();
-
- if(border_mode == BorderMode::CONSTANT)
- {
- const uint8_t const_border_value = static_cast<uint8_t>(constant_border_value.get<uint8_t>());
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const int32_t index_h = std::floor((id[idx_height] + sampling_offset) * hr - sampling_offset);
- const int32_t index_w = *(reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
- const auto dx_val = *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
- const auto dy_val = *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
- const auto pixel_row_ptr = reinterpret_cast<const uint8_t *>(in.ptr());
-
- const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h) ?
- (*(pixel_row_ptr + index_w * stride_w + index_h * stride_h)) :
- const_border_value;
- const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h) ?
- (*(pixel_row_ptr + (index_w + 1) * stride_w + index_h * stride_h)) :
- const_border_value;
- const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h && index_h < in_dim_h - 1) ?
- (*(pixel_row_ptr + index_w * stride_w + (index_h + 1) * stride_h)) :
- const_border_value;
- const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h && index_h < in_dim_h - 1) ?
- (*(pixel_row_ptr + (index_w + 1) * stride_w + (index_h + 1) * stride_h)) :
- const_border_value;
-
- const float inp00 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a00, iq_info);
- const float inp01 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a01, iq_info);
- const float inp10 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a10, iq_info);
- const float inp11 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a11, iq_info);
- *reinterpret_cast<uint8_t *>(out.ptr()) = Qasymm8QuantizationHelper<uint8_t>::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info);
- },
- in, out);
- }
- else if(border_mode == BorderMode::REPLICATE)
- {
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const int index_h = std::floor((id[idx_height] + sampling_offset) * hr - sampling_offset);
- const int32_t index_w = *(reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
- const auto dx_val = *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
- const auto dy_val = *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
- const auto pixel_row_ptr = reinterpret_cast<const uint8_t *>(in.ptr());
-
- auto clamped_w = utility::clamp<int>(index_w, 0, in_dim_w - 1);
- auto clamped_w1 = utility::clamp<int>(index_w + 1, 0, in_dim_w - 1);
- auto clamped_h = utility::clamp<int>(index_h, 0, in_dim_h - 1);
- auto clamped_h1 = utility::clamp<int>(index_h + 1, 0, in_dim_h - 1);
-
- const auto a00 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h * stride_h);
- const auto a01 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h * stride_h);
- const auto a10 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h1 * stride_h);
- const auto a11 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h1 * stride_h);
-
- const float inp00 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a00, iq_info);
- const float inp01 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a01, iq_info);
- const float inp10 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a10, iq_info);
- const float inp11 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a11, iq_info);
- *reinterpret_cast<uint8_t *>(out.ptr()) = Qasymm8QuantizationHelper<uint8_t>::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info);
- },
- in, out);
- }
- else
- {
- ARM_COMPUTE_ERROR("Not implemented");
- }
-}
-} // namespace
-namespace cpu
-{
-void qasymm8_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
-{
- if(policy == InterpolationPolicy::BILINEAR)
- {
- qasymm8_sve_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window);
- }
- else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
- {
- qasymm8_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window);
- }
-}
-} // namespace cpu
-} // namespace arm_compute
-
-#endif // defined(ENABLE_SVE)
\ No newline at end of file
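
Note: the QASYMM8 bilinear path dequantizes the four neighbours to float, interpolates, then re-quantizes through Qasymm8QuantizationHelper. Assuming the standard affine scheme real = scale * (q - offset), a round-trip sketch (the helper's exact rounding behaviour is an assumption here):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    float dequantize_u8(uint8_t q, float scale, int32_t offset)
    {
        return scale * (static_cast<int32_t>(q) - offset); // real = scale * (q - offset)
    }

    uint8_t quantize_u8(float v, float scale, int32_t offset)
    {
        // q = round(real / scale) + offset, clamped to the uint8 range
        const int32_t q = static_cast<int32_t>(std::lround(v / scale)) + offset;
        return static_cast<uint8_t>(std::clamp(q, 0, 255));
    }
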
diff --git a/src/core/cpu/kernels/scale/sve/qasymm8_signed.cpp b/src/core/cpu/kernels/scale/sve/qasymm8_signed.cpp
deleted file mode 100644
index 584ec7a0da..0000000000
--- a/src/core/cpu/kernels/scale/sve/qasymm8_signed.cpp
+++ /dev/null
@@ -1,207 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#if defined(ENABLE_SVE)
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensorPack.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/NEON/NEMath.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/ScaleHelpers.h"
-#include "src/core/utils/ScaleUtils.h"
-#include "support/Rounding.h"
-
-#include <arm_sve.h>
-#include <cmath>
-#include <cstddef>
-
-namespace arm_compute
-{
-namespace
-{
-void qasymm8_signed_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets,
- float sampling_offset, bool align_corners, const Window &window)
-{
- const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
- const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom;
- const size_t in_stride_wc = in_stride_w * in_stride_c;
- const size_t in_dim_h = src->info()->dimension(2);
-
- // Compute the ratio between source height and destination height
- const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners);
- const auto window_start_x = static_cast<int32_t>(window.x().start());
- const auto window_end_x = static_cast<int32_t>(window.x().end());
-
- Window win(window);
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- Iterator out(dst, win);
-
- const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
- const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3];
-
- execute_window_loop(win, [&](const Coordinates & id)
- {
- const int32_t offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
- const auto in_hi = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr));
- const int offset_row = in_hi * in_stride_wc;
- const auto in_ptr = reinterpret_cast<const int8_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
- const auto out_ptr = reinterpret_cast<int8_t *>(out.ptr());
-
-        // Process one full vector of elements per iteration; the predicate masks the tail
- int x = window_start_x;
- svbool_t pg = svwhilelt_b8(x, window_end_x);
- do
- {
- // Store results
- svst1_s8(pg, out_ptr + x, svld1_s8(pg, in_ptr + offset + offset_row + x));
-
-            x += svcntb(); // advance by the number of 8-bit lanes in a vector
- pg = svwhilelt_b8(x, window_end_x);
- }
- while(svptest_any(svptrue_b8(), pg));
- },
- out);
-}
-
-void qasymm8_signed_sve_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
-{
- // Data layout is NHWC
- const int idx_width = 1;
- const int idx_height = 2;
-
- // Compute the ratio between source height and destination height
- const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(idx_height), dst->info()->dimension(idx_height), align_corners);
- Window win_off;
- win_off.set(Window::DimX, Window::Dimension(0, 0, 0));
- win_off.set(Window::DimY, Window::Dimension(0, 0, 0));
-
- // Don't increment in X and Y direction for the input tensor
- // A pointer to the start of this plane is needed as base for the precomputed offsets
- Window win_in(window);
- win_in.set(idx_width, Window::Dimension(0, 0, 0));
- win_in.set(idx_height, Window::Dimension(0, 0, 0));
-
- for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d)
- {
- win_off.set(d, Window::Dimension(0, 0, 0));
- }
-
- Iterator in(src, win_in);
- Iterator out(dst, window);
-
- const int32_t in_dim_w = src->info()->dimension(idx_width);
- const int32_t in_dim_h = src->info()->dimension(idx_height);
- const int32_t stride_w = src->info()->strides_in_bytes()[idx_width];
- const int32_t stride_h = src->info()->strides_in_bytes()[idx_height];
-
- const UniformQuantizationInfo iq_info = src->info()->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform();
-
- if(border_mode == BorderMode::CONSTANT)
- {
- const int8_t const_border_value = static_cast<int8_t>(constant_border_value.get<int8_t>());
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const int32_t index_h = std::floor((id[idx_height] + sampling_offset) * hr - sampling_offset);
- const int32_t index_w = *(reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
- const auto dx_val = *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
- const auto dy_val = *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
- const auto pixel_row_ptr = reinterpret_cast<const int8_t *>(in.ptr());
-
- const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h) ?
- (*(pixel_row_ptr + index_w * stride_w + index_h * stride_h)) :
- const_border_value;
- const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h) ?
- (*(pixel_row_ptr + (index_w + 1) * stride_w + index_h * stride_h)) :
- const_border_value;
- const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h && index_h < in_dim_h - 1) ?
- (*(pixel_row_ptr + index_w * stride_w + (index_h + 1) * stride_h)) :
- const_border_value;
- const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h && index_h < in_dim_h - 1) ?
- (*(pixel_row_ptr + (index_w + 1) * stride_w + (index_h + 1) * stride_h)) :
- const_border_value;
-
- const float inp00 = Qasymm8QuantizationHelper<int8_t>::dequantize(a00, iq_info);
- const float inp01 = Qasymm8QuantizationHelper<int8_t>::dequantize(a01, iq_info);
- const float inp10 = Qasymm8QuantizationHelper<int8_t>::dequantize(a10, iq_info);
- const float inp11 = Qasymm8QuantizationHelper<int8_t>::dequantize(a11, iq_info);
- *reinterpret_cast<int8_t *>(out.ptr()) = Qasymm8QuantizationHelper<int8_t>::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info);
- },
- in, out);
- }
- else if(border_mode == BorderMode::REPLICATE)
- {
- execute_window_loop(window, [&](const Coordinates & id)
- {
- const int index_h = std::floor((id[idx_height] + sampling_offset) * hr - sampling_offset);
- const int32_t index_w = *(reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
- const auto dx_val = *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
- const auto dy_val = *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
- const auto pixel_row_ptr = reinterpret_cast<const int8_t *>(in.ptr());
-
- auto clamped_w = utility::clamp<int>(index_w, 0, in_dim_w - 1);
- auto clamped_w1 = utility::clamp<int>(index_w + 1, 0, in_dim_w - 1);
- auto clamped_h = utility::clamp<int>(index_h, 0, in_dim_h - 1);
- auto clamped_h1 = utility::clamp<int>(index_h + 1, 0, in_dim_h - 1);
-
- const auto a00 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h * stride_h);
- const auto a01 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h * stride_h);
- const auto a10 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h1 * stride_h);
- const auto a11 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h1 * stride_h);
-
- const float inp00 = Qasymm8QuantizationHelper<int8_t>::dequantize(a00, iq_info);
- const float inp01 = Qasymm8QuantizationHelper<int8_t>::dequantize(a01, iq_info);
- const float inp10 = Qasymm8QuantizationHelper<int8_t>::dequantize(a10, iq_info);
- const float inp11 = Qasymm8QuantizationHelper<int8_t>::dequantize(a11, iq_info);
- *reinterpret_cast<int8_t *>(out.ptr()) = Qasymm8QuantizationHelper<int8_t>::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info);
- },
- in, out);
- }
- else
- {
- ARM_COMPUTE_ERROR("Not implemented");
- }
-}
-} // namespace
-namespace cpu
-{
-void qasymm8_signed_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
- InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
- bool align_corners, const Window &window)
-{
- if(policy == InterpolationPolicy::BILINEAR)
- {
- qasymm8_signed_sve_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window);
- }
- else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
- {
- qasymm8_signed_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window);
- }
-}
-} // namespace cpu
-} // namespace arm_compute
-
-#endif // ENABLE_SVE \ No newline at end of file
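
The bilinear branch above reduces, per output pixel, to a dequantize/blend/requantize round trip. The following standalone C++ sketch shows that arithmetic with illustrative helper names (they are not library API), assuming uniform quantization parameters (scale, offset):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Dequantize a QASYMM8_SIGNED value with the given uniform parameters.
    float dequantize_s8(int8_t q, float scale, int32_t offset)
    {
        return (static_cast<int32_t>(q) - offset) * scale;
    }

    // Quantize back to int8, clamping to the representable range.
    int8_t quantize_s8(float v, float scale, int32_t offset)
    {
        const int32_t q = static_cast<int32_t>(std::lround(v / scale)) + offset;
        return static_cast<int8_t>(std::max(-128, std::min(127, q)));
    }

    // Blend the four neighbours with the fractional offsets dx, dy; this is
    // the role scale_helpers::delta_bilinear plays in the kernel above.
    float delta_bilinear(float a00, float a01, float a10, float a11, float dx, float dy)
    {
        const float dx1 = 1.f - dx;
        const float dy1 = 1.f - dy;
        return (a00 * dx1 + a01 * dx) * dy1 + (a10 * dx1 + a11 * dx) * dy;
    }
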
diff --git a/src/core/cpu/kernels/softmax/impl/neon/list.h b/src/core/cpu/kernels/softmax/impl/neon/list.h
deleted file mode 100644
index 5ebee31272..0000000000
--- a/src/core/cpu/kernels/softmax/impl/neon/list.h
+++ /dev/null
@@ -1,388 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H
-#define SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H
-
-#include "src/core/NEON/NEFixedPoint.h"
-#include "src/core/NEON/NEMath.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include "support/SaturateCast.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-template <typename T>
-void neon_logits_1d_max(const ITensor *in, ITensor *out, const Window &window)
-{
- /** SIMD vector tag type. */
- using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
-
- constexpr int window_step_x = 16 / sizeof(T);
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- Window win{ window };
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- Iterator input(in, win);
- Iterator output(out, win);
-
- const int sum_stages = log2(window_step_x / 2);
- execute_window_loop(win, [&](const Coordinates &)
- {
- // Get pointers
- const auto in_ptr = reinterpret_cast<const T *>(input.ptr());
- const auto out_ptr = reinterpret_cast<T *>(output.ptr());
-
- // Init max value
- auto vec_max = wrapper::vdup_n(support::cpp11::lowest<T>(), ExactTagType{});
- int x = window_start_x;
-
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto current_value = wrapper::vloadq(in_ptr + x);
- vec_max = wrapper::vmax(vec_max, current_value);
- }
- auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max));
-
- for(int i = 0; i < sum_stages; ++i)
- {
- carry_max = wrapper::vpmax(carry_max, carry_max);
- }
- T max_val = wrapper::vgetlane(carry_max, 0);
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- max_val = *(in_ptr + x) > max_val ? *(in_ptr + x) : max_val;
- }
-
- *out_ptr = max_val;
- },
- input, output);
-}
-
-template <typename T>
-void neon_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, float beta, bool is_log, const Window &window)
-{
- static_assert(std::is_same<T, qasymm8_t>::value
- || std::is_same<T, qasymm8_signed_t>::value,
- "quantized type should be either qasymm8_t or qasymm8_signed_t.");
-
- const int start_x = in->info()->valid_region().anchor.x();
- const int input_width = in->info()->valid_region().shape.x();
-
- const float scale_beta = -beta * in->info()->quantization_info().uniform().scale;
- const auto scale_beta_vec = vdupq_n_f32(scale_beta);
-
- Iterator in_it(in, window);
- Iterator max_it(max, window);
- Iterator out_it(out, window);
- constexpr int vec_size = 16;
-
- execute_window_loop(window, [&](const Coordinates &)
- {
- /* Get pointers */
- const auto in_ptr = reinterpret_cast<const T *>(in_it.ptr()) + start_x;
- const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x;
- const auto tmp_ptr = reinterpret_cast<float *>(tmp);
-
- float sum{};
- float sum_inversed{};
-
- /* Compute exponentials and sum */
- {
- /* Get max value */
- const auto max_val = *reinterpret_cast<const T *>(max_it.ptr());
- const auto vec_max = wrapper::vdup_n(max_val, wrapper::traits::vector_128_tag{});
-
- /* Init sum to zero */
- float32x4x4_t vec_sum =
- {
- vdupq_n_f32(0.f),
- vdupq_n_f32(0.f),
- vdupq_n_f32(0.f),
- vdupq_n_f32(0.f),
- };
-
- /* Loop over row and compute exponentials and sum */
- int x = 0;
- for(; x <= (input_width - vec_size); x += vec_size)
- {
- auto vec_elements = wrapper::vloadq(in_ptr + x);
- vec_elements = wrapper::vqsub(vec_max, vec_elements);
- auto vec_elements_flt = convert_int_to_float<float32x4x4_t>(vec_elements);
-
- if(is_log)
- {
- vec_elements_flt.val[0] = vmulq_f32(vec_elements_flt.val[0], scale_beta_vec);
- vec_elements_flt.val[1] = vmulq_f32(vec_elements_flt.val[1], scale_beta_vec);
- vec_elements_flt.val[2] = vmulq_f32(vec_elements_flt.val[2], scale_beta_vec);
- vec_elements_flt.val[3] = vmulq_f32(vec_elements_flt.val[3], scale_beta_vec);
- vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vexpq_f32(vec_elements_flt.val[0]));
- vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vexpq_f32(vec_elements_flt.val[1]));
- vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vexpq_f32(vec_elements_flt.val[2]));
- vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vexpq_f32(vec_elements_flt.val[3]));
- }
- else
- {
- vec_elements_flt.val[0] = vexpq_f32(vmulq_f32(vec_elements_flt.val[0], scale_beta_vec));
- vec_elements_flt.val[1] = vexpq_f32(vmulq_f32(vec_elements_flt.val[1], scale_beta_vec));
- vec_elements_flt.val[2] = vexpq_f32(vmulq_f32(vec_elements_flt.val[2], scale_beta_vec));
- vec_elements_flt.val[3] = vexpq_f32(vmulq_f32(vec_elements_flt.val[3], scale_beta_vec));
- vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vec_elements_flt.val[0]);
- vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vec_elements_flt.val[1]);
- vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vec_elements_flt.val[2]);
- vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vec_elements_flt.val[3]);
- }
-
- vst4q_f32(tmp_ptr + x, vec_elements_flt);
- }
-
- /* Reduce sum */
- const auto sum_16_byte = vaddq_f32(vaddq_f32(vec_sum.val[0], vec_sum.val[1]), vaddq_f32(vec_sum.val[2], vec_sum.val[3]));
- auto sum_res = vpadd_f32(vget_high_f32(sum_16_byte), vget_low_f32(sum_16_byte));
- sum_res = vpadd_f32(sum_res, sum_res);
- sum = wrapper::vgetlane(sum_res, 0);
-
- /* Run remaining elements */
- for(; x < input_width; ++x)
- {
- float element{};
- if(is_log)
- {
- element = (max_val - in_ptr[x]) * scale_beta;
- sum += std::exp(element);
- }
- else
- {
- element = std::exp((max_val - in_ptr[x]) * scale_beta);
- sum += element;
- }
-
- tmp_ptr[x] = element;
- }
-
- if(!is_log)
- {
- sum_inversed = 256.f / sum;
- }
- else
- {
- sum = std::log(sum);
- }
- }
-
- /* Normalize exponentials */
- {
- constexpr bool is_qasymm8_signed = std::is_same<T, qasymm8_signed_t>::value;
- /* Loop over row and compute softmax */
- int x = 0;
- for(; x <= (input_width - vec_size); x += vec_size)
- {
- using int_vec_type = wrapper::traits::neon_vector_t<T, 16>;
- float32x4x4_t vec_in = vld4q_f32(tmp_ptr + x);
- int_vec_type normalized_value{};
- if(is_log)
- {
- const float32x4x4_t sub =
- {
- vsubq_f32(vec_in.val[0], vdupq_n_f32(sum)),
- vsubq_f32(vec_in.val[1], vdupq_n_f32(sum)),
- vsubq_f32(vec_in.val[2], vdupq_n_f32(sum)),
- vsubq_f32(vec_in.val[3], vdupq_n_f32(sum)),
- };
- normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(sub);
- }
- else
- {
- float32x4x4_t mul =
- {
- vmulq_f32(vec_in.val[0], vdupq_n_f32(sum_inversed)),
- vmulq_f32(vec_in.val[1], vdupq_n_f32(sum_inversed)),
- vmulq_f32(vec_in.val[2], vdupq_n_f32(sum_inversed)),
- vmulq_f32(vec_in.val[3], vdupq_n_f32(sum_inversed)),
- };
-
- if(is_qasymm8_signed)
- {
- const auto offset_vec = wrapper::vdup_n(128.f, wrapper::traits::vector_128_tag{});
- mul.val[0] = wrapper::vsub(mul.val[0], offset_vec);
- mul.val[1] = wrapper::vsub(mul.val[1], offset_vec);
- mul.val[2] = wrapper::vsub(mul.val[2], offset_vec);
- mul.val[3] = wrapper::vsub(mul.val[3], offset_vec);
- }
-
- normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(mul);
- }
- wrapper::vstore(out_ptr + x, normalized_value);
- }
- /* Run remaining elements */
- for(; x < input_width; ++x)
- {
- if(is_log)
- {
- out_ptr[x] = utils::cast::saturate_cast<T>(tmp_ptr[x] - sum);
- }
- else
- {
- out_ptr[x] = utils::cast::saturate_cast<T>((tmp_ptr[x] * sum_inversed) - (is_qasymm8_signed ? 128.f : 0));
- }
- }
- }
- },
- in_it, max_it, out_it);
-}
-
-template <typename T>
-void neon_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, const float beta, bool is_log, const Window &window)
-{
- const int start_x = in->info()->valid_region().anchor.x();
- const int input_width = in->info()->valid_region().shape.x();
-
- Iterator in_it(in, window);
- Iterator max_it(max, window);
- Iterator out_it(out, window);
-
- /** SIMD vector tag type. */
- using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
-
- constexpr int vec_size = 16 / sizeof(T);
- const int sum_stages = log2(vec_size / 2);
-
- execute_window_loop(window, [&](const Coordinates &)
- {
- /* Get pointers */
- const auto in_ptr = reinterpret_cast<const T *>(in_it.ptr()) + start_x;
- const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x;
- const auto tmp_ptr = reinterpret_cast<T *>(tmp);
-
- T sum{};
- T sum_inversed{};
-
- /* Compute exponentials and sum */
- {
- /* Get max value */
- const auto max_val = *reinterpret_cast<const T *>(max_it.ptr());
- const auto vec_max = wrapper::vdup_n(max_val, ExactTagType{});
-
- /* Init sum to zero */
- auto vec_sum = wrapper::vdup_n(static_cast<T>(0), ExactTagType{});
-
- /* Loop over row and compute exponentials and sum */
- int x = 0;
- for(; x <= (input_width - vec_size); x += vec_size)
- {
- auto vec_elements = wrapper::vloadq(in_ptr + x);
- vec_elements = wrapper::vsub(vec_elements, vec_max);
- if(is_log)
- {
- vec_elements = wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast<T>(beta), ExactTagType{}));
- vec_sum = wrapper::vadd(vec_sum, wrapper::vexpq(vec_elements));
- }
- else
- {
- vec_elements = wrapper::vexpq(wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast<T>(beta), ExactTagType{})));
- vec_sum = wrapper::vadd(vec_sum, vec_elements);
- }
- wrapper::vstore(tmp_ptr + x, vec_elements);
- }
-
- /* Reduce sum */
- auto sum_res = wrapper::vpadd(wrapper::vgethigh(vec_sum), wrapper::vgetlow(vec_sum));
- for(int i = 0; i < sum_stages; ++i)
- {
- sum_res = wrapper::vpadd(sum_res, sum_res);
- }
- sum = wrapper::vgetlane(sum_res, 0);
-
- /* Run remaining elements */
- for(; x < input_width; ++x)
- {
- T element{};
-
- if(is_log)
- {
- element = (in_ptr[x] - max_val) * beta;
- sum += std::exp(element);
- }
- else
- {
- element = std::exp((in_ptr[x] - max_val) * beta);
- sum += element;
- }
- tmp_ptr[x] = element;
- }
-
- if(!is_log)
- {
- sum_inversed = T(1) / sum;
- }
- else
- {
- sum = static_cast<T>(std::log(sum));
- }
- }
-
- /* Normalize exponentials */
- {
- /* Loop over row and compute softmax */
- int x = 0;
- for(; x <= (input_width - vec_size); x += vec_size)
- {
- auto vec_in = wrapper::vloadq(tmp_ptr + x);
- auto normalized_value = wrapper::vdup_n(static_cast<T>(0), ExactTagType{});
- if(is_log)
- {
- normalized_value = wrapper::vsub(vec_in, wrapper::vdup_n(static_cast<T>(sum), ExactTagType{}));
- }
- else
- {
- normalized_value = wrapper::vmul(vec_in, wrapper::vdup_n(static_cast<T>(sum_inversed), ExactTagType{}));
- }
- wrapper::vstore(out_ptr + x, normalized_value);
- }
- /* Run remaining elements */
- for(; x < input_width; ++x)
- {
- if(is_log)
- {
- out_ptr[x] = tmp_ptr[x] - sum;
- }
- else
- {
- out_ptr[x] = tmp_ptr[x] * sum_inversed;
- }
- }
- }
- },
- in_it, max_it, out_it);
-}
-
-} // namespace cpu
-} // namespace arm_compute
-
-#endif /* SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H */
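
Both templates above share the same two-pass structure: find the row maximum, compute exp((x - max) * beta) while accumulating the sum, then normalize (or subtract log(sum) for log-softmax). A minimal scalar reference of that structure, with illustrative names and float input assumed:

    #include <algorithm>
    #include <cmath>

    void softmax_1d_ref(const float *in, float *out, int n, float beta, bool is_log)
    {
        // Row maximum, mirroring neon_logits_1d_max.
        float max_val = in[0];
        for(int i = 1; i < n; ++i) max_val = std::max(max_val, in[i]);

        // Exponentials (scaled differences for log-softmax) and their sum.
        float sum = 0.f;
        for(int i = 0; i < n; ++i)
        {
            const float e = (in[i] - max_val) * beta;
            const float p = std::exp(e);
            out[i] = is_log ? e : p;
            sum += p;
        }

        // Normalization pass.
        if(is_log)
        {
            const float log_sum = std::log(sum);
            for(int i = 0; i < n; ++i) out[i] -= log_sum;
        }
        else
        {
            const float inv_sum = 1.f / sum;
            for(int i = 0; i < n; ++i) out[i] *= inv_sum;
        }
    }
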
diff --git a/src/core/cpu/kernels/softmax/impl/sve/impl.cpp b/src/core/cpu/kernels/softmax/impl/sve/impl.cpp
deleted file mode 100644
index 4ed5a4fbea..0000000000
--- a/src/core/cpu/kernels/softmax/impl/sve/impl.cpp
+++ /dev/null
@@ -1,185 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#if defined(ENABLE_SVE)
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/SVEMath.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
-#include <arm_sve.h>
-
-namespace arm_compute
-{
-namespace cpu
-{
-template <typename ScalarType>
-void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window)
-{
- const auto all_true_pg = wrapper::svptrue<ScalarType>();
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- Window win{ window };
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- Iterator input(in, win);
- Iterator output(out, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- // Get pointers
- const auto in_ptr = reinterpret_cast<const ScalarType *>(input.ptr());
- const auto out_ptr = reinterpret_cast<ScalarType *>(output.ptr());
-
- // Init max value
- auto vec_max = wrapper::svdup_n(support::cpp11::lowest<ScalarType>());
-
- int x = window_start_x;
- svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
- do
- {
- const auto current_value = svld1(pg, in_ptr + x);
- vec_max = svmax_m(pg, vec_max, current_value);
-
- x += wrapper::svcnt<ScalarType>();
- pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
- }
- while(svptest_any(all_true_pg, pg));
-
- auto max_val = svmaxv(all_true_pg, vec_max);
-
- *out_ptr = max_val;
- },
- input, output);
-}
-
-template <typename ScalarType>
-void sve_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, const float beta, bool is_log, const Window &window)
-{
- const int start_x = in->info()->valid_region().anchor.x();
- const int input_width = in->info()->valid_region().shape.x();
-
- Iterator in_it(in, window);
- Iterator max_it(max, window);
- Iterator out_it(out, window);
-
- const auto all_true_pg = wrapper::svptrue<ScalarType>();
-
- execute_window_loop(window, [&](const Coordinates &)
- {
- /* Get pointers */
- const auto in_ptr = reinterpret_cast<const ScalarType *>(in_it.ptr()) + start_x;
- const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr()) + start_x;
- const auto tmp_ptr = reinterpret_cast<ScalarType *>(tmp);
-
- ScalarType sum{ 0 };
-
- /* Compute exponentials and sum */
- {
- /* Get max value */
- const auto max_val = *reinterpret_cast<const ScalarType *>(max_it.ptr());
- const auto vec_max = wrapper::svdup_n(max_val);
-
- /* Init sum to zero */
- auto vec_sum = wrapper::svdup_n(static_cast<ScalarType>(0));
-
- /* Loop over row and compute exponentials and sum */
- int x = 0;
- svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width);
- do
- {
- auto vec_elements = svld1(pg, in_ptr + x);
- vec_elements = svsub_z(pg, vec_elements, vec_max);
- if(is_log)
- {
- vec_elements = svmul_z(pg, vec_elements, wrapper::svdup_n(static_cast<ScalarType>(beta)));
- vec_sum = svadd_m(pg, vec_sum, wrapper::svexp_z(pg, vec_elements));
- }
- else
- {
- vec_elements = wrapper::svexp_z(pg, svmul_z(pg, vec_elements, wrapper::svdup_n(static_cast<ScalarType>(beta))));
- vec_sum = svadd_m(pg, vec_sum, vec_elements);
- }
- svst1(pg, tmp_ptr + x, vec_elements);
-
- x += wrapper::svcnt<ScalarType>();
- pg = wrapper::svwhilelt<ScalarType>(x, input_width);
- }
- while(svptest_any(all_true_pg, pg));
-
- /* Reduce sum */
- sum = svaddv(all_true_pg, vec_sum);
-
- if(is_log)
- {
- sum = static_cast<ScalarType>(std::log(sum));
- }
- else
- {
- sum = ScalarType(1) / sum;
- }
- }
-
- /* Normalize exponentials */
- {
- /* Loop over row and compute softmax */
- int x = 0;
- svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width);
- do
- {
- auto vec_in = svld1(pg, tmp_ptr + x);
- auto normalized_value = wrapper::svdup_n(static_cast<ScalarType>(0));
- if(is_log)
- {
- normalized_value = svsub_z(pg, vec_in, wrapper::svdup_n(static_cast<ScalarType>(sum)));
- }
- else
- {
- normalized_value = svmul_z(pg, vec_in, wrapper::svdup_n(static_cast<ScalarType>(sum)));
- }
- svst1(pg, out_ptr + x, normalized_value);
-
- x += wrapper::svcnt<ScalarType>();
- pg = wrapper::svwhilelt<ScalarType>(x, input_width);
- }
- while(svptest_any(all_true_pg, pg));
- }
- },
- in_it, max_it, out_it);
-}
-
-template void sve_logits_1d_max<float>(const ITensor *in, ITensor *out, const Window &window);
-template void sve_logits_1d_max<float16_t>(const ITensor *in, ITensor *out, const Window &window);
-template void sve_logits_1d_max<qasymm8_t>(const ITensor *in, ITensor *out, const Window &window);
-template void sve_logits_1d_max<qasymm8_signed_t>(const ITensor *in, ITensor *out, const Window &window);
-
-template void sve_softmax_logits_1d_float<float>(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, const float beta, bool is_log, const Window &window);
-template void sve_softmax_logits_1d_float<float16_t>(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, const float beta, bool is_log, const Window &window);
-} // namespace cpu
-} // namespace arm_compute
-#endif /* defined(ENABLE_SVE) */
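
The loop shape used throughout this file is the SVE predicated do/while: svwhilelt builds a predicate over the lanes still in range, so the tail needs no separate scalar loop. In isolation, and assuming an SVE-enabled toolchain, the row-maximum reduction looks roughly like this (illustrative helper, float32 only):

    #include <arm_sve.h>
    #include <limits>

    float sve_row_max_f32(const float *ptr, int n)
    {
        auto vec_max = svdup_n_f32(-std::numeric_limits<float>::max());
        int x = 0;
        svbool_t pg = svwhilelt_b32(x, n);
        do
        {
            // Inactive lanes are left untouched by the merging max.
            vec_max = svmax_m(pg, vec_max, svld1_f32(pg, ptr + x));
            x += static_cast<int>(svcntw());
            pg = svwhilelt_b32(x, n);
        }
        while(svptest_any(svptrue_b32(), pg));
        return svmaxv_f32(svptrue_b32(), vec_max);
    }
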
diff --git a/src/core/cpu/kernels/softmax/impl/sve/list.h b/src/core/cpu/kernels/softmax/impl/sve/list.h
deleted file mode 100644
index 7ddb358b8e..0000000000
--- a/src/core/cpu/kernels/softmax/impl/sve/list.h
+++ /dev/null
@@ -1,223 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_CORE_SVE_KERNELS_SOFTMAX_LIST_H
-#define SRC_CORE_SVE_KERNELS_SOFTMAX_LIST_H
-
-#if defined(ENABLE_SVE)
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/SVEMath.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
-#include <arm_sve.h>
-
-namespace arm_compute
-{
-namespace cpu
-{
-template <typename ScalarType>
-void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window);
-
-template <typename ScalarType>
-void sve_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, const float beta, bool is_log, const Window &window);
-
-#if defined(__ARM_FEATURE_SVE2)
-template <typename ScalarType>
-void sve_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp,
- ITensor *out, float beta, bool is_log, const Window &window)
-{
- const int start_x = in->info()->valid_region().anchor.x();
- const int input_width = in->info()->valid_region().shape.x();
-
- const float scale_beta = -beta * in->info()->quantization_info().uniform().scale;
- const auto scale_beta_vec = svdup_n_f32(scale_beta);
-
- Iterator in_it(in, window);
- Iterator max_it(max, window);
- Iterator out_it(out, window);
- const auto all_true_pg = wrapper::svptrue<ScalarType>();
- using SVEType = typename wrapper::traits::sve_vector<ScalarType>::type;
-
- const int inc_1 = static_cast<int>(svcntw());
- const int inc_2 = static_cast<int>(2 * svcntw());
- const int inc_3 = static_cast<int>(3 * svcntw());
-
- execute_window_loop(window, [&](const Coordinates &)
- {
- /* Get pointers */
- const auto in_ptr = reinterpret_cast<const ScalarType *>(in_it.ptr()) + start_x;
- const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr()) + start_x;
- const auto tmp_ptr = reinterpret_cast<float *>(tmp);
-
- float sum{};
-
- /* Compute exponentials and sum */
- {
- /* Get max value */
- const auto max_val = *reinterpret_cast<const ScalarType *>(max_it.ptr());
- const auto vec_max = wrapper::svdup_n(max_val);
-
- /* Init sum to zero */
- auto vec_sum_0 = svdup_n_f32(0.f);
- auto vec_sum_1 = svdup_n_f32(0.f);
- auto vec_sum_2 = svdup_n_f32(0.f);
- auto vec_sum_3 = svdup_n_f32(0.f);
-
- /* Loop over row and compute exponentials and sum */
- int x = 0;
- svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width);
- svbool_t pg_0 = svunpklo(svunpklo(pg));
- svbool_t pg_1 = svunpkhi(svunpklo(pg));
- svbool_t pg_2 = svunpklo(svunpkhi(pg));
- svbool_t pg_3 = svunpkhi(svunpkhi(pg));
- do
- {
- auto vec_elements = svld1(pg, in_ptr + x);
- vec_elements = svsub_z(pg, vec_max, vec_elements);
-
- auto vec_elements_flt_0 = svcvt_f32_z(pg_0, svunpklo(svunpklo(vec_elements)));
- auto vec_elements_flt_1 = svcvt_f32_z(pg_1, svunpkhi(svunpklo(vec_elements)));
- auto vec_elements_flt_2 = svcvt_f32_z(pg_2, svunpklo(svunpkhi(vec_elements)));
- auto vec_elements_flt_3 = svcvt_f32_z(pg_3, svunpkhi(svunpkhi(vec_elements)));
-
- if(is_log)
- {
- vec_elements_flt_0 = svmul_f32_z(pg_0, vec_elements_flt_0, scale_beta_vec);
- vec_elements_flt_1 = svmul_f32_z(pg_1, vec_elements_flt_1, scale_beta_vec);
- vec_elements_flt_2 = svmul_f32_z(pg_2, vec_elements_flt_2, scale_beta_vec);
- vec_elements_flt_3 = svmul_f32_z(pg_3, vec_elements_flt_3, scale_beta_vec);
- vec_sum_0 = svadd_f32_m(pg_0, vec_sum_0, svexp_f32_z(pg_0, vec_elements_flt_0));
- vec_sum_1 = svadd_f32_m(pg_1, vec_sum_1, svexp_f32_z(pg_1, vec_elements_flt_1));
- vec_sum_2 = svadd_f32_m(pg_2, vec_sum_2, svexp_f32_z(pg_2, vec_elements_flt_2));
- vec_sum_3 = svadd_f32_m(pg_3, vec_sum_3, svexp_f32_z(pg_3, vec_elements_flt_3));
- }
- else
- {
- vec_elements_flt_0 = svexp_f32_z(pg_0, svmul_f32_z(pg_0, vec_elements_flt_0, scale_beta_vec));
- vec_elements_flt_1 = svexp_f32_z(pg_1, svmul_f32_z(pg_1, vec_elements_flt_1, scale_beta_vec));
- vec_elements_flt_2 = svexp_f32_z(pg_2, svmul_f32_z(pg_2, vec_elements_flt_2, scale_beta_vec));
- vec_elements_flt_3 = svexp_f32_z(pg_3, svmul_f32_z(pg_3, vec_elements_flt_3, scale_beta_vec));
- vec_sum_0 = svadd_f32_m(pg_0, vec_sum_0, vec_elements_flt_0);
- vec_sum_1 = svadd_f32_m(pg_1, vec_sum_1, vec_elements_flt_1);
- vec_sum_2 = svadd_f32_m(pg_2, vec_sum_2, vec_elements_flt_2);
- vec_sum_3 = svadd_f32_m(pg_3, vec_sum_3, vec_elements_flt_3);
- }
-
- svst1_f32(pg_0, tmp_ptr + x, vec_elements_flt_0);
- svst1_f32(pg_1, tmp_ptr + x + inc_1, vec_elements_flt_1);
- svst1_f32(pg_2, tmp_ptr + x + inc_2, vec_elements_flt_2);
- svst1_f32(pg_3, tmp_ptr + x + inc_3, vec_elements_flt_3);
-
- x += wrapper::svcnt<ScalarType>();
- pg = wrapper::svwhilelt<ScalarType>(x, input_width);
- pg_0 = svunpklo(svunpklo(pg));
- pg_1 = svunpkhi(svunpklo(pg));
- pg_2 = svunpklo(svunpkhi(pg));
- pg_3 = svunpkhi(svunpkhi(pg));
- }
- while(svptest_any(all_true_pg, pg));
-
- /* Reduce sum */
- const auto vec_sum = svadd_f32_z(all_true_pg, svadd_f32_z(all_true_pg, vec_sum_0, vec_sum_1), svadd_f32_z(all_true_pg, vec_sum_2, vec_sum_3));
- sum = svaddv_f32(all_true_pg, vec_sum);
-
- /* Run remaining elements */
- x = 0;
- if(is_log)
- {
- sum = std::log(sum);
- }
- else
- {
- sum = 256.f / sum;
- }
- }
-
- /* Normalize exponentials */
- {
- constexpr bool is_qasymm8_signed = std::is_same<ScalarType, qasymm8_signed_t>::value;
- /* Loop over row and compute softmax */
- int x = 0;
- svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width);
- svbool_t pg_0 = svunpklo(svunpklo(pg));
- svbool_t pg_1 = svunpkhi(svunpklo(pg));
- svbool_t pg_2 = svunpklo(svunpkhi(pg));
- svbool_t pg_3 = svunpkhi(svunpkhi(pg));
- do
- {
- auto vec_in_0 = svld1_f32(pg_0, tmp_ptr + x);
- auto vec_in_1 = svld1_f32(pg_1, tmp_ptr + x + inc_1);
- auto vec_in_2 = svld1_f32(pg_2, tmp_ptr + x + inc_2);
- auto vec_in_3 = svld1_f32(pg_3, tmp_ptr + x + inc_3);
-
- svfloat32_t res_0{};
- svfloat32_t res_1{};
- svfloat32_t res_2{};
- svfloat32_t res_3{};
-
- if(is_log)
- {
- res_0 = svsub_f32_z(pg_0, vec_in_0, svdup_n_f32(sum));
- res_1 = svsub_f32_z(pg_1, vec_in_1, svdup_n_f32(sum));
- res_2 = svsub_f32_z(pg_2, vec_in_2, svdup_n_f32(sum));
- res_3 = svsub_f32_z(pg_3, vec_in_3, svdup_n_f32(sum));
- }
- else
- {
- res_0 = svmul_f32_z(pg_0, vec_in_0, svdup_n_f32(sum));
- res_1 = svmul_f32_z(pg_1, vec_in_1, svdup_n_f32(sum));
- res_2 = svmul_f32_z(pg_2, vec_in_2, svdup_n_f32(sum));
- res_3 = svmul_f32_z(pg_3, vec_in_3, svdup_n_f32(sum));
-
- if(is_qasymm8_signed)
- {
- const auto offset_vec = svdup_n_f32(128.f);
-                        res_0 = svsub_z(pg_0, res_0, offset_vec);
-                        res_1 = svsub_z(pg_1, res_1, offset_vec);
-                        res_2 = svsub_z(pg_2, res_2, offset_vec);
-                        res_3 = svsub_z(pg_3, res_3, offset_vec);
- }
- }
-
- // Store value
- const auto out = convert_float_to_int<SVEType>(res_0, res_1, res_2, res_3);
- svst1(pg, out_ptr + x, out);
- x += wrapper::svcnt<ScalarType>();
- pg = wrapper::svwhilelt<ScalarType>(x, input_width);
- pg_0 = svunpklo(svunpklo(pg));
- pg_1 = svunpkhi(svunpklo(pg));
- pg_2 = svunpklo(svunpkhi(pg));
- pg_3 = svunpkhi(svunpkhi(pg));
- }
- while(svptest_any(all_true_pg, pg));
- }
- },
- in_it, max_it, out_it);
-}
-#endif /* defined(__ARM_FEATURE_SVE2) */
-} // namespace cpu
-} // namespace arm_compute
-#endif /* defined(ENABLE_SVE) */
-
-#endif /* SRC_CORE_SVE_KERNELS_SOFTMAX_LIST_H */
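
The SVE2 quantized path above processes bytes as float32 by widening each vector into four parts with svunpklo/svunpkhi, and widening the governing predicate the same way. A self-contained sketch of that widening idiom (unsigned case, illustrative helper, SVE toolchain assumed):

    #include <arm_sve.h>
    #include <cstdint>

    // Sum a row of bytes in float32: one predicated pass, no scalar tail.
    float sum_u8_as_f32(const uint8_t *ptr, int n)
    {
        svfloat32_t acc = svdup_n_f32(0.f);
        int x = 0;
        svbool_t pg = svwhilelt_b8(x, n);
        do
        {
            // Widen the byte predicate into four word predicates.
            const svbool_t pg_0 = svunpklo(svunpklo(pg));
            const svbool_t pg_1 = svunpkhi(svunpklo(pg));
            const svbool_t pg_2 = svunpklo(svunpkhi(pg));
            const svbool_t pg_3 = svunpkhi(svunpkhi(pg));

            // Widen the data the same way: u8 -> u16 -> u32 -> f32.
            const svuint8_t  v  = svld1_u8(pg, ptr + x);
            const svuint16_t lo = svunpklo(v);
            const svuint16_t hi = svunpkhi(v);

            acc = svadd_f32_m(pg_0, acc, svcvt_f32_z(pg_0, svunpklo(lo)));
            acc = svadd_f32_m(pg_1, acc, svcvt_f32_z(pg_1, svunpkhi(lo)));
            acc = svadd_f32_m(pg_2, acc, svcvt_f32_z(pg_2, svunpklo(hi)));
            acc = svadd_f32_m(pg_3, acc, svcvt_f32_z(pg_3, svunpkhi(hi)));

            x += static_cast<int>(svcntb());
            pg = svwhilelt_b8(x, n);
        }
        while(svptest_any(svptrue_b8(), pg));
        return svaddv_f32(svptrue_b32(), acc);
    }
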
diff --git a/src/core/cpu/kernels/sub/neon/integer.cpp b/src/core/cpu/kernels/sub/neon/integer.cpp
deleted file mode 100644
index bba73df1e8..0000000000
--- a/src/core/cpu/kernels/sub/neon/integer.cpp
+++ /dev/null
@@ -1,183 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace
-{
-void sub_s16_u8_s16_impl(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_swapped)
-{
- // Create input windows
- Window win = window;
- Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
-
-    // Clear X Dimension on execution window as we handle it manually
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(src0, input1_win);
- Iterator input2(src1, input2_win);
- Iterator output(dst, win);
-
- const int window_step_x = 8;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
-
- if(policy == ConvertPolicy::WRAP)
- {
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto vin1 = wrapper::vloadq(input1_ptr + x);
- const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
- const auto res = is_swapped ? wrapper::vsub(vin2, vin1) : wrapper::vsub(vin1, vin2);
- wrapper::vstore(output_ptr + x, res);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const auto res = is_swapped ? static_cast<int16_t>(*(input2_ptr + x)) - *(input1_ptr + x) : *(input1_ptr + x) - static_cast<int16_t>(*(input2_ptr + x));
- *(output_ptr + x) = res;
- }
- }
- else
- {
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto vin1 = wrapper::vloadq(input1_ptr + x);
- const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
- const auto res = is_swapped ? wrapper::vqsub(vin2, vin1) : wrapper::vqsub(vin1, vin2);
- wrapper::vstore(output_ptr + x, res);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const auto res = is_swapped ? wrapper::sub_sat(static_cast<int16_t>(*(input2_ptr + x)), *(input1_ptr + x)) : wrapper::sub_sat(*(input1_ptr + x), static_cast<int16_t>(*(input2_ptr + x)));
- *(output_ptr + x) = res;
- }
- }
- },
- input1, input2, output);
-}
-} // namespace
-
-void sub_s16_u8_s16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
-    sub_s16_u8_s16_impl(src0, src1, dst, policy, window, false);
-}
-
-void sub_u8_s16_s16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
- // Swap arguments
- sub_s16_u8_s16_impl(src1, src0, dst, policy, window, true);
-}
-
-void sub_u8_u8_s16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
- // Create input windows
- Window win = window;
- Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
-
-    // Clear X Dimension on execution window as we handle it manually
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(src0, input1_win);
- Iterator input2(src1, input2_win);
- Iterator output(dst, win);
-
- const int window_step_x = 8;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
-
- if(policy == ConvertPolicy::WRAP)
- {
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto vin1 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input1_ptr + x)));
- const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
- wrapper::vstore(output_ptr + x, wrapper::vsub(vin1, vin2));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(output_ptr + x) = static_cast<int16_t>(*(input1_ptr + x)) - static_cast<int16_t>(*(input2_ptr + x));
- }
- }
- else
- {
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto vin1 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input1_ptr + x)));
- const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
- wrapper::vstore(output_ptr + x, wrapper::vqsub(vin1, vin2));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- *(output_ptr + x) = wrapper::sub_sat(static_cast<int16_t>(*(input1_ptr + x)),
- static_cast<int16_t>(*(input2_ptr + x)));
- }
- }
- },
- input1, input2, output);
-}
-
-} // namespace cpu
-} // namespace arm_compute \ No newline at end of file
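
Per element, the kernels above widen any uint8_t operand to int16_t and then apply the convert policy: plain subtraction for ConvertPolicy::WRAP, clamped subtraction for ConvertPolicy::SATURATE. A scalar sketch of that rule (illustrative name, not library API):

    #include <algorithm>
    #include <cstdint>
    #include <limits>

    int16_t sub_s16_u8_ref(int16_t a, uint8_t b, bool saturate)
    {
        if(!saturate)
        {
            // WRAP: subtract in 16-bit arithmetic and let the result wrap.
            return static_cast<int16_t>(a - static_cast<int16_t>(b));
        }
        // SATURATE: compute in a wider type, then clamp to the int16_t range.
        const int32_t res = static_cast<int32_t>(a) - static_cast<int32_t>(b);
        const int32_t lo  = std::numeric_limits<int16_t>::min();
        const int32_t hi  = std::numeric_limits<int16_t>::max();
        return static_cast<int16_t>(std::max(lo, std::min(hi, res)));
    }
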
diff --git a/src/core/cpu/kernels/sub/neon/list.h b/src/core/cpu/kernels/sub/neon/list.h
deleted file mode 100644
index 1ab4e6367b..0000000000
--- a/src/core/cpu/kernels/sub/neon/list.h
+++ /dev/null
@@ -1,162 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_CORE_NEON_KERNELS_SUB_LIST_H
-#define SRC_CORE_NEON_KERNELS_SUB_LIST_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-#define DECLARE_SUB_KERNEL(func_name) \
- void func_name(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-
-DECLARE_SUB_KERNEL(sub_qasymm8_neon);
-DECLARE_SUB_KERNEL(sub_qasymm8_signed_neon);
-DECLARE_SUB_KERNEL(sub_qsymm16_neon);
-DECLARE_SUB_KERNEL(sub_s16_u8_s16_neon);
-DECLARE_SUB_KERNEL(sub_u8_s16_s16_neon);
-DECLARE_SUB_KERNEL(sub_u8_u8_s16_neon);
-
-#undef DECLARE_SUB_KERNEL
-
-template <typename T>
-void sub_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
- /** SIMD vector tag type. */
- using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
-
- bool is_sat = policy == ConvertPolicy::SATURATE;
-
- // Create input windows
- Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
-
-    // Clear X Dimension on execution window as we handle it manually
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- constexpr int window_step_x = 16 / sizeof(T);
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
-
- if(is_broadcast_across_x)
- {
- const bool is_broadcast_input_2 = input2_win.x().step() == 0;
- Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
- Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
- const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
- const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
-
-        // Clear X Dimension on execution window as we handle it manually
- non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator broadcast_input(broadcast_tensor, broadcast_win);
- Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
- Iterator output(dst, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const T *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<T *>(output.ptr());
-
- const T broadcast_value = *reinterpret_cast<const T *>(broadcast_input.ptr());
- const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
-
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
- auto res = is_sat ? wrapper::vqsub(broadcast_value_vec, non_broadcast_v) : wrapper::vsub(broadcast_value_vec, non_broadcast_v);
- if(is_broadcast_input_2)
- {
- res = wrapper::vmul(res, wrapper::vdup_n(static_cast<T>(-1), ExactTagType{}));
- }
- wrapper::vstore(output_ptr + x, res);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
- auto res = is_sat ? wrapper::sub_sat(broadcast_value, non_broadcast_v) : broadcast_value - non_broadcast_v;
- if(is_broadcast_input_2)
- {
- res = static_cast<T>(-1) * res;
- }
-
- *(output_ptr + x) = res;
- }
- },
- broadcast_input, non_broadcast_input, output);
- }
- else
- {
-        // Clear X Dimension on execution window as we handle it manually
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(src0, input1_win);
- Iterator input2(src1, input2_win);
- Iterator output(dst, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const T *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const T *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<T *>(output.ptr());
-
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto val1 = wrapper::vloadq(input1_ptr + x);
- const auto val2 = wrapper::vloadq(input2_ptr + x);
- const auto res = is_sat ? wrapper::vqsub(val1, val2) : wrapper::vsub(val1, val2);
- wrapper::vstore(output_ptr + x, res);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const auto val1 = *(input1_ptr + x);
- const auto val2 = *(input2_ptr + x);
- *(output_ptr + x) = is_sat ? wrapper::sub_sat(val1, val2) : val1 - val2;
- }
- },
- input1, input2, output);
- }
-}
-} // namespace cpu
-} // namespace arm_compute
-#endif // SRC_CORE_NEON_KERNELS_SUB_LIST_H
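
The branch structure in sub_same_neon is worth spelling out: when one input's x-extent is 1, its scalar is duplicated across a vector and the kernel computes broadcast - non_broadcast; if the broadcast operand is really the second input, the result is negated to restore input1 - input2. A float32 NEON sketch of that path (illustrative helper, NEON-capable target assumed):

    #include <arm_neon.h>

    void sub_broadcast_f32(const float *non_broadcast, float broadcast_value,
                           float *out, int n, bool broadcast_is_input2)
    {
        const float32x4_t bv = vdupq_n_f32(broadcast_value);
        int x = 0;
        // Vector body: four elements per iteration.
        for(; x <= n - 4; x += 4)
        {
            float32x4_t res = vsubq_f32(bv, vld1q_f32(non_broadcast + x));
            if(broadcast_is_input2)
            {
                res = vnegq_f32(res); // same effect as the multiply by -1 above
            }
            vst1q_f32(out + x, res);
        }
        // Scalar tail.
        for(; x < n; ++x)
        {
            const float r = broadcast_value - non_broadcast[x];
            out[x] = broadcast_is_input2 ? -r : r;
        }
    }
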
diff --git a/src/core/cpu/kernels/sub/neon/qasymm8.cpp b/src/core/cpu/kernels/sub/neon/qasymm8.cpp
deleted file mode 100644
index 8f4cd8bdbb..0000000000
--- a/src/core/cpu/kernels/sub/neon/qasymm8.cpp
+++ /dev/null
@@ -1,230 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void sub_qasymm8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
- ARM_COMPUTE_UNUSED(policy);
-
- // Create input windows
- Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
-
-    // Clear X Dimension on execution window as we handle it manually
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- const int window_step_x = 16;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
-
- const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform();
- const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform();
-
- const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale);
- const float32x4_t voffseto = vdupq_n_f32(oq_info.offset);
-
- if(is_broadcast_across_x)
- {
- const bool is_broadcast_input_2 = input2_win.x().step() == 0;
- Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
- Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
- const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
- const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
- const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();
- const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
- const float32x4_t vscale1 = is_broadcast_input_2 ? vdupq_n_f32(iq1_info.scale) : vdupq_n_f32(iq2_info.scale);
- const float32x4_t vscale2 = is_broadcast_input_2 ? vdupq_n_f32(iq2_info.scale) : vdupq_n_f32(iq1_info.scale);
- const int32x4_t voffset1 = is_broadcast_input_2 ? vdupq_n_s32(iq1_info.offset) : vdupq_n_s32(iq2_info.offset);
- const int32x4_t voffset2 = is_broadcast_input_2 ? vdupq_n_s32(iq2_info.offset) : vdupq_n_s32(iq1_info.offset);
-
-        // Clear X Dimension on execution window as we handle it manually
- non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator broadcast_input(broadcast_tensor, broadcast_win);
- Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
- Iterator output(dst, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
-
- const auto broadcast_value = *reinterpret_cast<const uint8_t *>(broadcast_input.ptr());
- const auto broadcast_value_vec = wrapper::vdup_n(static_cast<uint8_t>(broadcast_value), wrapper::traits::vector_128_tag{});
-
- const float32x4x4_t bf =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(broadcast_value_vec))))), voffset2)), vscale2),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(broadcast_value_vec))))), voffset2)), vscale2),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(broadcast_value_vec))))), voffset2)), vscale2),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(broadcast_value_vec))))), voffset2)), vscale2),
- }
- };
-
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto a = wrapper::vloadq(non_broadcast_input_ptr + x);
-
- const float32x4x4_t af =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1),
- }
- };
-
- const int32x4x4_t rf =
- {
- {
-#ifdef __aarch64__
- vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
- vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
- vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[2], af.val[2]) : vsubq_f32(af.val[2], bf.val[2]), invvscaleo)),
- vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[3], af.val[3]) : vsubq_f32(af.val[3], bf.val[3]), invvscaleo)),
-#else //__aarch64__
- vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
- vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
- vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[2], af.val[2]) : vsubq_f32(af.val[2], bf.val[2]), invvscaleo)),
- vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[3], af.val[3]) : vsubq_f32(af.val[3], bf.val[3]), invvscaleo)),
-#endif //__aarch64__
- }
- };
-
- const auto pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
- const auto pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
- wrapper::vstore(output_ptr + x, wrapper::vcombine(pa, pb));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const float afs = static_cast<int32_t>(*(non_broadcast_input_ptr + x) - non_broadcast_qinfo.offset) * non_broadcast_qinfo.scale;
- const float bfs = static_cast<int32_t>(broadcast_value - broadcast_qinfo.offset) * broadcast_qinfo.scale;
- *(output_ptr + x) = quantize_qasymm8(is_broadcast_input_2 ? afs - bfs : bfs - afs, dst->info()->quantization_info());
- }
- },
- broadcast_input, non_broadcast_input, output);
- }
- else
- {
- const float32x4_t vscale1 = vdupq_n_f32(iq1_info.scale);
- const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale);
- const int32x4_t voffset1 = vdupq_n_s32(iq1_info.offset);
- const int32x4_t voffset2 = vdupq_n_s32(iq2_info.offset);
-
-        // Clear X Dimension on execution window as we handle it manually
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(src0, input1_win);
- Iterator input2(src1, input2_win);
- Iterator output(dst, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
-
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto a = wrapper::vloadq(input1_ptr + x);
- const auto b = wrapper::vloadq(input2_ptr + x);
-
- const float32x4x4_t af =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1),
- }
- };
-
- const float32x4x4_t bf =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(b))))), voffset2)), vscale2),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(b))))), voffset2)), vscale2),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(b))))), voffset2)), vscale2),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(b))))), voffset2)), vscale2),
- }
- };
-
- const int32x4x4_t rf =
- {
- {
-#ifdef __aarch64__
- vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
- vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
- vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[2], bf.val[2]), invvscaleo)),
- vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[3], bf.val[3]), invvscaleo)),
-#else //__aarch64__
- vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
- vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
- vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[2], bf.val[2]), invvscaleo)),
- vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[3], bf.val[3]), invvscaleo)),
-#endif //__aarch64__
- }
- };
-
- const auto pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
- const auto pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
- wrapper::vstore(output_ptr + x, wrapper::vcombine(pa, pb));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const float afs = static_cast<int32_t>((*(input1_ptr + x)) - iq1_info.offset) * iq1_info.scale;
- const float bfs = static_cast<int32_t>((*(input2_ptr + x)) - iq2_info.offset) * iq2_info.scale;
-
- *(output_ptr + x) = quantize_qasymm8((afs - bfs), dst->info()->quantization_info());
- }
- },
- input1, input2, output);
- }
-}
-
-} // namespace cpu
-} // namespace arm_compute \ No newline at end of file
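
Per element, sub_qasymm8_neon dequantizes each operand with its own (scale, offset), subtracts in float, and requantizes with the output parameters; the vector path only adds lane-wise widening and round-to-nearest conversion on top of this rule. A scalar reference with illustrative names:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    uint8_t sub_qasymm8_ref(uint8_t a, float a_scale, int32_t a_offset,
                            uint8_t b, float b_scale, int32_t b_offset,
                            float o_scale, int32_t o_offset)
    {
        // Dequantize both operands to float.
        const float afs = (static_cast<int32_t>(a) - a_offset) * a_scale;
        const float bfs = (static_cast<int32_t>(b) - b_offset) * b_scale;
        // Subtract, then requantize with the output scale/offset and clamp.
        const int32_t q = static_cast<int32_t>(std::lround((afs - bfs) / o_scale)) + o_offset;
        return static_cast<uint8_t>(std::max(0, std::min(255, q)));
    }
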
diff --git a/src/core/cpu/kernels/sub/neon/qasymm8_signed.cpp b/src/core/cpu/kernels/sub/neon/qasymm8_signed.cpp
deleted file mode 100644
index 2c9e411743..0000000000
--- a/src/core/cpu/kernels/sub/neon/qasymm8_signed.cpp
+++ /dev/null
@@ -1,229 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void sub_qasymm8_signed_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
- ARM_COMPUTE_UNUSED(policy);
-
- // Create input windows
- Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle manually
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- const int window_step_x = 16;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
-
- const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform();
- const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform();
-
- const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale);
- const float32x4_t voffseto = vdupq_n_f32(oq_info.offset);
-
- if(is_broadcast_across_x)
- {
- const bool is_broadcast_input_2 = input2_win.x().step() == 0;
- Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
- Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
- const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
- const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
- const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();
- const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
- const float32x4_t vscale1 = is_broadcast_input_2 ? vdupq_n_f32(iq1_info.scale) : vdupq_n_f32(iq2_info.scale);
- const float32x4_t vscale2 = is_broadcast_input_2 ? vdupq_n_f32(iq2_info.scale) : vdupq_n_f32(iq1_info.scale);
- const int32x4_t voffset1 = is_broadcast_input_2 ? vdupq_n_s32(iq1_info.offset) : vdupq_n_s32(iq2_info.offset);
- const int32x4_t voffset2 = is_broadcast_input_2 ? vdupq_n_s32(iq2_info.offset) : vdupq_n_s32(iq1_info.offset);
-
- // Clear X Dimension on execution window as we handle manually
- non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator broadcast_input(broadcast_tensor, broadcast_win);
- Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
- Iterator output(dst, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
-
- const auto broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr());
- const auto broadcast_value_vec = wrapper::vdup_n(static_cast<int8_t>(broadcast_value), wrapper::traits::vector_128_tag{});
-
- const float32x4x4_t bf =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(broadcast_value_vec))))), voffset2)), vscale2),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(broadcast_value_vec))))), voffset2)), vscale2),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(broadcast_value_vec))))), voffset2)), vscale2),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(broadcast_value_vec))))), voffset2)), vscale2),
- }
- };
-
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto a = wrapper::vloadq(non_broadcast_input_ptr + x);
-
- const float32x4x4_t af =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1),
- }
- };
-
- const int32x4x4_t rf =
- {
- {
-#ifdef __aarch64__
- vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
- vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
- vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[2], af.val[2]) : vsubq_f32(af.val[2], bf.val[2]), invvscaleo)),
- vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[3], af.val[3]) : vsubq_f32(af.val[3], bf.val[3]), invvscaleo)),
-#else //__aarch64__
- vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
- vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
- vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[2], af.val[2]) : vsubq_f32(af.val[2], bf.val[2]), invvscaleo)),
- vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[3], af.val[3]) : vsubq_f32(af.val[3], bf.val[3]), invvscaleo)),
-#endif //__aarch64__
- }
- };
-
- const auto pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
- const auto pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
- wrapper::vstore(output_ptr + x, wrapper::vcombine(pa, pb));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const float afs = static_cast<int32_t>(*(non_broadcast_input_ptr + x) - non_broadcast_qinfo.offset) * non_broadcast_qinfo.scale;
- const float bfs = static_cast<int32_t>(broadcast_value - broadcast_qinfo.offset) * broadcast_qinfo.scale;
- *(output_ptr + x) = quantize_qasymm8_signed(is_broadcast_input_2 ? afs - bfs : bfs - afs, dst->info()->quantization_info());
- }
- },
- broadcast_input, non_broadcast_input, output);
- }
- else
- {
- const float32x4_t vscale1 = vdupq_n_f32(iq1_info.scale);
- const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale);
- const int32x4_t voffset1 = vdupq_n_s32(iq1_info.offset);
- const int32x4_t voffset2 = vdupq_n_s32(iq2_info.offset);
-
- // Clear X Dimension on execution window as we handle manually
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(src0, input1_win);
- Iterator input2(src1, input2_win);
- Iterator output(dst, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
-
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto a = wrapper::vloadq(input1_ptr + x);
- const auto b = wrapper::vloadq(input2_ptr + x);
-
- const float32x4x4_t af =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1),
- }
- };
-
- const float32x4x4_t bf =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(b))))), voffset2)), vscale2),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(b))))), voffset2)), vscale2),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(b))))), voffset2)), vscale2),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(b))))), voffset2)), vscale2),
- }
- };
-
- const int32x4x4_t rf =
- {
- {
-#ifdef __aarch64__
- vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
- vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
- vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[2], bf.val[2]), invvscaleo)),
- vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[3], bf.val[3]), invvscaleo)),
-#else //__aarch64__
- vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
- vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
- vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[2], bf.val[2]), invvscaleo)),
- vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[3], bf.val[3]), invvscaleo)),
-#endif //__aarch64__
- }
- };
-
- const auto pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
- const auto pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
- wrapper::vstore(output_ptr + x, wrapper::vcombine(pa, pb));
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const float afs = static_cast<int32_t>((*(input1_ptr + x)) - iq1_info.offset) * iq1_info.scale;
- const float bfs = static_cast<int32_t>((*(input2_ptr + x)) - iq2_info.offset) * iq2_info.scale;
-
- *(output_ptr + x) = quantize_qasymm8_signed((afs - bfs), dst->info()->quantization_info());
- }
- },
- input1, input2, output);
- }
-}
-} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
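The QASYMM8_SIGNED variant above differs mainly in its broadcast handling: the operand order of the subtraction must flip depending on which input is the broadcast one, so that the result is always src0 - src1. A minimal scalar sketch of just that selection logic (illustrative names, not ACL API):

#include <algorithm>
#include <cmath>
#include <cstdint>

struct QParams
{
    float   scale;
    int32_t offset;
};

// non_broadcast/broadcast are one element each; is_broadcast_input_2 says
// whether the broadcast operand is src1 (true) or src0 (false).
static int8_t sub_qasymm8_signed_broadcast(int8_t non_broadcast, int8_t broadcast, bool is_broadcast_input_2,
                                           QParams nb_q, QParams b_q, QParams out_q)
{
    const float afs = static_cast<float>(static_cast<int32_t>(non_broadcast) - nb_q.offset) * nb_q.scale;
    const float bfs = static_cast<float>(static_cast<int32_t>(broadcast) - b_q.offset) * b_q.scale;
    // src1 broadcasts -> src0 - src1 = afs - bfs; src0 broadcasts -> bfs - afs.
    const float   diff = is_broadcast_input_2 ? (afs - bfs) : (bfs - afs);
    const int32_t q    = static_cast<int32_t>(std::lround(diff / out_q.scale + out_q.offset));
    return static_cast<int8_t>(std::clamp<int32_t>(q, -128, 127));
}

This is the same selection the vector loop expresses with its !is_broadcast_input_2 ternaries; computing bf once per row, outside the x loop, is what makes the dedicated broadcast branch worthwhile.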
diff --git a/src/core/cpu/kernels/sub/neon/qsymm16.cpp b/src/core/cpu/kernels/sub/neon/qsymm16.cpp
deleted file mode 100644
index 4dfdc0e78c..0000000000
--- a/src/core/cpu/kernels/sub/neon/qsymm16.cpp
+++ /dev/null
@@ -1,201 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void sub_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
- ARM_COMPUTE_UNUSED(policy);
-
- // Create input windows
- Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
- Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
-
- // Clear X Dimension on execution window as we handle manually
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- const int window_step_x = 8;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
- const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
-
- const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform();
- const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform();
- const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform();
-
- const float32x4_t vscale1 = vdupq_n_f32(iq1_info.scale);
- const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale);
- const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale);
-
- if(is_broadcast_across_x)
- {
- const bool is_broadcast_input_2 = input2_win.x().step() == 0;
- Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
- Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
- const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
- const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
-        const UniformQuantizationInfo broadcast_qinfo     = broadcast_tensor->info()->quantization_info().uniform();
-        const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
-        // vscale1/vscale2 pair with src0/src1; this branch needs the scales paired
-        // with the broadcast/non-broadcast roles, whichever input broadcasts.
-        const float32x4_t vscale_b  = vdupq_n_f32(broadcast_qinfo.scale);
-        const float32x4_t vscale_nb = vdupq_n_f32(non_broadcast_qinfo.scale);
-
- // Clear X Dimension on execution window as we handle manually
- non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator broadcast_input(broadcast_tensor, broadcast_win);
- Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
- Iterator output(dst, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto non_broadcast_input_ptr = reinterpret_cast<const int16_t *>(non_broadcast_input.ptr());
- const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
-
- const int16_t broadcast_value = *reinterpret_cast<const int16_t *>(broadcast_input.ptr());
- const int16x8_t broadcast_value_vec = vdupq_n_s16(broadcast_value);
-
- const float32x4x2_t bf =
- {
- {
-                    vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(broadcast_value_vec))), vscale_b),
-                    vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(broadcast_value_vec))), vscale_b),
- }
- };
- const float bfs = static_cast<int32_t>(broadcast_value) * broadcast_qinfo.scale;
-
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const int16x8_t a = vld1q_s16(non_broadcast_input_ptr + x);
- const float32x4x2_t af =
- {
- {
-                        vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale_nb),
-                        vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale_nb),
- }
- };
-
- const int32x4x4_t rf =
- {
- {
-#ifdef __aarch64__
-                        vcvtnq_s32_f32(vmulq_f32(!is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
-                        vcvtnq_s32_f32(vmulq_f32(!is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
-#else //__aarch64__
-                        vcvtq_s32_f32(vmulq_f32(!is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
-                        vcvtq_s32_f32(vmulq_f32(!is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
-#endif //__aarch64__
- }
- };
-
- const int16x8_t pa = vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]));
- vst1q_s16(output_ptr + x, pa);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const float afs = static_cast<int32_t>(*(non_broadcast_input_ptr + x)) * non_broadcast_qinfo.scale;
-                *(output_ptr + x) = quantize_qsymm16(is_broadcast_input_2 ? (afs - bfs) : (bfs - afs), oq_info);
- }
- },
- broadcast_input, non_broadcast_input, output);
- }
- else
- {
- // Clear X Dimension on execution window as we handle manually
- input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
- input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input1(src0, input1_win);
- Iterator input2(src1, input2_win);
- Iterator output(dst, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
- const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
- const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
-
- // Compute S elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const int16x8_t a = vld1q_s16(input1_ptr + x);
- const int16x8_t b = vld1q_s16(input2_ptr + x);
-
- const float32x4x2_t af =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1),
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1),
- }
- };
-
- const float32x4x2_t bf =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(b))), vscale2),
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(b))), vscale2),
- }
- };
-
- const int32x4x2_t rf =
- {
- {
-#ifdef __aarch64__
- vcvtnq_s32_f32(vmulq_f32(vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
- vcvtnq_s32_f32(vmulq_f32(vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
-#else //__aarch64__
- vcvtq_s32_f32(vmulq_f32(vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
- vcvtq_s32_f32(vmulq_f32(vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
-#endif //__aarch64__
- }
- };
-
- const int16x8_t pa = vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]));
- vst1q_s16(output_ptr + x, pa);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const float afs = static_cast<int32_t>((*(input1_ptr + x))) * iq1_info.scale;
- const float bfs = static_cast<int32_t>((*(input2_ptr + x))) * iq2_info.scale;
- *(output_ptr + x) = quantize_qsymm16((afs - bfs), dst->info()->quantization_info());
- }
- },
- input1, input2, output);
- }
-}
-} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
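For completeness, the QSYMM16 kernel works on a symmetric scheme: there is no zero-point, so only the scales enter the arithmetic, and the window step drops to 8 because s16 needs a single widening stage (vmovl_s16 straight to s32). A minimal scalar sketch under those assumptions (illustrative names, not ACL API):

#include <algorithm>
#include <cmath>
#include <cstdint>

// One output element of the QSYMM16 path: symmetric quantization, so
// dequantization is a plain multiply by the scale.
static int16_t sub_qsymm16_scalar(int16_t a, int16_t b, float scale_a, float scale_b, float scale_out)
{
    const float afs = static_cast<float>(a) * scale_a;
    const float bfs = static_cast<float>(b) * scale_b;
    // The NEON code rounds to nearest on AArch64 (vcvtnq_s32_f32) but truncates
    // on 32-bit Arm (vcvtq_s32_f32); this sketch rounds to nearest.
    const int32_t q = static_cast<int32_t>(std::lround((afs - bfs) / scale_out));
    return static_cast<int16_t>(std::clamp<int32_t>(q, -32768, 32767)); // vqmovn_s32 saturation
}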